fs/xfs/scrub/scrub.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <djwong@kernel.org>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_log_format.h"
  13 #include "xfs_trans.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_quota.h"
  16 #include "xfs_qm.h"
  17 #include "xfs_scrub.h"
  18 #include "xfs_buf_mem.h"
  19 #include "xfs_rmap.h"
  20 #include "xfs_exchrange.h"
  21 #include "xfs_exchmaps.h"
  22 #include "xfs_dir2.h"
  23 #include "xfs_parent.h"
  24 #include "xfs_icache.h"
  25 #include "scrub/scrub.h"
  26 #include "scrub/common.h"
  27 #include "scrub/trace.h"
  28 #include "scrub/repair.h"
  29 #include "scrub/health.h"
  30 #include "scrub/stats.h"
  31 #include "scrub/xfile.h"
  32 #include "scrub/tempfile.h"
  33 #include "scrub/orphanage.h"
  34
  35 /*
  36  * Online Scrub and Repair
  37  *
  38  * Traditionally, XFS (the kernel driver) did not know how to check or
  39  * repair on-disk data structures.  That task was left to the xfs_check
  40  * and xfs_repair tools, both of which require taking the filesystem
  41  * offline for a thorough but time consuming examination.  Online
  42  * scrub & repair, on the other hand, enables us to check the metadata
  43  * for obvious errors while carefully stepping around the filesystem's
  44  * ongoing operations, locking rules, etc.
  45  *
  46  * Given that most XFS metadata consist of records stored in a btree,
  47  * most of the checking functions iterate the btree blocks themselves
  48  * looking for irregularities.  When a record block is encountered, each
  49  * record can be checked for obviously bad values.  Record values can
  50  * also be cross-referenced against other btrees to look for potential
  51  * misunderstandings between pieces of metadata.
  52  *
  53  * It is expected that the checkers responsible for per-AG metadata
  54  * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
  55  * metadata structure, and perform any relevant cross-referencing before
  56  * unlocking the AG and returning the results to userspace.  These
  57  * scrubbers must not keep an AG locked for too long to avoid tying up
  58  * the block and inode allocators.
  59  *
  60  * Block maps and b-trees rooted in an inode present a special challenge
  61  * because they can involve extents from any AG.  The general scrubber
  62  * structure of lock -> check -> xref -> unlock still holds, but AG
  63  * locking order rules /must/ be obeyed to avoid deadlocks.  The
  64  * ordering rule, of course, is that we must lock in increasing AG
  65  * order.  Helper functions are provided to track which AG headers we've
  66  * already locked.  If we detect an imminent locking order violation, we
  67  * can signal a potential deadlock, in which case the scrubber can jump
  68  * out to the top level, lock all the AGs in order, and retry the scrub.
  69  *
  70  * For file data (directories, extended attributes, symlinks) scrub, we
  71  * can simply lock the inode and walk the data.  For btree data
  72  * (directories and attributes) we follow the same btree-scrubbing
  73  * strategy outlined previously to check the records.
  74  *
  75  * We use a bit of trickery with transactions to avoid buffer deadlocks
  76  * if there is a cycle in the metadata.  The basic problem is that
  77  * travelling down a btree involves locking the current buffer at each
  78  * tree level.  If a pointer should somehow point back to a buffer that
  79  * we've already examined, we will deadlock due to the second buffer
  80  * locking attempt.  Note however that grabbing a buffer in transaction
  81  * context links the locked buffer to the transaction.  If we try to
  82  * re-grab the buffer in the context of the same transaction, we avoid
  83  * the second lock attempt and continue.  Between the verifier and the
  84  * scrubber, something will notice that something is amiss and report
  85  * the corruption.  Therefore, each scrubber will allocate an empty
  86  * transaction, attach buffers to it, and cancel the transaction at the
  87  * end of the scrub run.  Cancelling a non-dirty transaction simply
  88  * unlocks the buffers.
  89  *
  90  * There are four pieces of data that scrub can communicate to
  91  * userspace.  The first is the error code (errno), which can be used to
  92  * communicate operational errors in performing the scrub.  There are
  93  * also three flags that can be set in the scrub context.  If the data
  94  * structure itself is corrupt, the CORRUPT flag will be set.  If
  95  * the metadata is correct but otherwise suboptimal, the PREEN flag
  96  * will be set.
  97  *
  98  * We perform secondary validation of filesystem metadata by
  99  * cross-referencing every record with all other available metadata.
 100  * For example, for block mapping extents, we verify that there are no
 101  * records in the free space and inode btrees corresponding to that
 102  * space extent and that there is a corresponding entry in the reverse
 103  * mapping btree.  Inconsistent metadata is noted by setting the
 104  * XCORRUPT flag; btree query function errors are noted by setting the
 105  * XFAIL flag and deleting the cursor to prevent further attempts to
 106  * cross-reference with a defective btree.
 107  *
 108  * If a piece of metadata proves corrupt or suboptimal, the userspace
 109  * program can ask the kernel to apply some tender loving care (TLC) to
 110  * the metadata object by setting the REPAIR flag and re-calling the
 111  * scrub ioctl.  "Corruption" is defined by metadata violating the
 112  * on-disk specification; operations cannot continue if the violation is
 113  * left untreated.  It is possible for XFS to continue if an object is
 114  * "suboptimal", however performance may be degraded.  Repairs are
 115  * usually performed by rebuilding the metadata entirely out of
 116  * redundant metadata.  Optimizing, on the other hand, can sometimes be
 117  * done without rebuilding entire structures.
 118  *
 119  * Generally speaking, the repair code has the following code structure:
 120  * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
 121  * The first check helps us figure out if we need to rebuild or simply
 122  * optimize the structure so that the rebuild knows what to do.  The
 123  * second check evaluates the completeness of the repair; that is what
 124  * is reported to userspace.
 125  *
 126  * A quick note on symbol prefixes:
 127  * - "xfs_" are general XFS symbols.
 128  * - "xchk_" are symbols related to metadata checking.
 129  * - "xrep_" are symbols related to metadata repair.
 130  * - "xfs_scrub_" are symbols that tie online fsck to the rest of XFS.
 131  */
 132
 133 /*
 134  * Scrub probe -- userspace uses this to probe if we're willing to scrub
 135  * or repair a given mountpoint.  This will be used by xfs_scrub to
 136  * probe the kernel's abilities to scrub (and repair) the metadata.  We
 137  * do this by validating the ioctl inputs from userspace, preparing the
 138  * filesystem for a scrub (or a repair) operation, and immediately
 139  * returning to userspace.  Userspace can use the returned errno and
 140  * structure state to decide (in broad terms) if scrub/repair are
 141  * supported by the running kernel.
 142  */
 143 static int
 144 xchk_probe(
 145         struct xfs_scrub        *sc)
 146 {
 147         int                     error = 0;
 148
 149         if (xchk_should_terminate(sc, &error))
 150                 return error;
 151
 152         return 0;
 153 }
 154
 155 /* Scrub setup and teardown */
 156
 157 static inline void
 158 xchk_fsgates_disable(
 159         struct xfs_scrub        *sc)
 160 {
 161         if (!(sc->flags & XCHK_FSGATES_ALL))
 162                 return;
 163
 164         trace_xchk_fsgates_disable(sc, sc->flags & XCHK_FSGATES_ALL);
 165
 166         if (sc->flags & XCHK_FSGATES_DRAIN)
 167                 xfs_drain_wait_disable();
 168
 169         if (sc->flags & XCHK_FSGATES_QUOTA)
 170                 xfs_dqtrx_hook_disable();
 171
 172         if (sc->flags & XCHK_FSGATES_DIRENTS)
 173                 xfs_dir_hook_disable();
 174
 175         if (sc->flags & XCHK_FSGATES_RMAP)
 176                 xfs_rmap_hook_disable();
 177
 178         sc->flags &= ~XCHK_FSGATES_ALL;
 179 }
 180
 181 /* Free the resources associated with a scrub subtype. */
 182 void
 183 xchk_scrub_free_subord(
 184         struct xfs_scrub_subord *sub)
 185 {
 186         struct xfs_scrub        *sc = sub->parent_sc;
 187
 188         ASSERT(sc->ip == sub->sc.ip);
 189         ASSERT(sc->orphanage == sub->sc.orphanage);
 190         ASSERT(sc->tempip == sub->sc.tempip);
 191
 192         sc->sm->sm_type = sub->old_smtype;
 193         sc->sm->sm_flags = sub->old_smflags |
 194                                 (sc->sm->sm_flags & XFS_SCRUB_FLAGS_OUT);
 195         sc->tp = sub->sc.tp;
 196
 197         if (sub->sc.buf) {
 198                 if (sub->sc.buf_cleanup)
 199                         sub->sc.buf_cleanup(sub->sc.buf);
 200                 kvfree(sub->sc.buf);
 201         }
 202         if (sub->sc.xmbtp)
 203                 xmbuf_free(sub->sc.xmbtp);
 204         if (sub->sc.xfile)
 205                 xfile_destroy(sub->sc.xfile);
 206
 207         sc->ilock_flags = sub->sc.ilock_flags;
 208         sc->orphanage_ilock_flags = sub->sc.orphanage_ilock_flags;
 209         sc->temp_ilock_flags = sub->sc.temp_ilock_flags;
 210
 211         kfree(sub);
 212 }
 213
 214 /* Free all the resources and finish the transactions. */
 215 STATIC int
 216 xchk_teardown(
 217         struct xfs_scrub        *sc,
 218         int                     error)
 219 {
 220         xchk_ag_free(sc, &sc->sa);
 221         if (sc->tp) {
 222                 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 223                         error = xfs_trans_commit(sc->tp);
 224                 else
 225                         xfs_trans_cancel(sc->tp);
 226                 sc->tp = NULL;
 227         }
 228         if (sc->ip) {
 229                 if (sc->ilock_flags)
 230                         xchk_iunlock(sc, sc->ilock_flags);
 231                 xchk_irele(sc, sc->ip);
 232                 sc->ip = NULL;
 233         }
 234         if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
 235                 sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
 236                 mnt_drop_write_file(sc->file);
 237         }
 238         if (sc->xmbtp) {
 239                 xmbuf_free(sc->xmbtp);
 240                 sc->xmbtp = NULL;
 241         }
 242         if (sc->xfile) {
 243                 xfile_destroy(sc->xfile);
 244                 sc->xfile = NULL;
 245         }
 246         if (sc->buf) {
 247                 if (sc->buf_cleanup)
 248                         sc->buf_cleanup(sc->buf);
 249                 kvfree(sc->buf);
 250                 sc->buf_cleanup = NULL;
 251                 sc->buf = NULL;
 252         }
 253
 254         xrep_tempfile_rele(sc);
 255         xrep_orphanage_rele(sc);
 256         xchk_fsgates_disable(sc);
 257         return error;
 258 }
 259
 260 /* Scrubbing dispatch. */
 261
 262 static const struct xchk_meta_ops meta_scrub_ops[] = {
 263         [XFS_SCRUB_TYPE_PROBE] = {      /* ioctl presence test */
 264                 .type   = ST_NONE,
 265                 .setup  = xchk_setup_fs,
 266                 .scrub  = xchk_probe,
 267                 .repair = xrep_probe,
 268         },
 269         [XFS_SCRUB_TYPE_SB] = {         /* superblock */
 270                 .type   = ST_PERAG,
 271                 .setup  = xchk_setup_agheader,
 272                 .scrub  = xchk_superblock,
 273                 .repair = xrep_superblock,
 274         },
 275         [XFS_SCRUB_TYPE_AGF] = {        /* agf */
 276                 .type   = ST_PERAG,
 277                 .setup  = xchk_setup_agheader,
 278                 .scrub  = xchk_agf,
 279                 .repair = xrep_agf,
 280         },
 281         [XFS_SCRUB_TYPE_AGFL]= {        /* agfl */
 282                 .type   = ST_PERAG,
 283                 .setup  = xchk_setup_agheader,
 284                 .scrub  = xchk_agfl,
 285                 .repair = xrep_agfl,
 286         },
 287         [XFS_SCRUB_TYPE_AGI] = {        /* agi */
 288                 .type   = ST_PERAG,
 289                 .setup  = xchk_setup_agheader,
 290                 .scrub  = xchk_agi,
 291                 .repair = xrep_agi,
 292         },
 293         [XFS_SCRUB_TYPE_BNOBT] = {      /* bnobt */
 294                 .type   = ST_PERAG,
 295                 .setup  = xchk_setup_ag_allocbt,
 296                 .scrub  = xchk_allocbt,
 297                 .repair = xrep_allocbt,
 298                 .repair_eval = xrep_revalidate_allocbt,
 299         },
 300         [XFS_SCRUB_TYPE_CNTBT] = {      /* cntbt */
 301                 .type   = ST_PERAG,
 302                 .setup  = xchk_setup_ag_allocbt,
 303                 .scrub  = xchk_allocbt,
 304                 .repair = xrep_allocbt,
 305                 .repair_eval = xrep_revalidate_allocbt,
 306         },
 307         [XFS_SCRUB_TYPE_INOBT] = {      /* inobt */
 308                 .type   = ST_PERAG,
 309                 .setup  = xchk_setup_ag_iallocbt,
 310                 .scrub  = xchk_iallocbt,
 311                 .repair = xrep_iallocbt,
 312                 .repair_eval = xrep_revalidate_iallocbt,
 313         },
 314         [XFS_SCRUB_TYPE_FINOBT] = {     /* finobt */
 315                 .type   = ST_PERAG,
 316                 .setup  = xchk_setup_ag_iallocbt,
 317                 .scrub  = xchk_iallocbt,
 318                 .has    = xfs_has_finobt,
 319                 .repair = xrep_iallocbt,
 320                 .repair_eval = xrep_revalidate_iallocbt,
 321         },
 322         [XFS_SCRUB_TYPE_RMAPBT] = {     /* rmapbt */
 323                 .type   = ST_PERAG,
 324                 .setup  = xchk_setup_ag_rmapbt,
 325                 .scrub  = xchk_rmapbt,
 326                 .has    = xfs_has_rmapbt,
 327                 .repair = xrep_rmapbt,
 328         },
 329         [XFS_SCRUB_TYPE_REFCNTBT] = {   /* refcountbt */
 330                 .type   = ST_PERAG,
 331                 .setup  = xchk_setup_ag_refcountbt,
 332                 .scrub  = xchk_refcountbt,
 333                 .has    = xfs_has_reflink,
 334                 .repair = xrep_refcountbt,
 335         },
 336         [XFS_SCRUB_TYPE_INODE] = {      /* inode record */
 337                 .type   = ST_INODE,
 338                 .setup  = xchk_setup_inode,
 339                 .scrub  = xchk_inode,
 340                 .repair = xrep_inode,
 341         },
 342         [XFS_SCRUB_TYPE_BMBTD] = {      /* inode data fork */
 343                 .type   = ST_INODE,
 344                 .setup  = xchk_setup_inode_bmap,
 345                 .scrub  = xchk_bmap_data,
 346                 .repair = xrep_bmap_data,
 347         },
 348         [XFS_SCRUB_TYPE_BMBTA] = {      /* inode attr fork */
 349                 .type   = ST_INODE,
 350                 .setup  = xchk_setup_inode_bmap,
 351                 .scrub  = xchk_bmap_attr,
 352                 .repair = xrep_bmap_attr,
 353         },
 354         [XFS_SCRUB_TYPE_BMBTC] = {      /* inode CoW fork */
 355                 .type   = ST_INODE,
 356                 .setup  = xchk_setup_inode_bmap,
 357                 .scrub  = xchk_bmap_cow,
 358                 .repair = xrep_bmap_cow,
 359         },
 360         [XFS_SCRUB_TYPE_DIR] = {        /* directory */
 361                 .type   = ST_INODE,
 362                 .setup  = xchk_setup_directory,
 363                 .scrub  = xchk_directory,
 364                 .repair = xrep_directory,
 365         },
 366         [XFS_SCRUB_TYPE_XATTR] = {      /* extended attributes */
 367                 .type   = ST_INODE,
 368                 .setup  = xchk_setup_xattr,
 369                 .scrub  = xchk_xattr,
 370                 .repair = xrep_xattr,
 371         },
 372         [XFS_SCRUB_TYPE_SYMLINK] = {    /* symbolic link */
 373                 .type   = ST_INODE,
 374                 .setup  = xchk_setup_symlink,
 375                 .scrub  = xchk_symlink,
 376                 .repair = xrep_symlink,
 377         },
 378         [XFS_SCRUB_TYPE_PARENT] = {     /* parent pointers */
 379                 .type   = ST_INODE,
 380                 .setup  = xchk_setup_parent,
 381                 .scrub  = xchk_parent,
 382                 .repair = xrep_parent,
 383         },
 384         [XFS_SCRUB_TYPE_RTBITMAP] = {   /* realtime bitmap */
 385                 .type   = ST_FS,
 386                 .setup  = xchk_setup_rtbitmap,
 387                 .scrub  = xchk_rtbitmap,
 388                 .repair = xrep_rtbitmap,
 389         },
 390         [XFS_SCRUB_TYPE_RTSUM] = {      /* realtime summary */
 391                 .type   = ST_FS,
 392                 .setup  = xchk_setup_rtsummary,
 393                 .scrub  = xchk_rtsummary,
 394                 .repair = xrep_rtsummary,
 395         },
 396         [XFS_SCRUB_TYPE_UQUOTA] = {     /* user quota */
 397                 .type   = ST_FS,
 398                 .setup  = xchk_setup_quota,
 399                 .scrub  = xchk_quota,
 400                 .repair = xrep_quota,
 401         },
 402         [XFS_SCRUB_TYPE_GQUOTA] = {     /* group quota */
 403                 .type   = ST_FS,
 404                 .setup  = xchk_setup_quota,
 405                 .scrub  = xchk_quota,
 406                 .repair = xrep_quota,
 407         },
 408         [XFS_SCRUB_TYPE_PQUOTA] = {     /* project quota */
 409                 .type   = ST_FS,
 410                 .setup  = xchk_setup_quota,
 411                 .scrub  = xchk_quota,
 412                 .repair = xrep_quota,
 413         },
 414         [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
 415                 .type   = ST_FS,
 416                 .setup  = xchk_setup_fscounters,
 417                 .scrub  = xchk_fscounters,
 418                 .repair = xrep_fscounters,
 419         },
 420         [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */
 421                 .type   = ST_FS,
 422                 .setup  = xchk_setup_quotacheck,
 423                 .scrub  = xchk_quotacheck,
 424                 .repair = xrep_quotacheck,
 425         },
 426         [XFS_SCRUB_TYPE_NLINKS] = {     /* inode link counts */
 427                 .type   = ST_FS,
 428                 .setup  = xchk_setup_nlinks,
 429                 .scrub  = xchk_nlinks,
 430                 .repair = xrep_nlinks,
 431         },
 432         [XFS_SCRUB_TYPE_HEALTHY] = {    /* fs healthy; clean all reminders */
 433                 .type   = ST_FS,
 434                 .setup  = xchk_setup_fs,
 435                 .scrub  = xchk_health_record,
 436                 .repair = xrep_notsupported,
 437         },
 438         [XFS_SCRUB_TYPE_DIRTREE] = {    /* directory tree structure */
 439                 .type   = ST_INODE,
 440                 .setup  = xchk_setup_dirtree,
 441                 .scrub  = xchk_dirtree,
 442                 .has    = xfs_has_parent,
 443                 .repair = xrep_dirtree,
 444         },
 445 };
 446
 447 static int
 448 xchk_validate_inputs(
 449         struct xfs_mount                *mp,
 450         struct xfs_scrub_metadata       *sm)
 451 {
 452         int                             error;
 453         const struct xchk_meta_ops      *ops;
 454
 455         error = -EINVAL;
 456         /* Check our inputs. */
 457         sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 458         if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
 459                 goto out;
 460         /* sm_reserved[] must be zero */
 461         if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
 462                 goto out;
 463
 464         error = -ENOENT;
 465         /* Do we know about this type of metadata? */
 466         if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
 467                 goto out;
 468         ops = &meta_scrub_ops[sm->sm_type];
 469         if (ops->setup == NULL || ops->scrub == NULL)
 470                 goto out;
 471         /* Does this fs even support this type of metadata? */
 472         if (ops->has && !ops->has(mp))
 473                 goto out;
 474
 475         error = -EINVAL;
 476         /* restricting fields must be appropriate for type */
 477         switch (ops->type) {
 478         case ST_NONE:
 479         case ST_FS:
 480                 if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
 481                         goto out;
 482                 break;
 483         case ST_PERAG:
 484                 if (sm->sm_ino || sm->sm_gen ||
 485                     sm->sm_agno >= mp->m_sb.sb_agcount)
 486                         goto out;
 487                 break;
 488         case ST_INODE:
 489                 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
 490                         goto out;
 491                 break;
 492         default:
 493                 goto out;
 494         }
 495
 496         /* No rebuild without repair. */
 497         if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
 498             !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 499                 return -EINVAL;
 500
 501         /*
 502          * We only want to repair read-write v5+ filesystems.  Defer the check
 503          * for ops->repair until after our scrub confirms that we need to
 504          * perform repairs so that we avoid failing due to not supporting
 505          * repairing an object that doesn't need repairs.
 506          */
 507         if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
 508                 error = -EOPNOTSUPP;
 509                 if (!xfs_has_crc(mp))
 510                         goto out;
 511
 512                 error = -EROFS;
 513                 if (xfs_is_readonly(mp))
 514                         goto out;
 515         }
 516
 517         error = 0;
 518 out:
 519         return error;
 520 }
 521
 522 #ifdef CONFIG_XFS_ONLINE_REPAIR
 523 static inline void xchk_postmortem(struct xfs_scrub *sc)
 524 {
 525         /*
 526          * Userspace asked us to repair something, we repaired it, rescanned
 527          * it, and the rescan says it's still broken.  Scream about this in
 528          * the system logs.
 529          */
 530         if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
 531             (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
 532                                  XFS_SCRUB_OFLAG_XCORRUPT)))
 533                 xrep_failure(sc->mp);
 534 }
 535 #else
 536 static inline void xchk_postmortem(struct xfs_scrub *sc)
 537 {
 538         /*
 539          * Userspace asked us to scrub something, it's broken, and we have no
 540          * way of fixing it.  Scream in the logs.
 541          */
 542         if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
 543                                 XFS_SCRUB_OFLAG_XCORRUPT))
 544                 xfs_alert_ratelimited(sc->mp,
 545                                 "Corruption detected during scrub.");
 546 }
 547 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 548
 549 /*
 550  * Create a new scrub context from an existing one, but with a different scrub
 551  * type.
 552  */
 553 struct xfs_scrub_subord *
 554 xchk_scrub_create_subord(
 555         struct xfs_scrub        *sc,
 556         unsigned int            subtype)
 557 {
 558         struct xfs_scrub_subord *sub;
 559
 560         sub = kzalloc(sizeof(*sub), XCHK_GFP_FLAGS);
 561         if (!sub)
 562                 return ERR_PTR(-ENOMEM);
 563
 564         sub->old_smtype = sc->sm->sm_type;
 565         sub->old_smflags = sc->sm->sm_flags;
 566         sub->parent_sc = sc;
 567         memcpy(&sub->sc, sc, sizeof(struct xfs_scrub));
 568         sub->sc.ops = &meta_scrub_ops[subtype];
 569         sub->sc.sm->sm_type = subtype;
 570         sub->sc.sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 571         sub->sc.buf = NULL;
 572         sub->sc.buf_cleanup = NULL;
 573         sub->sc.xfile = NULL;
 574         sub->sc.xmbtp = NULL;
 575
 576         return sub;
 577 }
 578
/*
 * Dispatch metadata scrubbing.
 *
 * Validate the request in @sm, allocate a scrub context, and run the setup
 * and scrub functions of the requested type.  If xchk_could_repair() and
 * xrep_will_attempt() both agree, xrep_attempt() is called; an -EAGAIN from
 * it tears the context down and reruns the operation from retry_op.
 *
 * -EDEADLOCK and -ECHRNG from the setup or scrub function each cause a
 * teardown and a retry with XCHK_TRY_HARDER or XCHK_NEED_DRAIN set in
 * sc->flags, respectively.  Once the flag is set, the same error code is
 * handled like any other error.
 *
 * -EFSCORRUPTED and -EFSBADCRC are not returned to the caller.  They are
 * reported by setting XFS_SCRUB_OFLAG_CORRUPT in sm->sm_flags and returning
 * zero.  Any other error is returned as is.
 */
STATIC int
xfs_scrub_metadata(
        struct file                     *file,
        struct xfs_scrub_metadata       *sm)
{
        struct xchk_stats_run           run = { };
        struct xfs_scrub                *sc;
        struct xfs_mount                *mp = XFS_I(file_inode(file))->i_mount;
        u64                             check_start;
        int                             error = 0;

        /* The dispatch table must have exactly one slot per scrub type. */
        BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
                (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR));

        trace_xchk_start(XFS_I(file_inode(file)), sm, error);

        /* Forbidden if we are shut down or mounted norecovery. */
        error = -ESHUTDOWN;
        if (xfs_is_shutdown(mp))
                goto out;
        error = -ENOTRECOVERABLE;
        if (xfs_has_norecovery(mp))
                goto out;

        error = xchk_validate_inputs(mp, sm);
        if (error)
                goto out;

        xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
 "EXPERIMENTAL online scrub feature in use. Use at your own risk!");

        sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
        if (!sc) {
                error = -ENOMEM;
                goto out;
        }

        sc->mp = mp;
        sc->file = file;
        sc->sm = sm;
        sc->ops = &meta_scrub_ops[sm->sm_type];
        sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type);
        sc->relax = INIT_XCHK_RELAX;
retry_op:
        /*
         * When repairs are allowed, prevent freezing or readonly remount while
         * scrub is running with a real transaction.
         */
        if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
                error = mnt_want_write_file(sc->file);
                if (error)
                        goto out_sc;

                /* Tells xchk_teardown() to drop the write access again. */
                sc->flags |= XCHK_HAVE_FREEZE_PROT;
        }

        /* Set up for the operation. */
        error = sc->ops->setup(sc);
        if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
                goto try_harder;
        if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
                goto need_drain;
        if (error)
                goto out_teardown;

        /*
         * Scrub for errors.  If the context is marked XREP_ALREADY_FIXED and
         * this type provides a repair_eval function, call that instead of
         * the scrub function.
         */
        check_start = xchk_stats_now();
        if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
                error = sc->ops->repair_eval(sc);
        else
                error = sc->ops->scrub(sc);
        run.scrub_ns += xchk_stats_elapsed_ns(check_start);
        if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
                goto try_harder;
        if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
                goto need_drain;
        /* An incomplete scan skips the health update and any repair. */
        if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE))
                goto out_teardown;

        xchk_update_health(sc);

        if (xchk_could_repair(sc)) {
                /*
                 * If userspace asked for a repair but it wasn't necessary,
                 * report that back to userspace.
                 */
                if (!xrep_will_attempt(sc)) {
                        sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
                        goto out_nofix;
                }

                /*
                 * If it's broken, userspace wants us to fix it, and we haven't
                 * already tried to fix it, then attempt a repair.
                 */
                error = xrep_attempt(sc, &run);
                if (error == -EAGAIN) {
                        /*
                         * Either the repair function succeeded or it couldn't
                         * get all the resources it needs; either way, we go
                         * back to the beginning and call the scrub function.
                         */
                        error = xchk_teardown(sc, 0);
                        if (error) {
                                xrep_failure(mp);
                                goto out_sc;
                        }
                        goto retry_op;
                }
        }

out_nofix:
        xchk_postmortem(sc);
out_teardown:
        error = xchk_teardown(sc, error);
out_sc:
        /* Statistics are not recorded when the result is -ENOENT. */
        if (error != -ENOENT)
                xchk_stats_merge(mp, sm, &run);
        kfree(sc);
out:
        trace_xchk_done(XFS_I(file_inode(file)), sm, error);
        /* Report corruption through the output flags, not through errno. */
        if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
                sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
                error = 0;
        }
        return error;

        /* The retry paths below are reached only by goto. */
need_drain:
        /*
         * Setup or scrub returned -ECHRNG for the first time.  Tear down
         * everything we hold and retry with XCHK_NEED_DRAIN set.
         */
        error = xchk_teardown(sc, 0);
        if (error)
                goto out_sc;
        sc->flags |= XCHK_NEED_DRAIN;
        run.retries++;
        goto retry_op;
try_harder:
        /*
         * Scrubbers return -EDEADLOCK to mean 'try harder'.  Tear down
         * everything we hold, then set up again with preparation for
         * worst-case scenarios.
         */
        error = xchk_teardown(sc, 0);
        if (error)
                goto out_sc;
        sc->flags |= XCHK_TRY_HARDER;
        run.retries++;
        goto retry_op;
}
 726
 727 /* Scrub one aspect of one piece of metadata. */
 728 int
 729 xfs_ioc_scrub_metadata(
 730         struct file                     *file,
 731         void                            __user *arg)
 732 {
 733         struct xfs_scrub_metadata       scrub;
 734         int                             error;
 735
 736         if (!capable(CAP_SYS_ADMIN))
 737                 return -EPERM;
 738
 739         if (copy_from_user(&scrub, arg, sizeof(scrub)))
 740                 return -EFAULT;
 741
 742         error = xfs_scrub_metadata(file, &scrub);
 743         if (error)
 744                 return error;
 745
 746         if (copy_to_user(arg, &scrub, sizeof(scrub)))
 747                 return -EFAULT;
 748
 749         return 0;
 750 }
 751
 752 /* Decide if there have been any scrub failures up to this point. */
 753 static inline int
 754 xfs_scrubv_check_barrier(
 755         struct xfs_mount                *mp,
 756         const struct xfs_scrub_vec      *vectors,
 757         const struct xfs_scrub_vec      *stop_vec)
 758 {
 759         const struct xfs_scrub_vec      *v;
 760         __u32                           failmask;
 761
 762         failmask = stop_vec->sv_flags & XFS_SCRUB_FLAGS_OUT;
 763
 764         for (v = vectors; v < stop_vec; v++) {
 765                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER)
 766                         continue;
 767
 768                 /*
 769                  * Runtime errors count as a previous failure, except the ones
 770                  * used to ask userspace to retry.
 771                  */
 772                 switch (v->sv_ret) {
 773                 case -EBUSY:
 774                 case -ENOENT:
 775                 case -EUSERS:
 776                 case 0:
 777                         break;
 778                 default:
 779                         return -ECANCELED;
 780                 }
 781
 782                 /*
 783                  * If any of the out-flags on the scrub vector match the mask
 784                  * that was set on the barrier vector, that's a previous fail.
 785                  */
 786                 if (v->sv_flags & failmask)
 787                         return -ECANCELED;
 788         }
 789
 790         return 0;
 791 }
 792
 793 /*
 794  * If the caller provided us with a nonzero inode number that isn't the ioctl
 795  * file, try to grab a reference to it to eliminate all further untrusted inode
 796  * lookups.  If we can't get the inode, let each scrub function try again.
 797  */
 798 STATIC struct xfs_inode *
 799 xchk_scrubv_open_by_handle(
 800         struct xfs_mount                *mp,
 801         const struct xfs_scrub_vec_head *head)
 802 {
 803         struct xfs_trans                *tp;
 804         struct xfs_inode                *ip;
 805         int                             error;
 806
 807         error = xfs_trans_alloc_empty(mp, &tp);
 808         if (error)
 809                 return NULL;
 810
 811         error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
 812         xfs_trans_cancel(tp);
 813         if (error)
 814                 return NULL;
 815
 816         if (VFS_I(ip)->i_generation != head->svh_gen) {
 817                 xfs_irele(ip);
 818                 return NULL;
 819         }
 820
 821         return ip;
 822 }
 823
 824 /* Vectored scrub implementation to reduce ioctl calls. */
 825 int
 826 xfs_ioc_scrubv_metadata(
 827         struct file                     *file,
 828         void                            __user *arg)
 829 {
 830         struct xfs_scrub_vec_head       head;
 831         struct xfs_scrub_vec_head       __user *uhead = arg;
 832         struct xfs_scrub_vec            *vectors;
 833         struct xfs_scrub_vec            __user *uvectors;
 834         struct xfs_inode                *ip_in = XFS_I(file_inode(file));
 835         struct xfs_mount                *mp = ip_in->i_mount;
 836         struct xfs_inode                *handle_ip = NULL;
 837         struct xfs_scrub_vec            *v;
 838         size_t                          vec_bytes;
 839         unsigned int                    i;
 840         int                             error = 0;
 841
 842         if (!capable(CAP_SYS_ADMIN))
 843                 return -EPERM;
 844
 845         if (copy_from_user(&head, uhead, sizeof(head)))
 846                 return -EFAULT;
 847
 848         if (head.svh_reserved)
 849                 return -EINVAL;
 850         if (head.svh_flags & ~XFS_SCRUB_VEC_FLAGS_ALL)
 851                 return -EINVAL;
 852         if (head.svh_nr == 0)
 853                 return 0;
 854
 855         vec_bytes = array_size(head.svh_nr, sizeof(struct xfs_scrub_vec));
 856         if (vec_bytes > PAGE_SIZE)
 857                 return -ENOMEM;
 858
 859         uvectors = u64_to_user_ptr(head.svh_vectors);
 860         vectors = memdup_user(uvectors, vec_bytes);
 861         if (IS_ERR(vectors))
 862                 return PTR_ERR(vectors);
 863
 864         trace_xchk_scrubv_start(ip_in, &head);
 865
 866         for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
 867                 if (v->sv_reserved) {
 868                         error = -EINVAL;
 869                         goto out_free;
 870                 }
 871
 872                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER &&
 873                     (v->sv_flags & ~XFS_SCRUB_FLAGS_OUT)) {
 874                         error = -EINVAL;
 875                         goto out_free;
 876                 }
 877
 878                 trace_xchk_scrubv_item(mp, &head, i, v);
 879         }
 880
 881         /*
 882          * If the caller wants us to do a scrub-by-handle and the file used to
 883          * call the ioctl is not the same file, load the incore inode and pin
 884          * it across all the scrubv actions to avoid repeated UNTRUSTED
 885          * lookups.  The reference is not passed to deeper layers of scrub
 886          * because each scrubber gets to decide its own strategy and return
 887          * values for getting an inode.
 888          */
 889         if (head.svh_ino && head.svh_ino != ip_in->i_ino)
 890                 handle_ip = xchk_scrubv_open_by_handle(mp, &head);
 891
 892         /* Run all the scrubbers. */
 893         for (i = 0, v = vectors; i < head.svh_nr; i++, v++) {
 894                 struct xfs_scrub_metadata       sm = {
 895                         .sm_type                = v->sv_type,
 896                         .sm_flags               = v->sv_flags,
 897                         .sm_ino                 = head.svh_ino,
 898                         .sm_gen                 = head.svh_gen,
 899                         .sm_agno                = head.svh_agno,
 900                 };
 901
 902                 if (v->sv_type == XFS_SCRUB_TYPE_BARRIER) {
 903                         v->sv_ret = xfs_scrubv_check_barrier(mp, vectors, v);
 904                         if (v->sv_ret) {
 905                                 trace_xchk_scrubv_barrier_fail(mp, &head, i, v);
 906                                 break;
 907                         }
 908
 909                         continue;
 910                 }
 911
 912                 v->sv_ret = xfs_scrub_metadata(file, &sm);
 913                 v->sv_flags = sm.sm_flags;
 914
 915                 trace_xchk_scrubv_outcome(mp, &head, i, v);
 916
 917                 if (head.svh_rest_us) {
 918                         ktime_t         expires;
 919
 920                         expires = ktime_add_ns(ktime_get(),
 921                                         head.svh_rest_us * 1000);
 922                         set_current_state(TASK_KILLABLE);
 923                         schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
 924                 }
 925
 926                 if (fatal_signal_pending(current)) {
 927                         error = -EINTR;
 928                         goto out_free;
 929                 }
 930         }
 931
 932         if (copy_to_user(uvectors, vectors, vec_bytes) ||
 933             copy_to_user(uhead, &head, sizeof(head))) {
 934                 error = -EFAULT;
 935                 goto out_free;
 936         }
 937
 938 out_free:
 939         if (handle_ip)
 940                 xfs_irele(handle_ip);
 941         kfree(vectors);
 942         return error;
 943 }