fs/xfs/xfs_log_recover.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_shared.h"
  21 #include "xfs_format.h"
  22 #include "xfs_log_format.h"
  23 #include "xfs_trans_resv.h"
  24 #include "xfs_bit.h"
  25 #include "xfs_sb.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_da_format.h"
  28 #include "xfs_da_btree.h"
  29 #include "xfs_inode.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_log.h"
  32 #include "xfs_log_priv.h"
  33 #include "xfs_log_recover.h"
  34 #include "xfs_inode_item.h"
  35 #include "xfs_extfree_item.h"
  36 #include "xfs_trans_priv.h"
  37 #include "xfs_alloc.h"
  38 #include "xfs_ialloc.h"
  39 #include "xfs_quota.h"
  40 #include "xfs_cksum.h"
  41 #include "xfs_trace.h"
  42 #include "xfs_icache.h"
  43 #include "xfs_bmap_btree.h"
  44 #include "xfs_error.h"
  45 #include "xfs_dir2.h"
  46
  47 #define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
  48
  49 STATIC int
  50 xlog_find_zeroed(
  51         struct xlog     *,
  52         xfs_daddr_t     *);
  53 STATIC int
  54 xlog_clear_stale_blocks(
  55         struct xlog     *,
  56         xfs_lsn_t);
  57 #if defined(DEBUG)
  58 STATIC void
  59 xlog_recover_check_summary(
  60         struct xlog *);
  61 #else
  62 #define xlog_recover_check_summary(log)
  63 #endif
  64 STATIC int
  65 xlog_do_recovery_pass(
  66         struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
  67
  68 /*
  69  * This structure is used during recovery to record the buf log items which
  70  * have been canceled and should not be replayed.
  71  */
  72 struct xfs_buf_cancel {
  73         xfs_daddr_t             bc_blkno;
  74         uint                    bc_len;
  75         int                     bc_refcount;
  76         struct list_head        bc_list;
  77 };
  78
  79 /*
  80  * Sector aligned buffer routines for buffer create/read/write/access
  81  */
  82
  83 /*
  84  * Verify the given count of basic blocks is valid number of blocks
  85  * to specify for an operation involving the given XFS log buffer.
  86  * Returns nonzero if the count is valid, 0 otherwise.
  87  */
  88
  89 static inline int
  90 xlog_buf_bbcount_valid(
  91         struct xlog     *log,
  92         int             bbcount)
  93 {
  94         return bbcount > 0 && bbcount <= log->l_logBBsize;
  95 }
  96
  97 /*
  98  * Allocate a buffer to hold log data.  The buffer needs to be able
  99  * to map to a range of nbblks basic blocks at any valid (basic
 100  * block) offset within the log.
 101  */
 102 STATIC xfs_buf_t *
 103 xlog_get_bp(
 104         struct xlog     *log,
 105         int             nbblks)
 106 {
 107         struct xfs_buf  *bp;
 108
 109         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 110                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 111                         nbblks);
 112                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 113                 return NULL;
 114         }
 115
 116         /*
 117          * We do log I/O in units of log sectors (a power-of-2
 118          * multiple of the basic block size), so we round up the
 119          * requested size to accommodate the basic blocks required
 120          * for complete log sectors.
 121          *
 122          * In addition, the buffer may be used for a non-sector-
 123          * aligned block offset, in which case an I/O of the
 124          * requested size could extend beyond the end of the
 125          * buffer.  If the requested size is only 1 basic block it
 126          * will never straddle a sector boundary, so this won't be
 127          * an issue.  Nor will this be a problem if the log I/O is
 128          * done in basic blocks (sector size 1).  But otherwise we
 129          * extend the buffer by one extra log sector to ensure
 130          * there's space to accommodate this possibility.
 131          */
 132         if (nbblks > 1 && log->l_sectBBsize > 1)
 133                 nbblks += log->l_sectBBsize;
 134         nbblks = round_up(nbblks, log->l_sectBBsize);
 135
 136         bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
 137         if (bp)
 138                 xfs_buf_unlock(bp);
 139         return bp;
 140 }
 141
 142 STATIC void
 143 xlog_put_bp(
 144         xfs_buf_t       *bp)
 145 {
 146         xfs_buf_free(bp);
 147 }
 148
 149 /*
 150  * Return the address of the start of the given block number's data
 151  * in a log buffer.  The buffer covers a log sector-aligned region.
 152  */
 153 STATIC char *
 154 xlog_align(
 155         struct xlog     *log,
 156         xfs_daddr_t     blk_no,
 157         int             nbblks,
 158         struct xfs_buf  *bp)
 159 {
 160         xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 161
 162         ASSERT(offset + nbblks <= bp->b_length);
 163         return bp->b_addr + BBTOB(offset);
 164 }
 165
 166
 167 /*
 168  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 169  */
 170 STATIC int
 171 xlog_bread_noalign(
 172         struct xlog     *log,
 173         xfs_daddr_t     blk_no,
 174         int             nbblks,
 175         struct xfs_buf  *bp)
 176 {
 177         int             error;
 178
 179         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 180                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 181                         nbblks);
 182                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 183                 return -EFSCORRUPTED;
 184         }
 185
 186         blk_no = round_down(blk_no, log->l_sectBBsize);
 187         nbblks = round_up(nbblks, log->l_sectBBsize);
 188
 189         ASSERT(nbblks > 0);
 190         ASSERT(nbblks <= bp->b_length);
 191
 192         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 193         XFS_BUF_READ(bp);
 194         bp->b_io_length = nbblks;
 195         bp->b_error = 0;
 196
 197         error = xfs_buf_submit_wait(bp);
 198         if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
 199                 xfs_buf_ioerror_alert(bp, __func__);
 200         return error;
 201 }
 202
 203 STATIC int
 204 xlog_bread(
 205         struct xlog     *log,
 206         xfs_daddr_t     blk_no,
 207         int             nbblks,
 208         struct xfs_buf  *bp,
 209         char            **offset)
 210 {
 211         int             error;
 212
 213         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 214         if (error)
 215                 return error;
 216
 217         *offset = xlog_align(log, blk_no, nbblks, bp);
 218         return 0;
 219 }
 220
 221 /*
 222  * Read at an offset into the buffer. Returns with the buffer in it's original
 223  * state regardless of the result of the read.
 224  */
 225 STATIC int
 226 xlog_bread_offset(
 227         struct xlog     *log,
 228         xfs_daddr_t     blk_no,         /* block to read from */
 229         int             nbblks,         /* blocks to read */
 230         struct xfs_buf  *bp,
 231         char            *offset)
 232 {
 233         char            *orig_offset = bp->b_addr;
 234         int             orig_len = BBTOB(bp->b_length);
 235         int             error, error2;
 236
 237         error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 238         if (error)
 239                 return error;
 240
 241         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 242
 243         /* must reset buffer pointer even on error */
 244         error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 245         if (error)
 246                 return error;
 247         return error2;
 248 }
 249
 250 /*
 251  * Write out the buffer at the given block for the given number of blocks.
 252  * The buffer is kept locked across the write and is returned locked.
 253  * This can only be used for synchronous log writes.
 254  */
 255 STATIC int
 256 xlog_bwrite(
 257         struct xlog     *log,
 258         xfs_daddr_t     blk_no,
 259         int             nbblks,
 260         struct xfs_buf  *bp)
 261 {
 262         int             error;
 263
 264         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 265                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 266                         nbblks);
 267                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 268                 return -EFSCORRUPTED;
 269         }
 270
 271         blk_no = round_down(blk_no, log->l_sectBBsize);
 272         nbblks = round_up(nbblks, log->l_sectBBsize);
 273
 274         ASSERT(nbblks > 0);
 275         ASSERT(nbblks <= bp->b_length);
 276
 277         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 278         XFS_BUF_ZEROFLAGS(bp);
 279         xfs_buf_hold(bp);
 280         xfs_buf_lock(bp);
 281         bp->b_io_length = nbblks;
 282         bp->b_error = 0;
 283
 284         error = xfs_bwrite(bp);
 285         if (error)
 286                 xfs_buf_ioerror_alert(bp, __func__);
 287         xfs_buf_relse(bp);
 288         return error;
 289 }
 290
 291 #ifdef DEBUG
 292 /*
 293  * dump debug superblock and log record information
 294  */
 295 STATIC void
 296 xlog_header_check_dump(
 297         xfs_mount_t             *mp,
 298         xlog_rec_header_t       *head)
 299 {
 300         xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
 301                 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 302         xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
 303                 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 304 }
 305 #else
 306 #define xlog_header_check_dump(mp, head)
 307 #endif
 308
 309 /*
 310  * check log record header for recovery
 311  */
 312 STATIC int
 313 xlog_header_check_recover(
 314         xfs_mount_t             *mp,
 315         xlog_rec_header_t       *head)
 316 {
 317         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 318
 319         /*
 320          * IRIX doesn't write the h_fmt field and leaves it zeroed
 321          * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 322          * a dirty log created in IRIX.
 323          */
 324         if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 325                 xfs_warn(mp,
 326         "dirty log written in incompatible format - can't recover");
 327                 xlog_header_check_dump(mp, head);
 328                 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 329                                  XFS_ERRLEVEL_HIGH, mp);
 330                 return -EFSCORRUPTED;
 331         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 332                 xfs_warn(mp,
 333         "dirty log entry has mismatched uuid - can't recover");
 334                 xlog_header_check_dump(mp, head);
 335                 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 336                                  XFS_ERRLEVEL_HIGH, mp);
 337                 return -EFSCORRUPTED;
 338         }
 339         return 0;
 340 }
 341
 342 /*
 343  * read the head block of the log and check the header
 344  */
 345 STATIC int
 346 xlog_header_check_mount(
 347         xfs_mount_t             *mp,
 348         xlog_rec_header_t       *head)
 349 {
 350         ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 351
 352         if (uuid_is_nil(&head->h_fs_uuid)) {
 353                 /*
 354                  * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 355                  * h_fs_uuid is nil, we assume this log was last mounted
 356                  * by IRIX and continue.
 357                  */
 358                 xfs_warn(mp, "nil uuid in log - IRIX style log");
 359         } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 360                 xfs_warn(mp, "log has mismatched uuid - can't recover");
 361                 xlog_header_check_dump(mp, head);
 362                 XFS_ERROR_REPORT("xlog_header_check_mount",
 363                                  XFS_ERRLEVEL_HIGH, mp);
 364                 return -EFSCORRUPTED;
 365         }
 366         return 0;
 367 }
 368
 369 STATIC void
 370 xlog_recover_iodone(
 371         struct xfs_buf  *bp)
 372 {
 373         if (bp->b_error) {
 374                 /*
 375                  * We're not going to bother about retrying
 376                  * this during recovery. One strike!
 377                  */
 378                 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 379                         xfs_buf_ioerror_alert(bp, __func__);
 380                         xfs_force_shutdown(bp->b_target->bt_mount,
 381                                                 SHUTDOWN_META_IO_ERROR);
 382                 }
 383         }
 384         bp->b_iodone = NULL;
 385         xfs_buf_ioend(bp);
 386 }
 387
 388 /*
 389  * This routine finds (to an approximation) the first block in the physical
 390  * log which contains the given cycle.  It uses a binary search algorithm.
 391  * Note that the algorithm can not be perfect because the disk will not
 392  * necessarily be perfect.
 393  */
 394 STATIC int
 395 xlog_find_cycle_start(
 396         struct xlog     *log,
 397         struct xfs_buf  *bp,
 398         xfs_daddr_t     first_blk,
 399         xfs_daddr_t     *last_blk,
 400         uint            cycle)
 401 {
 402         char            *offset;
 403         xfs_daddr_t     mid_blk;
 404         xfs_daddr_t     end_blk;
 405         uint            mid_cycle;
 406         int             error;
 407
 408         end_blk = *last_blk;
 409         mid_blk = BLK_AVG(first_blk, end_blk);
 410         while (mid_blk != first_blk && mid_blk != end_blk) {
 411                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
 412                 if (error)
 413                         return error;
 414                 mid_cycle = xlog_get_cycle(offset);
 415                 if (mid_cycle == cycle)
 416                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 417                 else
 418                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 419                 mid_blk = BLK_AVG(first_blk, end_blk);
 420         }
 421         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 422                (mid_blk == end_blk && mid_blk-1 == first_blk));
 423
 424         *last_blk = end_blk;
 425
 426         return 0;
 427 }
 428
 429 /*
 430  * Check that a range of blocks does not contain stop_on_cycle_no.
 431  * Fill in *new_blk with the block offset where such a block is
 432  * found, or with -1 (an invalid block number) if there is no such
 433  * block in the range.  The scan needs to occur from front to back
 434  * and the pointer into the region must be updated since a later
 435  * routine will need to perform another test.
 436  */
 437 STATIC int
 438 xlog_find_verify_cycle(
 439         struct xlog     *log,
 440         xfs_daddr_t     start_blk,
 441         int             nbblks,
 442         uint            stop_on_cycle_no,
 443         xfs_daddr_t     *new_blk)
 444 {
 445         xfs_daddr_t     i, j;
 446         uint            cycle;
 447         xfs_buf_t       *bp;
 448         xfs_daddr_t     bufblks;
 449         char            *buf = NULL;
 450         int             error = 0;
 451
 452         /*
 453          * Greedily allocate a buffer big enough to handle the full
 454          * range of basic blocks we'll be examining.  If that fails,
 455          * try a smaller size.  We need to be able to read at least
 456          * a log sector, or we're out of luck.
 457          */
 458         bufblks = 1 << ffs(nbblks);
 459         while (bufblks > log->l_logBBsize)
 460                 bufblks >>= 1;
 461         while (!(bp = xlog_get_bp(log, bufblks))) {
 462                 bufblks >>= 1;
 463                 if (bufblks < log->l_sectBBsize)
 464                         return -ENOMEM;
 465         }
 466
 467         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 468                 int     bcount;
 469
 470                 bcount = min(bufblks, (start_blk + nbblks - i));
 471
 472                 error = xlog_bread(log, i, bcount, bp, &buf);
 473                 if (error)
 474                         goto out;
 475
 476                 for (j = 0; j < bcount; j++) {
 477                         cycle = xlog_get_cycle(buf);
 478                         if (cycle == stop_on_cycle_no) {
 479                                 *new_blk = i+j;
 480                                 goto out;
 481                         }
 482
 483                         buf += BBSIZE;
 484                 }
 485         }
 486
 487         *new_blk = -1;
 488
 489 out:
 490         xlog_put_bp(bp);
 491         return error;
 492 }
 493
 494 /*
 495  * Potentially backup over partial log record write.
 496  *
 497  * In the typical case, last_blk is the number of the block directly after
 498  * a good log record.  Therefore, we subtract one to get the block number
 499  * of the last block in the given buffer.  extra_bblks contains the number
 500  * of blocks we would have read on a previous read.  This happens when the
 501  * last log record is split over the end of the physical log.
 502  *
 503  * extra_bblks is the number of blocks potentially verified on a previous
 504  * call to this routine.
 505  */
 506 STATIC int
 507 xlog_find_verify_log_record(
 508         struct xlog             *log,
 509         xfs_daddr_t             start_blk,
 510         xfs_daddr_t             *last_blk,
 511         int                     extra_bblks)
 512 {
 513         xfs_daddr_t             i;
 514         xfs_buf_t               *bp;
 515         char                    *offset = NULL;
 516         xlog_rec_header_t       *head = NULL;
 517         int                     error = 0;
 518         int                     smallmem = 0;
 519         int                     num_blks = *last_blk - start_blk;
 520         int                     xhdrs;
 521
 522         ASSERT(start_blk != 0 || *last_blk != start_blk);
 523
 524         if (!(bp = xlog_get_bp(log, num_blks))) {
 525                 if (!(bp = xlog_get_bp(log, 1)))
 526                         return -ENOMEM;
 527                 smallmem = 1;
 528         } else {
 529                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 530                 if (error)
 531                         goto out;
 532                 offset += ((num_blks - 1) << BBSHIFT);
 533         }
 534
 535         for (i = (*last_blk) - 1; i >= 0; i--) {
 536                 if (i < start_blk) {
 537                         /* valid log record not found */
 538                         xfs_warn(log->l_mp,
 539                 "Log inconsistent (didn't find previous header)");
 540                         ASSERT(0);
 541                         error = -EIO;
 542                         goto out;
 543                 }
 544
 545                 if (smallmem) {
 546                         error = xlog_bread(log, i, 1, bp, &offset);
 547                         if (error)
 548                                 goto out;
 549                 }
 550
 551                 head = (xlog_rec_header_t *)offset;
 552
 553                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 554                         break;
 555
 556                 if (!smallmem)
 557                         offset -= BBSIZE;
 558         }
 559
 560         /*
 561          * We hit the beginning of the physical log & still no header.  Return
 562          * to caller.  If caller can handle a return of -1, then this routine
 563          * will be called again for the end of the physical log.
 564          */
 565         if (i == -1) {
 566                 error = 1;
 567                 goto out;
 568         }
 569
 570         /*
 571          * We have the final block of the good log (the first block
 572          * of the log record _before_ the head. So we check the uuid.
 573          */
 574         if ((error = xlog_header_check_mount(log->l_mp, head)))
 575                 goto out;
 576
 577         /*
 578          * We may have found a log record header before we expected one.
 579          * last_blk will be the 1st block # with a given cycle #.  We may end
 580          * up reading an entire log record.  In this case, we don't want to
 581          * reset last_blk.  Only when last_blk points in the middle of a log
 582          * record do we update last_blk.
 583          */
 584         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 585                 uint    h_size = be32_to_cpu(head->h_size);
 586
 587                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 588                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
 589                         xhdrs++;
 590         } else {
 591                 xhdrs = 1;
 592         }
 593
 594         if (*last_blk - i + extra_bblks !=
 595             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 596                 *last_blk = i;
 597
 598 out:
 599         xlog_put_bp(bp);
 600         return error;
 601 }
 602
 603 /*
 604  * Head is defined to be the point of the log where the next log write
 605  * could go.  This means that incomplete LR writes at the end are
 606  * eliminated when calculating the head.  We aren't guaranteed that previous
 607  * LR have complete transactions.  We only know that a cycle number of
 608  * current cycle number -1 won't be present in the log if we start writing
 609  * from our current block number.
 610  *
 611  * last_blk contains the block number of the first block with a given
 612  * cycle number.
 613  *
 614  * Return: zero if normal, non-zero if error.
 615  */
 616 STATIC int
 617 xlog_find_head(
 618         struct xlog     *log,
 619         xfs_daddr_t     *return_head_blk)
 620 {
 621         xfs_buf_t       *bp;
 622         char            *offset;
 623         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 624         int             num_scan_bblks;
 625         uint            first_half_cycle, last_half_cycle;
 626         uint            stop_on_cycle;
 627         int             error, log_bbnum = log->l_logBBsize;
 628
 629         /* Is the end of the log device zeroed? */
 630         error = xlog_find_zeroed(log, &first_blk);
 631         if (error < 0) {
 632                 xfs_warn(log->l_mp, "empty log check failed");
 633                 return error;
 634         }
 635         if (error == 1) {
 636                 *return_head_blk = first_blk;
 637
 638                 /* Is the whole lot zeroed? */
 639                 if (!first_blk) {
 640                         /* Linux XFS shouldn't generate totally zeroed logs -
 641                          * mkfs etc write a dummy unmount record to a fresh
 642                          * log so we can store the uuid in there
 643                          */
 644                         xfs_warn(log->l_mp, "totally zeroed log");
 645                 }
 646
 647                 return 0;
 648         }
 649
 650         first_blk = 0;                  /* get cycle # of 1st block */
 651         bp = xlog_get_bp(log, 1);
 652         if (!bp)
 653                 return -ENOMEM;
 654
 655         error = xlog_bread(log, 0, 1, bp, &offset);
 656         if (error)
 657                 goto bp_err;
 658
 659         first_half_cycle = xlog_get_cycle(offset);
 660
 661         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 662         error = xlog_bread(log, last_blk, 1, bp, &offset);
 663         if (error)
 664                 goto bp_err;
 665
 666         last_half_cycle = xlog_get_cycle(offset);
 667         ASSERT(last_half_cycle != 0);
 668
 669         /*
 670          * If the 1st half cycle number is equal to the last half cycle number,
 671          * then the entire log is stamped with the same cycle number.  In this
 672          * case, head_blk can't be set to zero (which makes sense).  The below
 673          * math doesn't work out properly with head_blk equal to zero.  Instead,
 674          * we set it to log_bbnum which is an invalid block number, but this
 675          * value makes the math correct.  If head_blk doesn't changed through
 676          * all the tests below, *head_blk is set to zero at the very end rather
 677          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 678          * in a circular file.
 679          */
 680         if (first_half_cycle == last_half_cycle) {
 681                 /*
 682                  * In this case we believe that the entire log should have
 683                  * cycle number last_half_cycle.  We need to scan backwards
 684                  * from the end verifying that there are no holes still
 685                  * containing last_half_cycle - 1.  If we find such a hole,
 686                  * then the start of that hole will be the new head.  The
 687                  * simple case looks like
 688                  *        x | x ... | x - 1 | x
 689                  * Another case that fits this picture would be
 690                  *        x | x + 1 | x ... | x
 691                  * In this case the head really is somewhere at the end of the
 692                  * log, as one of the latest writes at the beginning was
 693                  * incomplete.
 694                  * One more case is
 695                  *        x | x + 1 | x ... | x - 1 | x
 696                  * This is really the combination of the above two cases, and
 697                  * the head has to end up at the start of the x-1 hole at the
 698                  * end of the log.
 699                  *
 700                  * In the 256k log case, we will read from the beginning to the
 701                  * end of the log and search for cycle numbers equal to x-1.
 702                  * We don't worry about the x+1 blocks that we encounter,
 703                  * because we know that they cannot be the head since the log
 704                  * started with x.
 705                  */
 706                 head_blk = log_bbnum;
 707                 stop_on_cycle = last_half_cycle - 1;
 708         } else {
 709                 /*
 710                  * In this case we want to find the first block with cycle
 711                  * number matching last_half_cycle.  We expect the log to be
 712                  * some variation on
 713                  *        x + 1 ... | x ... | x
 714                  * The first block with cycle number x (last_half_cycle) will
 715                  * be where the new head belongs.  First we do a binary search
 716                  * for the first occurrence of last_half_cycle.  The binary
 717                  * search may not be totally accurate, so then we scan back
 718                  * from there looking for occurrences of last_half_cycle before
 719                  * us.  If that backwards scan wraps around the beginning of
 720                  * the log, then we look for occurrences of last_half_cycle - 1
 721                  * at the end of the log.  The cases we're looking for look
 722                  * like
 723                  *                               v binary search stopped here
 724                  *        x + 1 ... | x | x + 1 | x ... | x
 725                  *                   ^ but we want to locate this spot
 726                  * or
 727                  *        <---------> less than scan distance
 728                  *        x + 1 ... | x ... | x - 1 | x
 729                  *                           ^ we want to locate this spot
 730                  */
 731                 stop_on_cycle = last_half_cycle;
 732                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
 733                                                 &head_blk, last_half_cycle)))
 734                         goto bp_err;
 735         }
 736
 737         /*
 738          * Now validate the answer.  Scan back some number of maximum possible
 739          * blocks and make sure each one has the expected cycle number.  The
 740          * maximum is determined by the total possible amount of buffering
 741          * in the in-core log.  The following number can be made tighter if
 742          * we actually look at the block size of the filesystem.
 743          */
 744         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 745         if (head_blk >= num_scan_bblks) {
 746                 /*
 747                  * We are guaranteed that the entire check can be performed
 748                  * in one buffer.
 749                  */
 750                 start_blk = head_blk - num_scan_bblks;
 751                 if ((error = xlog_find_verify_cycle(log,
 752                                                 start_blk, num_scan_bblks,
 753                                                 stop_on_cycle, &new_blk)))
 754                         goto bp_err;
 755                 if (new_blk != -1)
 756                         head_blk = new_blk;
 757         } else {                /* need to read 2 parts of log */
 758                 /*
 759                  * We are going to scan backwards in the log in two parts.
 760                  * First we scan the physical end of the log.  In this part
 761                  * of the log, we are looking for blocks with cycle number
 762                  * last_half_cycle - 1.
 763                  * If we find one, then we know that the log starts there, as
 764                  * we've found a hole that didn't get written in going around
 765                  * the end of the physical log.  The simple case for this is
 766                  *        x + 1 ... | x ... | x - 1 | x
 767                  *        <---------> less than scan distance
 768                  * If all of the blocks at the end of the log have cycle number
 769                  * last_half_cycle, then we check the blocks at the start of
 770                  * the log looking for occurrences of last_half_cycle.  If we
 771                  * find one, then our current estimate for the location of the
 772                  * first occurrence of last_half_cycle is wrong and we move
 773                  * back to the hole we've found.  This case looks like
 774                  *        x + 1 ... | x | x + 1 | x ...
 775                  *                               ^ binary search stopped here
 776                  * Another case we need to handle that only occurs in 256k
 777                  * logs is
 778                  *        x + 1 ... | x ... | x+1 | x ...
 779                  *                   ^ binary search stops here
 780                  * In a 256k log, the scan at the end of the log will see the
 781                  * x + 1 blocks.  We need to skip past those since that is
 782                  * certainly not the head of the log.  By searching for
 783                  * last_half_cycle-1 we accomplish that.
 784                  */
 785                 ASSERT(head_blk <= INT_MAX &&
 786                         (xfs_daddr_t) num_scan_bblks >= head_blk);
 787                 start_blk = log_bbnum - (num_scan_bblks - head_blk);
 788                 if ((error = xlog_find_verify_cycle(log, start_blk,
 789                                         num_scan_bblks - (int)head_blk,
 790                                         (stop_on_cycle - 1), &new_blk)))
 791                         goto bp_err;
 792                 if (new_blk != -1) {
 793                         head_blk = new_blk;
 794                         goto validate_head;
 795                 }
 796
 797                 /*
 798                  * Scan beginning of log now.  The last part of the physical
 799                  * log is good.  This scan needs to verify that it doesn't find
 800                  * the last_half_cycle.
 801                  */
 802                 start_blk = 0;
 803                 ASSERT(head_blk <= INT_MAX);
 804                 if ((error = xlog_find_verify_cycle(log,
 805                                         start_blk, (int)head_blk,
 806                                         stop_on_cycle, &new_blk)))
 807                         goto bp_err;
 808                 if (new_blk != -1)
 809                         head_blk = new_blk;
 810         }
 811
 812 validate_head:
 813         /*
 814          * Now we need to make sure head_blk is not pointing to a block in
 815          * the middle of a log record.
 816          */
 817         num_scan_bblks = XLOG_REC_SHIFT(log);
 818         if (head_blk >= num_scan_bblks) {
 819                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 820
 821                 /* start ptr at last block ptr before head_blk */
 822                 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 823                 if (error == 1)
 824                         error = -EIO;
 825                 if (error)
 826                         goto bp_err;
 827         } else {
 828                 start_blk = 0;
 829                 ASSERT(head_blk <= INT_MAX);
 830                 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 831                 if (error < 0)
 832                         goto bp_err;
 833                 if (error == 1) {
 834                         /* We hit the beginning of the log during our search */
 835                         start_blk = log_bbnum - (num_scan_bblks - head_blk);
 836                         new_blk = log_bbnum;
 837                         ASSERT(start_blk <= INT_MAX &&
 838                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
 839                         ASSERT(head_blk <= INT_MAX);
 840                         error = xlog_find_verify_log_record(log, start_blk,
 841                                                         &new_blk, (int)head_blk);
 842                         if (error == 1)
 843                                 error = -EIO;
 844                         if (error)
 845                                 goto bp_err;
 846                         if (new_blk != log_bbnum)
 847                                 head_blk = new_blk;
 848                 } else if (error)
 849                         goto bp_err;
 850         }
 851
 852         xlog_put_bp(bp);
 853         if (head_blk == log_bbnum)
 854                 *return_head_blk = 0;
 855         else
 856                 *return_head_blk = head_blk;
 857         /*
 858          * When returning here, we have a good block number.  Bad block
 859          * means that during a previous crash, we didn't have a clean break
 860          * from cycle number N to cycle number N-1.  In this case, we need
 861          * to find the first block with cycle number N-1.
 862          */
 863         return 0;
 864
 865  bp_err:
 866         xlog_put_bp(bp);
 867
 868         if (error)
 869                 xfs_warn(log->l_mp, "failed to find log head");
 870         return error;
 871 }
 872
 873 /*
 874  * Seek backwards in the log for log record headers.
 875  *
 876  * Given a starting log block, walk backwards until we find the provided number
 877  * of records or hit the provided tail block. The return value is the number of
 878  * records encountered or a negative error code. The log block and buffer
 879  * pointer of the last record seen are returned in rblk and rhead respectively.
      *
      * The scan starts at head_blk - 1, so the head block itself is not read by
      * the first pass. If the tail is behind the head (head_blk > tail_blk) the
      * scan stops at tail_blk inclusive. Otherwise it first runs down to block 0
      * and, if count headers have not been found yet, continues from the last
      * physical block (l_logBBsize - 1) down to tail_blk inclusive; a header found
      * in that second pass sets *wrapped to true.
      *
      * log:      log to read from
      * head_blk: block to start searching in front of
      * tail_blk: last block to examine; pass head_blk to search the whole log
      * count:    number of record headers to look for
      * bp:       buffer used for the single-block reads
      * rblk:     out: block number of the last header found
      * rhead:    out: pointer to the last header found, as handed back by
      *           xlog_bread() for a read through bp
      * wrapped:  out: true if a header was found in the second (wrapped) pass
      *
      * *rblk and *rhead are only written when a header is found, so they are left
      * untouched when the return value is 0. Fewer than count headers are returned
      * if the scan range is exhausted first.
 880  */
 881 STATIC int
 882 xlog_rseek_logrec_hdr(
 883         struct xlog             *log,
 884         xfs_daddr_t             head_blk,
 885         xfs_daddr_t             tail_blk,
 886         int                     count,
 887         struct xfs_buf          *bp,
 888         xfs_daddr_t             *rblk,
 889         struct xlog_rec_header  **rhead,
 890         bool                    *wrapped)
 891 {
 892         int                     i;
 893         int                     error;
 894         int                     found = 0;
 895         char                    *offset = NULL;
 896         xfs_daddr_t             end_blk;
 897
 898         *wrapped = false;
 899
 900         /*
 901          * Walk backwards from the head block until we hit the tail or the first
 902          * block in the log.
 903          */
 904         end_blk = head_blk > tail_blk ? tail_blk : 0;
 905         for (i = (int) head_blk - 1; i >= end_blk; i--) {
 906                 error = xlog_bread(log, i, 1, bp, &offset);
 907                 if (error)
 908                         goto out_error;
 909
                     /* a block that starts with the record magic is a record header */
 910                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 911                         *rblk = i;
 912                         *rhead = (struct xlog_rec_header *) offset;
 913                         if (++found == count)
 914                                 break;
 915                 }
 916         }
 917
 918         /*
 919          * If we haven't hit the tail block or the log record header count,
 920          * start looking again from the end of the physical log. Note that
 921          * callers can pass head == tail if the tail is not yet known.
 922          */
 923         if (tail_blk >= head_blk && found != count) {
 924                 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
 925                         error = xlog_bread(log, i, 1, bp, &offset);
 926                         if (error)
 927                                 goto out_error;
 928
 929                         if (*(__be32 *)offset ==
 930                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
                                     /* only headers found in this pass mark a wrap */
 931                                 *wrapped = true;
 932                                 *rblk = i;
 933                                 *rhead = (struct xlog_rec_header *) offset;
 934                                 if (++found == count)
 935                                         break;
 936                         }
 937                 }
 938         }
 939
             /* may be fewer than count, including 0 */
 940         return found;
 941
 942 out_error:
             /* a block read failed; pass the error from xlog_bread() back */
 943         return error;
 944 }
 945
 946 /*
 947  * Seek forward in the log for log record headers.
 948  *
 949  * Given head and tail blocks, walk forward from the tail block until we find
 950  * the provided number of records or hit the head block. The return value is the
 951  * number of records encountered or a negative error code. The log block and
 952  * buffer pointer of the last record seen are returned in rblk and rhead
 953  * respectively.
      *
      * If the head is in front of the tail (head_blk > tail_blk) the scan covers
      * tail_blk through head_blk inclusive. Otherwise it covers tail_blk through
      * the last physical block (l_logBBsize - 1) and, if count headers have not
      * been found yet, continues from block 0 up to but not including head_blk; a
      * header found in that second pass sets *wrapped to true.
      *
      * log:      log to read from
      * head_blk: block at which the forward search ends
      * tail_blk: block at which the forward search starts
      * count:    number of record headers to look for
      * bp:       buffer used for the single-block reads
      * rblk:     out: block number of the last header found
      * rhead:    out: pointer to the last header found, as handed back by
      *           xlog_bread() for a read through bp
      * wrapped:  out: true if a header was found in the second (wrapped) pass
      *
      * *rblk and *rhead are only written when a header is found, so they are left
      * untouched when the return value is 0.
      *
      * NOTE(review): the first pass reads head_blk itself when head_blk > tail_blk,
      * whereas the wrapped pass stops one block short of it. Confirm that the
      * inclusive bound in the first pass is intended.
 954  */
 955 STATIC int
 956 xlog_seek_logrec_hdr(
 957         struct xlog             *log,
 958         xfs_daddr_t             head_blk,
 959         xfs_daddr_t             tail_blk,
 960         int                     count,
 961         struct xfs_buf          *bp,
 962         xfs_daddr_t             *rblk,
 963         struct xlog_rec_header  **rhead,
 964         bool                    *wrapped)
 965 {
 966         int                     i;
 967         int                     error;
 968         int                     found = 0;
 969         char                    *offset = NULL;
 970         xfs_daddr_t             end_blk;
 971
 972         *wrapped = false;
 973
 974         /*
 975          * Walk forward from the tail block until we hit the head or the last
 976          * block in the log.
 977          */
 978         end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 979         for (i = (int) tail_blk; i <= end_blk; i++) {
 980                 error = xlog_bread(log, i, 1, bp, &offset);
 981                 if (error)
 982                         goto out_error;
 983
                     /* a block that starts with the record magic is a record header */
 984                 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 985                         *rblk = i;
 986                         *rhead = (struct xlog_rec_header *) offset;
 987                         if (++found == count)
 988                                 break;
 989                 }
 990         }
 991
 992         /*
 993          * If we haven't hit the head block or the log record header count,
 994          * start looking again from the start of the physical log.
 995          */
 996         if (tail_blk > head_blk && found != count) {
 997                 for (i = 0; i < (int) head_blk; i++) {
 998                         error = xlog_bread(log, i, 1, bp, &offset);
 999                         if (error)
1000                                 goto out_error;
1001
1002                         if (*(__be32 *)offset ==
1003                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
                                     /* only headers found in this pass mark a wrap */
1004                                 *wrapped = true;
1005                                 *rblk = i;
1006                                 *rhead = (struct xlog_rec_header *) offset;
1007                                 if (++found == count)
1008                                         break;
1009                         }
1010                 }
1011         }
1012
             /* may be fewer than count, including 0 */
1013         return found;
1014
1015 out_error:
             /* a block read failed; pass the error from xlog_bread() back */
1016         return error;
1017 }
1018
1019 /*
1020  * Check the log tail for torn writes. This is required when torn writes are
1021  * detected at the head and the head had to be walked back to a previous record.
1022  * The tail of the previous record must now be verified to ensure the torn
1023  * writes didn't corrupt the previous tail.
1024  *
1025  * Return an error if CRC verification fails as recovery cannot proceed.
      *
      * log:      log being recovered
      * head_blk: head block of the log
      * tail_blk: tail block to start verifying from
      *
      * Returns -ENOMEM if the scratch buffer cannot be allocated, the error from
      * the forward header search if it fails, and otherwise whatever the CRC pass
      * over [tail_blk, temporary head) returns.
1026  */
1027 STATIC int
1028 xlog_verify_tail(
1029         struct xlog             *log,
1030         xfs_daddr_t             head_blk,
1031         xfs_daddr_t             tail_blk)
1032 {
             /*
              * thead, first_bad and wrapped only exist to satisfy the callees'
              * out-parameters; their values are not consumed in this function.
              */
1033         struct xlog_rec_header  *thead;
1034         struct xfs_buf          *bp;
1035         xfs_daddr_t             first_bad;
1036         int                     count;
1037         int                     error = 0;
1038         bool                    wrapped;
1039         xfs_daddr_t             tmp_head;
1040
1041         bp = xlog_get_bp(log, 1);
1042         if (!bp)
1043                 return -ENOMEM;
1044
1045         /*
1046          * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
1047          * a temporary head block that points after the last possible
1048          * concurrently written record of the tail.
1049          */
1050         count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
1051                                      XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
1052                                      &wrapped);
             /* a negative count is an error code from the search, not a count */
1053         if (count < 0) {
1054                 error = count;
1055                 goto out;
1056         }
1057
1058         /*
1059          * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
1060          * into the actual log head. tmp_head points to the start of the record
1061          * so update it to the actual head block.
              *
              * This also covers a count of 0, where the search never wrote
              * tmp_head at all.
1062          */
1063         if (count < XLOG_MAX_ICLOGS + 1)
1064                 tmp_head = head_blk;
1065
1066         /*
1067          * We now have a tail and temporary head block that covers at least
1068          * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
1069          * records were completely written. Run a CRC verification pass from
1070          * tail to head and return the result.
1071          */
1072         error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
1073                                       XLOG_RECOVER_CRCPASS, &first_bad);
1074
1075 out:
             /* the scratch buffer is released on every path past its allocation */
1076         xlog_put_bp(bp);
1077         return error;
1078 }
1079
1080 /*
1081  * Detect and trim torn writes from the head of the log.
1082  *
1083  * Storage without sector atomicity guarantees can result in torn writes in the
1084  * log in the event of a crash. Our only means to detect this scenario is via
1085  * CRC verification. While we can't always be certain that CRC verification
1086  * failure is due to a torn write vs. an unrelated corruption, we do know that
1087  * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1088  * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1089  * the log and treat failures in this range as torn writes as a matter of
1090  * policy. In the event of CRC failure, the head is walked back to the last good
1091  * record in the log and the tail is updated from that record and verified.
      *
      * bp is the caller's buffer. It is used to read the record header that
      * *rhead points to on return; a separate temporary buffer is used for the
      * torn write search so that this header is not overwritten.
      *
      * Returns 0 on success (including after a torn write has been trimmed and
      * the tail verified), -EIO if no record header can be found, -ENOMEM if the
      * temporary buffer cannot be allocated, or the error returned by the header
      * search, the CRC pass or xlog_verify_tail(). A CRC pass error other than
      * -EFSBADCRC is returned unchanged.
1092  */
1093 STATIC int
1094 xlog_verify_head(
1095         struct xlog             *log,
1096         xfs_daddr_t             *head_blk,      /* in/out: unverified head */
1097         xfs_daddr_t             *tail_blk,      /* out: tail block */
1098         struct xfs_buf          *bp,
1099         xfs_daddr_t             *rhead_blk,     /* start blk of last record */
1100         struct xlog_rec_header  **rhead,        /* ptr to last record */
1101         bool                    *wrapped)       /* last rec. wraps phys. log */
1102 {
1103         struct xlog_rec_header  *tmp_rhead;
1104         struct xfs_buf          *tmp_bp;
1105         xfs_daddr_t             first_bad;
1106         xfs_daddr_t             tmp_rhead_blk;
1107         int                     found;
1108         int                     error;
1109         bool                    tmp_wrapped;
1110
1111         /*
1112          * Search backwards through the log looking for the log record header
1113          * block. This wraps all the way back around to the head so something is
1114          * seriously wrong if we can't find it.
1115          */
1116         found = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, rhead_blk,
1117                                       rhead, wrapped);
1118         if (found < 0)
1119                 return found;
1120         if (!found) {
1121                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1122                 return -EIO;
1123         }
1124
             /* take the tail from the most recent record header found above */
1125         *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1126
1127         /*
1128          * Now that we have a tail block, check the head of the log for torn
1129          * writes. Search again until we hit the tail or the maximum number of
1130          * log record I/Os that could have been in flight at one time. Use a
1131          * temporary buffer so we don't trash the rhead/bp pointer from the
1132          * call above.
1133          */
1134         tmp_bp = xlog_get_bp(log, 1);
1135         if (!tmp_bp)
1136                 return -ENOMEM;
1137         error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1138                                       XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1139                                       &tmp_rhead, &tmp_wrapped);
             /* only the block number is needed, so drop the buffer before any return */
1140         xlog_put_bp(tmp_bp);
1141         if (error < 0)
1142                 return error;
1143
1144         /*
1145          * Now run a CRC verification pass over the records starting at the
1146          * block found above to the current head. If a CRC failure occurs, the
1147          * log block of the first bad record is saved in first_bad.
              *
              * NOTE(review): the search above stores its header count in error. If
              * that count is 0, tmp_rhead_blk has not been written and is passed
              * here uninitialised. Confirm that a record is always found between
              * the tail and the head, or handle the zero case explicitly.
1148          */
1149         error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1150                                       XLOG_RECOVER_CRCPASS, &first_bad);
1151         if (error == -EFSBADCRC) {
1152                 /*
1153                  * We've hit a potential torn write. Reset the error and warn
1154                  * about it.
1155                  */
1156                 error = 0;
1157                 xfs_warn(log->l_mp,
1158 "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1159                          first_bad, *head_blk);
1160
1161                 /*
1162                  * Get the header block and buffer pointer for the last good
1163                  * record before the bad record.
1164                  *
1165                  * Note that xlog_find_tail() clears the blocks at the new head
1166                  * (i.e., the records with invalid CRC) if the cycle number
1167                  * matches the current cycle.
1168                  */
1169                 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1170                                               rhead_blk, rhead, wrapped);
1171                 if (found < 0)
1172                         return found;
1173                 if (found == 0)         /* XXX: right thing to do here? */
1174                         return -EIO;
1175
1176                 /*
1177                  * Reset the head block to the starting block of the first bad
1178                  * log record and set the tail block based on the last good
1179                  * record.
1180                  *
1181                  * Bail out if the updated head/tail match as this indicates
1182                  * possible corruption outside of the acceptable
1183                  * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
                      *
                      * NOTE(review): the bail-out returns 0, so apart from the
                      * ASSERT the caller sees success with head == tail. Confirm
                      * this is the intended result.
1184                  */
1185                 *head_blk = first_bad;
1186                 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1187                 if (*head_blk == *tail_blk) {
1188                         ASSERT(0);
1189                         return 0;
1190                 }
1191
1192                 /*
1193                  * Now verify the tail based on the updated head. This is
1194                  * required because the torn writes trimmed from the head could
1195                  * have been written over the tail of a previous record. Return
1196                  * any errors since recovery cannot proceed if the tail is
1197                  * corrupt.
1198                  *
1199                  * XXX: This leaves a gap in truly robust protection from torn
1200                  * writes in the log. If the head is behind the tail, the tail
1201                  * pushes forward to create some space and then a crash occurs
1202                  * causing the writes into the previous record's tail region to
1203                  * tear, log recovery isn't able to recover.
1204                  *
1205                  * How likely is this to occur? If possible, can we do something
1206                  * more intelligent here? Is it safe to push the tail forward if
1207                  * we can determine that the tail is within the range of the
1208                  * torn write (e.g., the kernel can only overwrite the tail if
1209                  * it has actually been pushed forward)? Alternatively, could we
1210                  * somehow prevent this condition at runtime?
1211                  */
1212                 error = xlog_verify_tail(log, *head_blk, *tail_blk);
1213         }
1214
1215         return error;
1216 }
1217
1218 /*
1219  * Find the sync block number or the tail of the log.
1220  *
1221  * This will be the block number of the last record to have its
1222  * associated buffers synced to disk.  Every log record header has
1223  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
1224  * to get a sync block number.  The only concern is to figure out which
1225  * log record header to believe.
1226  *
1227  * The following algorithm uses the log record header with the largest
1228  * lsn.  The entire log record does not need to be valid.  We only care
1229  * that the header is valid.
1230  *
1231  * We could speed up search by using current head_blk buffer, but it is not
1232  * available.
      *
      * log:      log being recovered
      * head_blk: out: head block, as found by xlog_find_head() and possibly
      *           pulled back by xlog_verify_head()
      * tail_blk: out: tail block
      *
      * On the normal path the in-core log state (l_prev_block, l_curr_block,
      * l_curr_cycle, l_tail_lsn, l_last_sync_lsn and both grant heads) is set
      * from the last good record header. If that record is an unmount record,
      * the tail is moved past it and XFS_MOUNT_WAS_CLEAN is set in the mount
      * flags.
      *
      * Returns 0 on success, -ENOMEM if the read buffer cannot be allocated, or
      * the error from whichever helper failed.
1233  */
1234 STATIC int
1235 xlog_find_tail(
1236         struct xlog             *log,
1237         xfs_daddr_t             *head_blk,
1238         xfs_daddr_t             *tail_blk)
1239 {
1240         xlog_rec_header_t       *rhead;
1241         xlog_op_header_t        *op_head;
1242         char                    *offset = NULL;
1243         xfs_buf_t               *bp;
1244         int                     error;
1245         xfs_daddr_t             umount_data_blk;
1246         xfs_daddr_t             after_umount_blk;
1247         xfs_daddr_t             rhead_blk;
1248         xfs_lsn_t               tail_lsn;
1249         int                     hblks;
1250         bool                    wrapped = false;
1251
1252         /*
1253          * Find previous log record
1254          */
1255         if ((error = xlog_find_head(log, head_blk)))
1256                 return error;
1257
1258         bp = xlog_get_bp(log, 1);
1259         if (!bp)
1260                 return -ENOMEM;
1261         if (*head_blk == 0) {                           /* special case */
1262                 error = xlog_bread(log, 0, 1, bp, &offset);
1263                 if (error)
1264                         goto done;
1265
                     /*
                      * Block 0 still carrying cycle 0 with the head at block 0:
                      * report a tail of 0 and return. error is 0 here because
                      * the read above succeeded.
                      */
1266                 if (xlog_get_cycle(offset) == 0) {
1267                         *tail_blk = 0;
1268                         /* leave all other log inited values alone */
1269                         goto done;
1270                 }
1271         }
1272
1273         /*
1274          * Trim the head block back to skip over torn records. We can have
1275          * multiple log I/Os in flight at any time, so we assume CRC failures
1276          * back through the previous several records are torn writes and skip
1277          * them.
1278          */
1279         ASSERT(*head_blk < INT_MAX);
1280         error = xlog_verify_head(log, head_blk, tail_blk, bp, &rhead_blk,
1281                                  &rhead, &wrapped);
1282         if (error)
1283                 goto done;
1284
1285         /*
1286          * Reset log values according to the state of the log when we
1287          * crashed.  In the case where head_blk == 0, we bump curr_cycle
1288          * one because the next write starts a new cycle rather than
1289          * continuing the cycle of the last good log record.  At this
1290          * point we have guaranteed that all partial log records have been
1291          * accounted for.  Therefore, we know that the last good log record
1292          * written was complete and ended exactly on the end boundary
1293          * of the physical log.
              *
              * The bump itself is keyed on wrapped, which xlog_verify_head()
              * reports when the last record header was only found after its
              * backwards search wrapped to the physical end of the log.
1294          */
1295         log->l_prev_block = rhead_blk;
1296         log->l_curr_block = (int)*head_blk;
1297         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1298         if (wrapped)
1299                 log->l_curr_cycle++;
1300         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1301         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1302         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1303                                         BBTOB(log->l_curr_block));
1304         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1305                                         BBTOB(log->l_curr_block));
1306
1307         /*
1308          * Look for unmount record.  If we find it, then we know there
1309          * was a clean unmount.  Since 'i' could be the last block in
1310          * the physical log, we convert to a log block before comparing
1311          * to the head_blk.
1312          *
1313          * Save the current tail lsn to use to pass to
1314          * xlog_clear_stale_blocks() below.  We won't want to clear the
1315          * unmount record if there is one, so we pass the lsn of the
1316          * unmount record rather than the block after it.
              *
              * NOTE(review): there is no variable 'i' in this function; the
              * text above presumably refers to rhead_blk, the block of the
              * last record header.
1317          */
             /*
              * hblks is the number of blocks taken by the record header: for a
              * v2 header larger than XLOG_HEADER_CYCLE_SIZE it is h_size divided
              * by XLOG_HEADER_CYCLE_SIZE, rounded up; otherwise it is 1.
              */
1318         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1319                 int     h_size = be32_to_cpu(rhead->h_size);
1320                 int     h_version = be32_to_cpu(rhead->h_version);
1321
1322                 if ((h_version & XLOG_VERSION_2) &&
1323                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1324                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1325                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
1326                                 hblks++;
1327                 } else {
1328                         hblks = 1;
1329                 }
1330         } else {
1331                 hblks = 1;
1332         }
             /*
              * First block past the last record (header blocks plus BTOBB(h_len)
              * data blocks), wrapped to the physical size of the log.
              */
1333         after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
1334         after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
             /* sampled before the unmount handling below can move l_tail_lsn */
1335         tail_lsn = atomic64_read(&log->l_tail_lsn);
             /*
              * Only a record that ends exactly at the head and holds a single
              * log operation is examined further: read its first data block and
              * test the op header flags.
              */
1336         if (*head_blk == after_umount_blk &&
1337             be32_to_cpu(rhead->h_num_logops) == 1) {
1338                 umount_data_blk = rhead_blk + hblks;
1339                 umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
1340                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1341                 if (error)
1342                         goto done;
1343
1344                 op_head = (xlog_op_header_t *)offset;
1345                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1346                         /*
1347                          * Set tail and last sync so that newly written
1348                          * log records will point recovery to after the
1349                          * current unmount record.
1350                          */
1351                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
1352                                         log->l_curr_cycle, after_umount_blk);
1353                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1354                                         log->l_curr_cycle, after_umount_blk);
1355                         *tail_blk = after_umount_blk;
1356
1357                         /*
1358                          * Note that the unmount was clean. If the unmount
1359                          * was not clean, we need to know this to rebuild the
1360                          * superblock counters from the perag headers if we
1361                          * have a filesystem using non-persistent counters.
1362                          */
1363                         log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1364                 }
1365         }
1366
1367         /*
1368          * Make sure that there are no blocks in front of the head
1369          * with the same cycle number as the head.  This can happen
1370          * because we allow multiple outstanding log writes concurrently,
1371          * and the later writes might make it out before earlier ones.
1372          *
1373          * We use the lsn from before modifying it so that we'll never
1374          * overwrite the unmount record after a clean unmount.
1375          *
1376          * Do this only if we are going to recover the filesystem
1377          *
1378          * NOTE: This used to say "if (!readonly)"
1379          * However on Linux, we can & do recover a read-only filesystem.
1380          * We only skip recovery if NORECOVERY is specified on mount,
1381          * in which case we would not be here.
1382          *
1383          * But... if the -device- itself is readonly, just skip this.
1384          * We can't recover this device anyway, so it won't matter.
1385          */
1386         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1387                 error = xlog_clear_stale_blocks(log, tail_lsn);
1388
1389 done:
             /* common exit for success and failure once bp has been allocated */
1390         xlog_put_bp(bp);
1391
1392         if (error)
1393                 xfs_warn(log->l_mp, "failed to locate log tail");
1394         return error;
1395 }
1396
1397 /*
1398  * Is the log zeroed at all?
1399  *
1400  * The last binary search should be changed to perform an X block read
1401  * once X becomes small enough.  You can then search linearly through
1402  * the X blocks.  This will cut down on the number of reads we need to do.
1403  *
1404  * If the log is partially zeroed, this routine will pass back the blkno
1405  * of the first block with cycle number 0.  It won't have a complete LR
1406  * preceding it.
1407  *
      * log:    log to examine
      * blk_no: out: set to 0 on entry; on a return of 1 it holds the block to use
      *         as the first block of the log (0 for a completely zeroed log)
      *
1408  * Return:
1409  *      0  => the log is completely written to
1410  *      1 => use *blk_no as the first block of the log
1411  *      <0 => error has occurred
      *
      * The errors generated here are -ENOMEM (no read buffer), -EINVAL (the last
      * block has cycle 0 but the first block's cycle is not 1) and -EIO (the
      * record header search returned 1); errors from the helpers are passed
      * through unchanged.
1412  */
1413 STATIC int
1414 xlog_find_zeroed(
1415         struct xlog     *log,
1416         xfs_daddr_t     *blk_no)
1417 {
1418         xfs_buf_t       *bp;
1419         char            *offset;
1420         uint            first_cycle, last_cycle;
1421         xfs_daddr_t     new_blk, last_blk, start_blk;
1422         xfs_daddr_t     num_scan_bblks;
1423         int             error, log_bbnum = log->l_logBBsize;
1424
1425         *blk_no = 0;
1426
1427         /* check totally zeroed log */
1428         bp = xlog_get_bp(log, 1);
1429         if (!bp)
1430                 return -ENOMEM;
1431         error = xlog_bread(log, 0, 1, bp, &offset);
1432         if (error)
1433                 goto bp_err;
1434
1435         first_cycle = xlog_get_cycle(offset);
1436         if (first_cycle == 0) {         /* completely zeroed log */
1437                 *blk_no = 0;
1438                 xlog_put_bp(bp);
1439                 return 1;
1440         }
1441
1442         /* check partially zeroed log */
1443         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1444         if (error)
1445                 goto bp_err;
1446
1447         last_cycle = xlog_get_cycle(offset);
1448         if (last_cycle != 0) {          /* log completely written to */
1449                 xlog_put_bp(bp);
1450                 return 0;
1451         } else if (first_cycle != 1) {
1452                 /*
1453                  * If the cycle of the last block is zero, the cycle of
1454                  * the first block must be 1. If it's not, maybe we're
1455                  * not looking at a log... Bail out.
1456                  */
1457                 xfs_warn(log->l_mp,
1458                         "Log inconsistent or not a log (last==0, first!=1)");
1459                 error = -EINVAL;
1460                 goto bp_err;
1461         }
1462
1463         /* we have a partially zeroed log */
             /*
              * Search between block 0 and the last block for where cycle 0
              * starts; the result comes back in last_blk.
              * NOTE(review): the exact search semantics are those of
              * xlog_find_cycle_start(), which is not shown here.
              */
1464         last_blk = log_bbnum-1;
1465         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1466                 goto bp_err;
1467
1468         /*
1469          * Validate the answer.  Because there is no way to guarantee that
1470          * the entire log is made up of log records which are the same size,
1471          * we scan over the defined maximum blocks.  At this point, the maximum
1472          * is not chosen to mean anything special.   XXXmiken
1473          */
1474         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1475         ASSERT(num_scan_bblks <= INT_MAX);
1476
             /* clamp the scan so that start_blk cannot go below block 0 */
1477         if (last_blk < num_scan_bblks)
1478                 num_scan_bblks = last_blk;
1479         start_blk = last_blk - num_scan_bblks;
1480
1481         /*
1482          * We search for any instances of cycle number 0 that occur before
1483          * our current estimate of the head.  What we're trying to detect is
1484          *        1 ... | 0 | 1 | 0...
1485          *                       ^ binary search ends here
1486          */
1487         if ((error = xlog_find_verify_cycle(log, start_blk,
1488                                          (int)num_scan_bblks, 0, &new_blk)))
1489                 goto bp_err;
             /* -1 means no earlier cycle 0 block was found; keep the estimate */
1490         if (new_blk != -1)
1491                 last_blk = new_blk;
1492
1493         /*
1494          * Potentially backup over partial log record write.  We don't need
1495          * to search the end of the log because we know it is zero.
              *
              * A return of 1 from the record search is converted to -EIO here.
1496          */
1497         error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1498         if (error == 1)
1499                 error = -EIO;
1500         if (error)
1501                 goto bp_err;
1502
1503         *blk_no = last_blk;
             /*
              * Despite its name this label is also the success exit: the buffer
              * is released, then the error is returned if there is one and 1
              * (partially zeroed log, *blk_no set above) otherwise.
              */
1504 bp_err:
1505         xlog_put_bp(bp);
1506         if (error)
1507                 return error;
1508         return 1;
1509 }
1510
1511 /*
1512  * These are simple subroutines used by xlog_clear_stale_blocks() below
1513  * to initialize a buffer full of empty log record headers and write
1514  * them into the log.
1515  */
1516 STATIC void
1517 xlog_add_record(
1518         struct xlog             *log,
1519         char                    *buf,
1520         int                     cycle,
1521         int                     block,
1522         int                     tail_cycle,
1523         int                     tail_block)
1524 {
1525         xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1526
1527         memset(buf, 0, BBSIZE);
1528         recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1529         recp->h_cycle = cpu_to_be32(cycle);
1530         recp->h_version = cpu_to_be32(
1531                         xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1532         recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1533         recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1534         recp->h_fmt = cpu_to_be32(XLOG_FMT);
1535         memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1536 }
1537
1538 STATIC int
1539 xlog_write_log_records(
1540         struct xlog     *log,
1541         int             cycle,
1542         int             start_block,
1543         int             blocks,
1544         int             tail_cycle,
1545         int             tail_block)
1546 {
1547         char            *offset;
1548         xfs_buf_t       *bp;
1549         int             balign, ealign;
1550         int             sectbb = log->l_sectBBsize;
1551         int             end_block = start_block + blocks;
1552         int             bufblks;
1553         int             error = 0;
1554         int             i, j = 0;
1555
1556         /*
1557          * Greedily allocate a buffer big enough to handle the full
1558          * range of basic blocks to be written.  If that fails, try
1559          * a smaller size.  We need to be able to write at least a
1560          * log sector, or we're out of luck.
1561          */
1562         bufblks = 1 << ffs(blocks);
1563         while (bufblks > log->l_logBBsize)
1564                 bufblks >>= 1;
1565         while (!(bp = xlog_get_bp(log, bufblks))) {
1566                 bufblks >>= 1;
1567                 if (bufblks < sectbb)
1568                         return -ENOMEM;
1569         }
1570
1571         /* We may need to do a read at the start to fill in part of
1572          * the buffer in the starting sector not covered by the first
1573          * write below.
1574          */
1575         balign = round_down(start_block, sectbb);
1576         if (balign != start_block) {
1577                 error = xlog_bread_noalign(log, start_block, 1, bp);
1578                 if (error)
1579                         goto out_put_bp;
1580
1581                 j = start_block - balign;
1582         }
1583
1584         for (i = start_block; i < end_block; i += bufblks) {
1585                 int             bcount, endcount;
1586
1587                 bcount = min(bufblks, end_block - start_block);
1588                 endcount = bcount - j;
1589
1590                 /* We may need to do a read at the end to fill in part of
1591                  * the buffer in the final sector not covered by the write.
1592                  * If this is the same sector as the above read, skip it.
1593                  */
1594                 ealign = round_down(end_block, sectbb);
1595                 if (j == 0 && (start_block + endcount > ealign)) {
1596                         offset = bp->b_addr + BBTOB(ealign - start_block);
1597                         error = xlog_bread_offset(log, ealign, sectbb,
1598                                                         bp, offset);
1599                         if (error)
1600                                 break;
1601
1602                 }
1603
1604                 offset = xlog_align(log, start_block, endcount, bp);
1605                 for (; j < endcount; j++) {
1606                         xlog_add_record(log, offset, cycle, i+j,
1607                                         tail_cycle, tail_block);
1608                         offset += BBSIZE;
1609                 }
1610                 error = xlog_bwrite(log, start_block, endcount, bp);
1611                 if (error)
1612                         break;
1613                 start_block += endcount;
1614                 j = 0;
1615         }
1616
1617  out_put_bp:
1618         xlog_put_bp(bp);
1619         return error;
1620 }
1621
1622 /*
1623  * This routine is called to blow away any incomplete log writes out
1624  * in front of the log head.  We do this so that we won't become confused
1625  * if we come up, write only a little bit more, and then crash again.
1626  * If we leave the partial log records out there, this situation could
1627  * cause us to think those partial writes are valid blocks since they
1628  * have the current cycle number.  We get rid of them by overwriting them
1629  * with empty log records with the old cycle number rather than the
1630  * current one.
1631  *
1632  * The tail lsn is passed in rather than taken from
1633  * the log so that we will not write over the unmount record after a
1634  * clean unmount in a 512 block log.  Doing so would leave the log without
1635  * any valid log records in it until a new one was written.  If we crashed
1636  * during that time we would not be able to recover.
1637  */
1638 STATIC int
1639 xlog_clear_stale_blocks(
1640         struct xlog     *log,
1641         xfs_lsn_t       tail_lsn)
1642 {
1643         int             tail_cycle, head_cycle;
1644         int             tail_block, head_block;
1645         int             tail_distance, max_distance;
1646         int             distance;
1647         int             error;
1648
1649         tail_cycle = CYCLE_LSN(tail_lsn);
1650         tail_block = BLOCK_LSN(tail_lsn);
1651         head_cycle = log->l_curr_cycle;
1652         head_block = log->l_curr_block;
1653
1654         /*
1655          * Figure out the distance between the new head of the log
1656          * and the tail.  We want to write over any blocks beyond the
1657          * head that we may have written just before the crash, but
1658          * we don't want to overwrite the tail of the log.
1659          */
1660         if (head_cycle == tail_cycle) {
1661                 /*
1662                  * The tail is behind the head in the physical log,
1663                  * so the distance from the head to the tail is the
1664                  * distance from the head to the end of the log plus
1665                  * the distance from the beginning of the log to the
1666                  * tail.
1667                  */
1668                 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1669                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1670                                          XFS_ERRLEVEL_LOW, log->l_mp);
1671                         return -EFSCORRUPTED;
1672                 }
1673                 tail_distance = tail_block + (log->l_logBBsize - head_block);
1674         } else {
1675                 /*
1676                  * The head is behind the tail in the physical log,
1677                  * so the distance from the head to the tail is just
1678                  * the tail block minus the head block.
1679                  */
1680                 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1681                         XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1682                                          XFS_ERRLEVEL_LOW, log->l_mp);
1683                         return -EFSCORRUPTED;
1684                 }
1685                 tail_distance = tail_block - head_block;
1686         }
1687
1688         /*
1689          * If the head is right up against the tail, we can't clear
1690          * anything.
1691          */
1692         if (tail_distance <= 0) {
1693                 ASSERT(tail_distance == 0);
1694                 return 0;
1695         }
1696
1697         max_distance = XLOG_TOTAL_REC_SHIFT(log);
1698         /*
1699          * Take the smaller of the maximum amount of outstanding I/O
1700          * we could have and the distance to the tail to clear out.
1701          * We take the smaller so that we don't overwrite the tail and
1702          * we don't waste all day writing from the head to the tail
1703          * for no reason.
1704          */
1705         max_distance = MIN(max_distance, tail_distance);
1706
1707         if ((head_block + max_distance) <= log->l_logBBsize) {
1708                 /*
1709                  * We can stomp all the blocks we need to without
1710                  * wrapping around the end of the log.  Just do it
1711                  * in a single write.  Use the cycle number of the
1712                  * current cycle minus one so that the log will look like:
1713                  *     n ... | n - 1 ...
1714                  */
1715                 error = xlog_write_log_records(log, (head_cycle - 1),
1716                                 head_block, max_distance, tail_cycle,
1717                                 tail_block);
1718                 if (error)
1719                         return error;
1720         } else {
1721                 /*
1722                  * We need to wrap around the end of the physical log in
1723                  * order to clear all the blocks.  Do it in two separate
1724                  * I/Os.  The first write should be from the head to the
1725                  * end of the physical log, and it should use the current
1726                  * cycle number minus one just like above.
1727                  */
1728                 distance = log->l_logBBsize - head_block;
1729                 error = xlog_write_log_records(log, (head_cycle - 1),
1730                                 head_block, distance, tail_cycle,
1731                                 tail_block);
1732
1733                 if (error)
1734                         return error;
1735
1736                 /*
1737                  * Now write the blocks at the start of the physical log.
1738                  * This writes the remainder of the blocks we want to clear.
1739                  * It uses the current cycle number since we're now on the
1740                  * same cycle as the head so that we get:
1741                  *    n ... n ... | n - 1 ...
1742                  *    ^^^^^ blocks we're writing
1743                  */
1744                 distance = max_distance - (log->l_logBBsize - head_block);
1745                 error = xlog_write_log_records(log, head_cycle, 0, distance,
1746                                 tail_cycle, tail_block);
1747                 if (error)
1748                         return error;
1749         }
1750
1751         return 0;
1752 }
1753
1754 /******************************************************************************
1755  *
1756  *              Log recover routines
1757  *
1758  ******************************************************************************
1759  */
1760
1761 /*
1762  * Sort the log items in the transaction.
1763  *
1764  * The ordering constraints are defined by the inode allocation and unlink
1765  * behaviour. The rules are:
1766  *
1767  *      1. Every item is only logged once in a given transaction. Hence it
1768  *         represents the last logged state of the item. Hence ordering is
1769  *         dependent on the order in which operations need to be performed so
1770  *         required initial conditions are always met.
1771  *
1772  *      2. Cancelled buffers are recorded in pass 1 in a separate table and
1773  *         there's nothing to replay from them so we can simply cull them
1774  *         from the transaction. However, we can't do that until after we've
1775  *         replayed all the other items because they may be dependent on the
1776  *         cancelled buffer and replaying the cancelled buffer can remove it
1777  *         form the cancelled buffer table. Hence they have tobe done last.
1778  *
1779  *      3. Inode allocation buffers must be replayed before inode items that
1780  *         read the buffer and replay changes into it. For filesystems using the
1781  *         ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1782  *         treated the same as inode allocation buffers as they create and
1783  *         initialise the buffers directly.
1784  *
1785  *      4. Inode unlink buffers must be replayed after inode items are replayed.
1786  *         This ensures that inodes are completely flushed to the inode buffer
1787  *         in a "free" state before we remove the unlinked inode list pointer.
1788  *
1789  * Hence the ordering needs to be inode allocation buffers first, inode items
1790  * second, inode unlink buffers third and cancelled buffers last.
1791  *
1792  * But there's a problem with that - we can't tell an inode allocation buffer
1793  * apart from a regular buffer, so we can't separate them. We can, however,
1794  * tell an inode unlink buffer from the others, and so we can separate them out
1795  * from all the other buffers and move them to last.
1796  *
1797  * Hence, 4 lists, in order from head to tail:
1798  *      - buffer_list for all buffers except cancelled/inode unlink buffers
1799  *      - item_list for all non-buffer items
1800  *      - inode_buffer_list for inode unlink buffers
1801  *      - cancel_list for the cancelled buffers
1802  *
1803  * Note that we add objects to the tail of the lists so that first-to-last
1804  * ordering is preserved within the lists. Adding objects to the head of the
1805  * list means when we traverse from the head we walk them in last-to-first
1806  * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1807  * but for all other items there may be specific ordering that we need to
1808  * preserve.
1809  */
1810 STATIC int
1811 xlog_recover_reorder_trans(
1812         struct xlog             *log,
1813         struct xlog_recover     *trans,
1814         int                     pass)
1815 {
1816         xlog_recover_item_t     *item, *n;
1817         int                     error = 0;
1818         LIST_HEAD(sort_list);
1819         LIST_HEAD(cancel_list);
1820         LIST_HEAD(buffer_list);
1821         LIST_HEAD(inode_buffer_list);
1822         LIST_HEAD(inode_list);
1823
1824         list_splice_init(&trans->r_itemq, &sort_list);
1825         list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1826                 xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1827
1828                 switch (ITEM_TYPE(item)) {
1829                 case XFS_LI_ICREATE:
1830                         list_move_tail(&item->ri_list, &buffer_list);
1831                         break;
1832                 case XFS_LI_BUF:
1833                         if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1834                                 trace_xfs_log_recover_item_reorder_head(log,
1835                                                         trans, item, pass);
1836                                 list_move(&item->ri_list, &cancel_list);
1837                                 break;
1838                         }
1839                         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1840                                 list_move(&item->ri_list, &inode_buffer_list);
1841                                 break;
1842                         }
1843                         list_move_tail(&item->ri_list, &buffer_list);
1844                         break;
1845                 case XFS_LI_INODE:
1846                 case XFS_LI_DQUOT:
1847                 case XFS_LI_QUOTAOFF:
1848                 case XFS_LI_EFD:
1849                 case XFS_LI_EFI:
1850                         trace_xfs_log_recover_item_reorder_tail(log,
1851                                                         trans, item, pass);
1852                         list_move_tail(&item->ri_list, &inode_list);
1853                         break;
1854                 default:
1855                         xfs_warn(log->l_mp,
1856                                 "%s: unrecognized type of log operation",
1857                                 __func__);
1858                         ASSERT(0);
1859                         /*
1860                          * return the remaining items back to the transaction
1861                          * item list so they can be freed in caller.
1862                          */
1863                         if (!list_empty(&sort_list))
1864                                 list_splice_init(&sort_list, &trans->r_itemq);
1865                         error = -EIO;
1866                         goto out;
1867                 }
1868         }
1869 out:
1870         ASSERT(list_empty(&sort_list));
1871         if (!list_empty(&buffer_list))
1872                 list_splice(&buffer_list, &trans->r_itemq);
1873         if (!list_empty(&inode_list))
1874                 list_splice_tail(&inode_list, &trans->r_itemq);
1875         if (!list_empty(&inode_buffer_list))
1876                 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1877         if (!list_empty(&cancel_list))
1878                 list_splice_tail(&cancel_list, &trans->r_itemq);
1879         return error;
1880 }
1881
1882 /*
1883  * Build up the table of buf cancel records so that we don't replay
1884  * cancelled data in the second pass.  For buffer records that are
1885  * not cancel records, there is nothing to do here so we just return.
1886  *
1887  * If we get a cancel record which is already in the table, this indicates
1888  * that the buffer was cancelled multiple times.  In order to ensure
1889  * that during pass 2 we keep the record in the table until we reach its
1890  * last occurrence in the log, we keep a reference count in the cancel
1891  * record in the table to tell us how many times we expect to see this
1892  * record during the second pass.
1893  */
1894 STATIC int
1895 xlog_recover_buffer_pass1(
1896         struct xlog                     *log,
1897         struct xlog_recover_item        *item)
1898 {
1899         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1900         struct list_head        *bucket;
1901         struct xfs_buf_cancel   *bcp;
1902
1903         /*
1904          * If this isn't a cancel buffer item, then just return.
1905          */
1906         if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1907                 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1908                 return 0;
1909         }
1910
1911         /*
1912          * Insert an xfs_buf_cancel record into the hash table of them.
1913          * If there is already an identical record, bump its reference count.
1914          */
1915         bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1916         list_for_each_entry(bcp, bucket, bc_list) {
1917                 if (bcp->bc_blkno == buf_f->blf_blkno &&
1918                     bcp->bc_len == buf_f->blf_len) {
1919                         bcp->bc_refcount++;
1920                         trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1921                         return 0;
1922                 }
1923         }
1924
1925         bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1926         bcp->bc_blkno = buf_f->blf_blkno;
1927         bcp->bc_len = buf_f->blf_len;
1928         bcp->bc_refcount = 1;
1929         list_add_tail(&bcp->bc_list, bucket);
1930
1931         trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1932         return 0;
1933 }
1934
1935 /*
1936  * Check to see whether the buffer being recovered has a corresponding
1937  * entry in the buffer cancel record table. If it is, return the cancel
1938  * buffer structure to the caller.
1939  */
1940 STATIC struct xfs_buf_cancel *
1941 xlog_peek_buffer_cancelled(
1942         struct xlog             *log,
1943         xfs_daddr_t             blkno,
1944         uint                    len,
1945         ushort                  flags)
1946 {
1947         struct list_head        *bucket;
1948         struct xfs_buf_cancel   *bcp;
1949
1950         if (!log->l_buf_cancel_table) {
1951                 /* empty table means no cancelled buffers in the log */
1952                 ASSERT(!(flags & XFS_BLF_CANCEL));
1953                 return NULL;
1954         }
1955
1956         bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1957         list_for_each_entry(bcp, bucket, bc_list) {
1958                 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1959                         return bcp;
1960         }
1961
1962         /*
1963          * We didn't find a corresponding entry in the table, so return 0 so
1964          * that the buffer is NOT cancelled.
1965          */
1966         ASSERT(!(flags & XFS_BLF_CANCEL));
1967         return NULL;
1968 }
1969
1970 /*
1971  * If the buffer is being cancelled then return 1 so that it will be cancelled,
1972  * otherwise return 0.  If the buffer is actually a buffer cancel item
1973  * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
1974  * table and remove it from the table if this is the last reference.
1975  *
1976  * We remove the cancel record from the table when we encounter its last
1977  * occurrence in the log so that if the same buffer is re-used again after its
1978  * last cancellation we actually replay the changes made at that point.
1979  */
1980 STATIC int
1981 xlog_check_buffer_cancelled(
1982         struct xlog             *log,
1983         xfs_daddr_t             blkno,
1984         uint                    len,
1985         ushort                  flags)
1986 {
1987         struct xfs_buf_cancel   *bcp;
1988
1989         bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
1990         if (!bcp)
1991                 return 0;
1992
1993         /*
1994          * We've go a match, so return 1 so that the recovery of this buffer
1995          * is cancelled.  If this buffer is actually a buffer cancel log
1996          * item, then decrement the refcount on the one in the table and
1997          * remove it if this is the last reference.
1998          */
1999         if (flags & XFS_BLF_CANCEL) {
2000                 if (--bcp->bc_refcount == 0) {
2001                         list_del(&bcp->bc_list);
2002                         kmem_free(bcp);
2003                 }
2004         }
2005         return 1;
2006 }
2007
2008 /*
2009  * Perform recovery for a buffer full of inodes.  In these buffers, the only
2010  * data which should be recovered is that which corresponds to the
2011  * di_next_unlinked pointers in the on disk inode structures.  The rest of the
2012  * data for the inodes is always logged through the inodes themselves rather
2013  * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2014  *
2015  * The only time when buffers full of inodes are fully recovered is when the
2016  * buffer is full of newly allocated inodes.  In this case the buffer will
2017  * not be marked as an inode buffer and so will be sent to
2018  * xlog_recover_do_reg_buffer() below during recovery.
2019  */
2020 STATIC int
2021 xlog_recover_do_inode_buffer(
2022         struct xfs_mount        *mp,
2023         xlog_recover_item_t     *item,
2024         struct xfs_buf          *bp,
2025         xfs_buf_log_format_t    *buf_f)
2026 {
2027         int                     i;
2028         int                     item_index = 0;
2029         int                     bit = 0;
2030         int                     nbits = 0;
2031         int                     reg_buf_offset = 0;
2032         int                     reg_buf_bytes = 0;
2033         int                     next_unlinked_offset;
2034         int                     inodes_per_buf;
2035         xfs_agino_t             *logged_nextp;
2036         xfs_agino_t             *buffer_nextp;
2037
2038         trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2039
2040         /*
2041          * Post recovery validation only works properly on CRC enabled
2042          * filesystems.
2043          */
2044         if (xfs_sb_version_hascrc(&mp->m_sb))
2045                 bp->b_ops = &xfs_inode_buf_ops;
2046
2047         inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
2048         for (i = 0; i < inodes_per_buf; i++) {
2049                 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2050                         offsetof(xfs_dinode_t, di_next_unlinked);
2051
2052                 while (next_unlinked_offset >=
2053                        (reg_buf_offset + reg_buf_bytes)) {
2054                         /*
2055                          * The next di_next_unlinked field is beyond
2056                          * the current logged region.  Find the next
2057                          * logged region that contains or is beyond
2058                          * the current di_next_unlinked field.
2059                          */
2060                         bit += nbits;
2061                         bit = xfs_next_bit(buf_f->blf_data_map,
2062                                            buf_f->blf_map_size, bit);
2063
2064                         /*
2065                          * If there are no more logged regions in the
2066                          * buffer, then we're done.
2067                          */
2068                         if (bit == -1)
2069                                 return 0;
2070
2071                         nbits = xfs_contig_bits(buf_f->blf_data_map,
2072                                                 buf_f->blf_map_size, bit);
2073                         ASSERT(nbits > 0);
2074                         reg_buf_offset = bit << XFS_BLF_SHIFT;
2075                         reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2076                         item_index++;
2077                 }
2078
2079                 /*
2080                  * If the current logged region starts after the current
2081                  * di_next_unlinked field, then move on to the next
2082                  * di_next_unlinked field.
2083                  */
2084                 if (next_unlinked_offset < reg_buf_offset)
2085                         continue;
2086
2087                 ASSERT(item->ri_buf[item_index].i_addr != NULL);
2088                 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2089                 ASSERT((reg_buf_offset + reg_buf_bytes) <=
2090                                                         BBTOB(bp->b_io_length));
2091
2092                 /*
2093                  * The current logged region contains a copy of the
2094                  * current di_next_unlinked field.  Extract its value
2095                  * and copy it to the buffer copy.
2096                  */
2097                 logged_nextp = item->ri_buf[item_index].i_addr +
2098                                 next_unlinked_offset - reg_buf_offset;
2099                 if (unlikely(*logged_nextp == 0)) {
2100                         xfs_alert(mp,
2101                 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
2102                 "Trying to replay bad (0) inode di_next_unlinked field.",
2103                                 item, bp);
2104                         XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2105                                          XFS_ERRLEVEL_LOW, mp);
2106                         return -EFSCORRUPTED;
2107                 }
2108
2109                 buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2110                 *buffer_nextp = *logged_nextp;
2111
2112                 /*
2113                  * If necessary, recalculate the CRC in the on-disk inode. We
2114                  * have to leave the inode in a consistent state for whoever
2115                  * reads it next....
2116                  */
2117                 xfs_dinode_calc_crc(mp,
2118                                 xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2119
2120         }
2121
2122         return 0;
2123 }
2124
2125 /*
2126  * V5 filesystems know the age of the buffer on disk being recovered. We can
2127  * have newer objects on disk than we are replaying, and so for these cases we
2128  * don't want to replay the current change as that will make the buffer contents
2129  * temporarily invalid on disk.
2130  *
2131  * The magic number might not match the buffer type we are going to recover
2132  * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
2133  * extract the LSN of the existing object in the buffer based on its current
2134  * magic number.  If we don't recognise the magic number in the buffer, then
2135  * return a LSN of -1 so that the caller knows it was an unrecognised block and
2136  * so can recover the buffer.
2137  *
2138  * Note: we cannot rely solely on magic number matches to determine that the
2139  * buffer has a valid LSN - we also need to verify that it belongs to this
2140  * filesystem, so we need to extract the object's LSN and compare it to that
2141  * which we read from the superblock. If the UUIDs don't match, then we've got a
2142  * stale metadata block from an old filesystem instance that we need to recover
2143  * over the top of.
2144  */
2145 static xfs_lsn_t
2146 xlog_recover_get_buf_lsn(
2147         struct xfs_mount        *mp,
2148         struct xfs_buf          *bp)
2149 {
2150         __uint32_t              magic32;
2151         __uint16_t              magic16;
2152         __uint16_t              magicda;
2153         void                    *blk = bp->b_addr;
2154         uuid_t                  *uuid;
2155         xfs_lsn_t               lsn = -1;
2156
2157         /* v4 filesystems always recover immediately */
2158         if (!xfs_sb_version_hascrc(&mp->m_sb))
2159                 goto recover_immediately;
2160
2161         magic32 = be32_to_cpu(*(__be32 *)blk);
2162         switch (magic32) {
2163         case XFS_ABTB_CRC_MAGIC:
2164         case XFS_ABTC_CRC_MAGIC:
2165         case XFS_ABTB_MAGIC:
2166         case XFS_ABTC_MAGIC:
2167         case XFS_IBT_CRC_MAGIC:
2168         case XFS_IBT_MAGIC: {
2169                 struct xfs_btree_block *btb = blk;
2170
2171                 lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2172                 uuid = &btb->bb_u.s.bb_uuid;
2173                 break;
2174         }
2175         case XFS_BMAP_CRC_MAGIC:
2176         case XFS_BMAP_MAGIC: {
2177                 struct xfs_btree_block *btb = blk;
2178
2179                 lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2180                 uuid = &btb->bb_u.l.bb_uuid;
2181                 break;
2182         }
2183         case XFS_AGF_MAGIC:
2184                 lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2185                 uuid = &((struct xfs_agf *)blk)->agf_uuid;
2186                 break;
2187         case XFS_AGFL_MAGIC:
2188                 lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2189                 uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2190                 break;
2191         case XFS_AGI_MAGIC:
2192                 lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2193                 uuid = &((struct xfs_agi *)blk)->agi_uuid;
2194                 break;
2195         case XFS_SYMLINK_MAGIC:
2196                 lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2197                 uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2198                 break;
2199         case XFS_DIR3_BLOCK_MAGIC:
2200         case XFS_DIR3_DATA_MAGIC:
2201         case XFS_DIR3_FREE_MAGIC:
2202                 lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2203                 uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2204                 break;
2205         case XFS_ATTR3_RMT_MAGIC:
2206                 /*
2207                  * Remote attr blocks are written synchronously, rather than
2208                  * being logged. That means they do not contain a valid LSN
2209                  * (i.e. transactionally ordered) in them, and hence any time we
2210                  * see a buffer to replay over the top of a remote attribute
2211                  * block we should simply do so.
2212                  */
2213                 goto recover_immediately;
2214         case XFS_SB_MAGIC:
2215                 /*
2216                  * superblock uuids are magic. We may or may not have a
2217                  * sb_meta_uuid on disk, but it will be set in the in-core
2218                  * superblock. We set the uuid pointer for verification
2219                  * according to the superblock feature mask to ensure we check
2220                  * the relevant UUID in the superblock.
2221                  */
2222                 lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2223                 if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2224                         uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2225                 else
2226                         uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2227                 break;
2228         default:
2229                 break;
2230         }
2231
2232         if (lsn != (xfs_lsn_t)-1) {
2233                 if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2234                         goto recover_immediately;
2235                 return lsn;
2236         }
2237
2238         magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2239         switch (magicda) {
2240         case XFS_DIR3_LEAF1_MAGIC:
2241         case XFS_DIR3_LEAFN_MAGIC:
2242         case XFS_DA3_NODE_MAGIC:
2243                 lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2244                 uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2245                 break;
2246         default:
2247                 break;
2248         }
2249
2250         if (lsn != (xfs_lsn_t)-1) {
2251                 if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2252                         goto recover_immediately;
2253                 return lsn;
2254         }
2255
2256         /*
2257          * We do individual object checks on dquot and inode buffers as they
2258          * have their own individual LSN records. Also, we could have a stale
2259          * buffer here, so we have to at least recognise these buffer types.
2260          *
2261          * A notd complexity here is inode unlinked list processing - it logs
2262          * the inode directly in the buffer, but we don't know which inodes have
2263          * been modified, and there is no global buffer LSN. Hence we need to
2264          * recover all inode buffer types immediately. This problem will be
2265          * fixed by logical logging of the unlinked list modifications.
2266          */
2267         magic16 = be16_to_cpu(*(__be16 *)blk);
2268         switch (magic16) {
2269         case XFS_DQUOT_MAGIC:
2270         case XFS_DINODE_MAGIC:
2271                 goto recover_immediately;
2272         default:
2273                 break;
2274         }
2275
2276         /* unknown buffer contents, recover immediately */
2277
2278 recover_immediately:
2279         return (xfs_lsn_t)-1;
2280
2281 }
2282
2283 /*
2284  * Validate the recovered buffer is of the correct type and attach the
2285  * appropriate buffer operations to them for writeback. Magic numbers are in a
2286  * few places:
2287  *      the first 16 bits of the buffer (inode buffer, dquot buffer),
2288  *      the first 32 bits of the buffer (most blocks),
2289  *      inside a struct xfs_da_blkinfo at the start of the buffer.
2290  */
2291 static void
2292 xlog_recover_validate_buf_type(
2293         struct xfs_mount        *mp,
2294         struct xfs_buf          *bp,
2295         xfs_buf_log_format_t    *buf_f)
2296 {
2297         struct xfs_da_blkinfo   *info = bp->b_addr;
2298         __uint32_t              magic32;
2299         __uint16_t              magic16;
2300         __uint16_t              magicda;
2301
2302         /*
2303          * We can only do post recovery validation on items on CRC enabled
2304          * fielsystems as we need to know when the buffer was written to be able
2305          * to determine if we should have replayed the item. If we replay old
2306          * metadata over a newer buffer, then it will enter a temporarily
2307          * inconsistent state resulting in verification failures. Hence for now
2308          * just avoid the verification stage for non-crc filesystems
2309          */
2310         if (!xfs_sb_version_hascrc(&mp->m_sb))
2311                 return;
2312
2313         magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2314         magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2315         magicda = be16_to_cpu(info->magic);
2316         switch (xfs_blft_from_flags(buf_f)) {
2317         case XFS_BLFT_BTREE_BUF:
2318                 switch (magic32) {
2319                 case XFS_ABTB_CRC_MAGIC:
2320                 case XFS_ABTC_CRC_MAGIC:
2321                 case XFS_ABTB_MAGIC:
2322                 case XFS_ABTC_MAGIC:
2323                         bp->b_ops = &xfs_allocbt_buf_ops;
2324                         break;
2325                 case XFS_IBT_CRC_MAGIC:
2326                 case XFS_FIBT_CRC_MAGIC:
2327                 case XFS_IBT_MAGIC:
2328                 case XFS_FIBT_MAGIC:
2329                         bp->b_ops = &xfs_inobt_buf_ops;
2330                         break;
2331                 case XFS_BMAP_CRC_MAGIC:
2332                 case XFS_BMAP_MAGIC:
2333                         bp->b_ops = &xfs_bmbt_buf_ops;
2334                         break;
2335                 default:
2336                         xfs_warn(mp, "Bad btree block magic!");
2337                         ASSERT(0);
2338                         break;
2339                 }
2340                 break;
2341         case XFS_BLFT_AGF_BUF:
2342                 if (magic32 != XFS_AGF_MAGIC) {
2343                         xfs_warn(mp, "Bad AGF block magic!");
2344                         ASSERT(0);
2345                         break;
2346                 }
2347                 bp->b_ops = &xfs_agf_buf_ops;
2348                 break;
2349         case XFS_BLFT_AGFL_BUF:
2350                 if (magic32 != XFS_AGFL_MAGIC) {
2351                         xfs_warn(mp, "Bad AGFL block magic!");
2352                         ASSERT(0);
2353                         break;
2354                 }
2355                 bp->b_ops = &xfs_agfl_buf_ops;
2356                 break;
2357         case XFS_BLFT_AGI_BUF:
2358                 if (magic32 != XFS_AGI_MAGIC) {
2359                         xfs_warn(mp, "Bad AGI block magic!");
2360                         ASSERT(0);
2361                         break;
2362                 }
2363                 bp->b_ops = &xfs_agi_buf_ops;
2364                 break;
2365         case XFS_BLFT_UDQUOT_BUF:
2366         case XFS_BLFT_PDQUOT_BUF:
2367         case XFS_BLFT_GDQUOT_BUF:
2368 #ifdef CONFIG_XFS_QUOTA
2369                 if (magic16 != XFS_DQUOT_MAGIC) {
2370                         xfs_warn(mp, "Bad DQUOT block magic!");
2371                         ASSERT(0);
2372                         break;
2373                 }
2374                 bp->b_ops = &xfs_dquot_buf_ops;
2375 #else
2376                 xfs_alert(mp,
2377         "Trying to recover dquots without QUOTA support built in!");
2378                 ASSERT(0);
2379 #endif
2380                 break;
2381         case XFS_BLFT_DINO_BUF:
2382                 if (magic16 != XFS_DINODE_MAGIC) {
2383                         xfs_warn(mp, "Bad INODE block magic!");
2384                         ASSERT(0);
2385                         break;
2386                 }
2387                 bp->b_ops = &xfs_inode_buf_ops;
2388                 break;
2389         case XFS_BLFT_SYMLINK_BUF:
2390                 if (magic32 != XFS_SYMLINK_MAGIC) {
2391                         xfs_warn(mp, "Bad symlink block magic!");
2392                         ASSERT(0);
2393                         break;
2394                 }
2395                 bp->b_ops = &xfs_symlink_buf_ops;
2396                 break;
2397         case XFS_BLFT_DIR_BLOCK_BUF:
2398                 if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2399                     magic32 != XFS_DIR3_BLOCK_MAGIC) {
2400                         xfs_warn(mp, "Bad dir block magic!");
2401                         ASSERT(0);
2402                         break;
2403                 }
2404                 bp->b_ops = &xfs_dir3_block_buf_ops;
2405                 break;
2406         case XFS_BLFT_DIR_DATA_BUF:
2407                 if (magic32 != XFS_DIR2_DATA_MAGIC &&
2408                     magic32 != XFS_DIR3_DATA_MAGIC) {
2409                         xfs_warn(mp, "Bad dir data magic!");
2410                         ASSERT(0);
2411                         break;
2412                 }
2413                 bp->b_ops = &xfs_dir3_data_buf_ops;
2414                 break;
2415         case XFS_BLFT_DIR_FREE_BUF:
2416                 if (magic32 != XFS_DIR2_FREE_MAGIC &&
2417                     magic32 != XFS_DIR3_FREE_MAGIC) {
2418                         xfs_warn(mp, "Bad dir3 free magic!");
2419                         ASSERT(0);
2420                         break;
2421                 }
2422                 bp->b_ops = &xfs_dir3_free_buf_ops;
2423                 break;
2424         case XFS_BLFT_DIR_LEAF1_BUF:
2425                 if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2426                     magicda != XFS_DIR3_LEAF1_MAGIC) {
2427                         xfs_warn(mp, "Bad dir leaf1 magic!");
2428                         ASSERT(0);
2429                         break;
2430                 }
2431                 bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2432                 break;
2433         case XFS_BLFT_DIR_LEAFN_BUF:
2434                 if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2435                     magicda != XFS_DIR3_LEAFN_MAGIC) {
2436                         xfs_warn(mp, "Bad dir leafn magic!");
2437                         ASSERT(0);
2438                         break;
2439                 }
2440                 bp->b_ops = &xfs_dir3_leafn_buf_ops;
2441                 break;
2442         case XFS_BLFT_DA_NODE_BUF:
2443                 if (magicda != XFS_DA_NODE_MAGIC &&
2444                     magicda != XFS_DA3_NODE_MAGIC) {
2445                         xfs_warn(mp, "Bad da node magic!");
2446                         ASSERT(0);
2447                         break;
2448                 }
2449                 bp->b_ops = &xfs_da3_node_buf_ops;
2450                 break;
2451         case XFS_BLFT_ATTR_LEAF_BUF:
2452                 if (magicda != XFS_ATTR_LEAF_MAGIC &&
2453                     magicda != XFS_ATTR3_LEAF_MAGIC) {
2454                         xfs_warn(mp, "Bad attr leaf magic!");
2455                         ASSERT(0);
2456                         break;
2457                 }
2458                 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2459                 break;
2460         case XFS_BLFT_ATTR_RMT_BUF:
2461                 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2462                         xfs_warn(mp, "Bad attr remote magic!");
2463                         ASSERT(0);
2464                         break;
2465                 }
2466                 bp->b_ops = &xfs_attr3_rmt_buf_ops;
2467                 break;
2468         case XFS_BLFT_SB_BUF:
2469                 if (magic32 != XFS_SB_MAGIC) {
2470                         xfs_warn(mp, "Bad SB block magic!");
2471                         ASSERT(0);
2472                         break;
2473                 }
2474                 bp->b_ops = &xfs_sb_buf_ops;
2475                 break;
2476         default:
2477                 xfs_warn(mp, "Unknown buffer type %d!",
2478                          xfs_blft_from_flags(buf_f));
2479                 break;
2480         }
2481 }
2482
2483 /*
2484  * Perform a 'normal' buffer recovery.  Each logged region of the
2485  * buffer should be copied over the corresponding region in the
2486  * given buffer.  The bitmap in the buf log format structure indicates
2487  * where to place the logged data.
2488  */
2489 STATIC void
2490 xlog_recover_do_reg_buffer(
2491         struct xfs_mount        *mp,
2492         xlog_recover_item_t     *item,
2493         struct xfs_buf          *bp,
2494         xfs_buf_log_format_t    *buf_f)
2495 {
2496         int                     i;
2497         int                     bit;
2498         int                     nbits;
2499         int                     error;
2500
2501         trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2502
2503         bit = 0;
2504         i = 1;  /* 0 is the buf format structure */
2505         while (1) {
2506                 bit = xfs_next_bit(buf_f->blf_data_map,
2507                                    buf_f->blf_map_size, bit);
2508                 if (bit == -1)
2509                         break;
2510                 nbits = xfs_contig_bits(buf_f->blf_data_map,
2511                                         buf_f->blf_map_size, bit);
2512                 ASSERT(nbits > 0);
2513                 ASSERT(item->ri_buf[i].i_addr != NULL);
2514                 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2515                 ASSERT(BBTOB(bp->b_io_length) >=
2516                        ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2517
2518                 /*
2519                  * The dirty regions logged in the buffer, even though
2520                  * contiguous, may span multiple chunks. This is because the
2521                  * dirty region may span a physical page boundary in a buffer
2522                  * and hence be split into two separate vectors for writing into
2523                  * the log. Hence we need to trim nbits back to the length of
2524                  * the current region being copied out of the log.
2525                  */
2526                 if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2527                         nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2528
2529                 /*
2530                  * Do a sanity check if this is a dquot buffer. Just checking
2531                  * the first dquot in the buffer should do. XXXThis is
2532                  * probably a good thing to do for other buf types also.
2533                  */
2534                 error = 0;
2535                 if (buf_f->blf_flags &
2536                    (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2537                         if (item->ri_buf[i].i_addr == NULL) {
2538                                 xfs_alert(mp,
2539                                         "XFS: NULL dquot in %s.", __func__);
2540                                 goto next;
2541                         }
2542                         if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2543                                 xfs_alert(mp,
2544                                         "XFS: dquot too small (%d) in %s.",
2545                                         item->ri_buf[i].i_len, __func__);
2546                                 goto next;
2547                         }
2548                         error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
2549                                                -1, 0, XFS_QMOPT_DOWARN,
2550                                                "dquot_buf_recover");
2551                         if (error)
2552                                 goto next;
2553                 }
2554
2555                 memcpy(xfs_buf_offset(bp,
2556                         (uint)bit << XFS_BLF_SHIFT),    /* dest */
2557                         item->ri_buf[i].i_addr,         /* source */
2558                         nbits<<XFS_BLF_SHIFT);          /* length */
2559  next:
2560                 i++;
2561                 bit += nbits;
2562         }
2563
2564         /* Shouldn't be any more regions */
2565         ASSERT(i == item->ri_total);
2566
2567         xlog_recover_validate_buf_type(mp, bp, buf_f);
2568 }
2569
2570 /*
2571  * Perform a dquot buffer recovery.
2572  * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2573  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2574  * Else, treat it as a regular buffer and do recovery.
2575  *
2576  * Return false if the buffer was tossed and true if we recovered the buffer to
2577  * indicate to the caller if the buffer needs writing.
2578  */
2579 STATIC bool
2580 xlog_recover_do_dquot_buffer(
2581         struct xfs_mount                *mp,
2582         struct xlog                     *log,
2583         struct xlog_recover_item        *item,
2584         struct xfs_buf                  *bp,
2585         struct xfs_buf_log_format       *buf_f)
2586 {
2587         uint                    type;
2588
2589         trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2590
2591         /*
2592          * Filesystems are required to send in quota flags at mount time.
2593          */
2594         if (!mp->m_qflags)
2595                 return false;
2596
2597         type = 0;
2598         if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2599                 type |= XFS_DQ_USER;
2600         if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2601                 type |= XFS_DQ_PROJ;
2602         if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2603                 type |= XFS_DQ_GROUP;
2604         /*
2605          * This type of quotas was turned off, so ignore this buffer
2606          */
2607         if (log->l_quotaoffs_flag & type)
2608                 return false;
2609
2610         xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2611         return true;
2612 }
2613
2614 /*
2615  * This routine replays a modification made to a buffer at runtime.
2616  * There are actually two types of buffer, regular and inode, which
2617  * are handled differently.  Inode buffers are handled differently
2618  * in that we only recover a specific set of data from them, namely
2619  * the inode di_next_unlinked fields.  This is because all other inode
2620  * data is actually logged via inode records and any data we replay
2621  * here which overlaps that may be stale.
2622  *
2623  * When meta-data buffers are freed at run time we log a buffer item
2624  * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2625  * of the buffer in the log should not be replayed at recovery time.
2626  * This is so that if the blocks covered by the buffer are reused for
2627  * file data before we crash we don't end up replaying old, freed
2628  * meta-data into a user's file.
2629  *
2630  * To handle the cancellation of buffer log items, we make two passes
2631  * over the log during recovery.  During the first we build a table of
2632  * those buffers which have been cancelled, and during the second we
2633  * only replay those buffers which do not have corresponding cancel
2634  * records in the table.  See xlog_recover_buffer_pass[1,2] above
2635  * for more details on the implementation of the table of cancel records.
2636  */
2637 STATIC int
2638 xlog_recover_buffer_pass2(
2639         struct xlog                     *log,
2640         struct list_head                *buffer_list,
2641         struct xlog_recover_item        *item,
2642         xfs_lsn_t                       current_lsn)
2643 {
2644         xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
2645         xfs_mount_t             *mp = log->l_mp;
2646         xfs_buf_t               *bp;
2647         int                     error;
2648         uint                    buf_flags;
2649         xfs_lsn_t               lsn;
2650
2651         /*
2652          * In this pass we only want to recover all the buffers which have
2653          * not been cancelled and are not cancellation buffers themselves.
2654          */
2655         if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2656                         buf_f->blf_len, buf_f->blf_flags)) {
2657                 trace_xfs_log_recover_buf_cancel(log, buf_f);
2658                 return 0;
2659         }
2660
2661         trace_xfs_log_recover_buf_recover(log, buf_f);
2662
2663         buf_flags = 0;
2664         if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2665                 buf_flags |= XBF_UNMAPPED;
2666
2667         bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2668                           buf_flags, NULL);
2669         if (!bp)
2670                 return -ENOMEM;
2671         error = bp->b_error;
2672         if (error) {
2673                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2674                 goto out_release;
2675         }
2676
2677         /*
2678          * Recover the buffer only if we get an LSN from it and it's less than
2679          * the lsn of the transaction we are replaying.
2680          *
2681          * Note that we have to be extremely careful of readahead here.
2682          * Readahead does not attach verfiers to the buffers so if we don't
2683          * actually do any replay after readahead because of the LSN we found
2684          * in the buffer if more recent than that current transaction then we
2685          * need to attach the verifier directly. Failure to do so can lead to
2686          * future recovery actions (e.g. EFI and unlinked list recovery) can
2687          * operate on the buffers and they won't get the verifier attached. This
2688          * can lead to blocks on disk having the correct content but a stale
2689          * CRC.
2690          *
2691          * It is safe to assume these clean buffers are currently up to date.
2692          * If the buffer is dirtied by a later transaction being replayed, then
2693          * the verifier will be reset to match whatever recover turns that
2694          * buffer into.
2695          */
2696         lsn = xlog_recover_get_buf_lsn(mp, bp);
2697         if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2698                 xlog_recover_validate_buf_type(mp, bp, buf_f);
2699                 goto out_release;
2700         }
2701
2702         if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2703                 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2704                 if (error)
2705                         goto out_release;
2706         } else if (buf_f->blf_flags &
2707                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2708                 bool    dirty;
2709
2710                 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2711                 if (!dirty)
2712                         goto out_release;
2713         } else {
2714                 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2715         }
2716
2717         /*
2718          * Perform delayed write on the buffer.  Asynchronous writes will be
2719          * slower when taking into account all the buffers to be flushed.
2720          *
2721          * Also make sure that only inode buffers with good sizes stay in
2722          * the buffer cache.  The kernel moves inodes in buffers of 1 block
2723          * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
2724          * buffers in the log can be a different size if the log was generated
2725          * by an older kernel using unclustered inode buffers or a newer kernel
2726          * running with a different inode cluster size.  Regardless, if the
2727          * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
2728          * for *our* value of mp->m_inode_cluster_size, then we need to keep
2729          * the buffer out of the buffer cache so that the buffer won't
2730          * overlap with future reads of those inodes.
2731          */
2732         if (XFS_DINODE_MAGIC ==
2733             be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2734             (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2735                         (__uint32_t)log->l_mp->m_inode_cluster_size))) {
2736                 xfs_buf_stale(bp);
2737                 error = xfs_bwrite(bp);
2738         } else {
2739                 ASSERT(bp->b_target->bt_mount == mp);
2740                 bp->b_iodone = xlog_recover_iodone;
2741                 xfs_buf_delwri_queue(bp, buffer_list);
2742         }
2743
2744 out_release:
2745         xfs_buf_relse(bp);
2746         return error;
2747 }
2748
2749 /*
2750  * Inode fork owner changes
2751  *
2752  * If we have been told that we have to reparent the inode fork, it's because an
2753  * extent swap operation on a CRC enabled filesystem has been done and we are
2754  * replaying it. We need to walk the BMBT of the appropriate fork and change the
2755  * owners of it.
2756  *
2757  * The complexity here is that we don't have an inode context to work with, so
2758  * after we've replayed the inode we need to instantiate one.  This is where the
2759  * fun begins.
2760  *
2761  * We are in the middle of log recovery, so we can't run transactions. That
2762  * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2763  * that will result in the corresponding iput() running the inode through
2764  * xfs_inactive(). If we've just replayed an inode core that changes the link
2765  * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2766  * transactions (bad!).
2767  *
2768  * So, to avoid this, we instantiate an inode directly from the inode core we've
2769  * just recovered. We have the buffer still locked, and all we really need to
2770  * instantiate is the inode core and the forks being modified. We can do this
2771  * manually, then run the inode btree owner change, and then tear down the
2772  * xfs_inode without having to run any transactions at all.
2773  *
2774  * Also, because we don't have a transaction context available here but need to
2775  * gather all the buffers we modify for writeback so we pass the buffer_list
2776  * instead for the operation to use.
2777  */
2778
2779 STATIC int
2780 xfs_recover_inode_owner_change(
2781         struct xfs_mount        *mp,
2782         struct xfs_dinode       *dip,
2783         struct xfs_inode_log_format *in_f,
2784         struct list_head        *buffer_list)
2785 {
2786         struct xfs_inode        *ip;
2787         int                     error;
2788
2789         ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2790
2791         ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2792         if (!ip)
2793                 return -ENOMEM;
2794
2795         /* instantiate the inode */
2796         xfs_dinode_from_disk(&ip->i_d, dip);
2797         ASSERT(ip->i_d.di_version >= 3);
2798
2799         error = xfs_iformat_fork(ip, dip);
2800         if (error)
2801                 goto out_free_ip;
2802
2803
2804         if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2805                 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2806                 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2807                                               ip->i_ino, buffer_list);
2808                 if (error)
2809                         goto out_free_ip;
2810         }
2811
2812         if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2813                 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2814                 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2815                                               ip->i_ino, buffer_list);
2816                 if (error)
2817                         goto out_free_ip;
2818         }
2819
2820 out_free_ip:
2821         xfs_inode_free(ip);
2822         return error;
2823 }
2824
2825 STATIC int
2826 xlog_recover_inode_pass2(
2827         struct xlog                     *log,
2828         struct list_head                *buffer_list,
2829         struct xlog_recover_item        *item,
2830         xfs_lsn_t                       current_lsn)
2831 {
2832         xfs_inode_log_format_t  *in_f;
2833         xfs_mount_t             *mp = log->l_mp;
2834         xfs_buf_t               *bp;
2835         xfs_dinode_t            *dip;
2836         int                     len;
2837         char                    *src;
2838         char                    *dest;
2839         int                     error;
2840         int                     attr_index;
2841         uint                    fields;
2842         xfs_icdinode_t          *dicp;
2843         uint                    isize;
2844         int                     need_free = 0;
2845
2846         if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2847                 in_f = item->ri_buf[0].i_addr;
2848         } else {
2849                 in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2850                 need_free = 1;
2851                 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2852                 if (error)
2853                         goto error;
2854         }
2855
2856         /*
2857          * Inode buffers can be freed, look out for it,
2858          * and do not replay the inode.
2859          */
2860         if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2861                                         in_f->ilf_len, 0)) {
2862                 error = 0;
2863                 trace_xfs_log_recover_inode_cancel(log, in_f);
2864                 goto error;
2865         }
2866         trace_xfs_log_recover_inode_recover(log, in_f);
2867
2868         bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2869                           &xfs_inode_buf_ops);
2870         if (!bp) {
2871                 error = -ENOMEM;
2872                 goto error;
2873         }
2874         error = bp->b_error;
2875         if (error) {
2876                 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2877                 goto out_release;
2878         }
2879         ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2880         dip = xfs_buf_offset(bp, in_f->ilf_boffset);
2881
2882         /*
2883          * Make sure the place we're flushing out to really looks
2884          * like an inode!
2885          */
2886         if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2887                 xfs_alert(mp,
2888         "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2889                         __func__, dip, bp, in_f->ilf_ino);
2890                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2891                                  XFS_ERRLEVEL_LOW, mp);
2892                 error = -EFSCORRUPTED;
2893                 goto out_release;
2894         }
2895         dicp = item->ri_buf[1].i_addr;
2896         if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2897                 xfs_alert(mp,
2898                         "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2899                         __func__, item, in_f->ilf_ino);
2900                 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2901                                  XFS_ERRLEVEL_LOW, mp);
2902                 error = -EFSCORRUPTED;
2903                 goto out_release;
2904         }
2905
2906         /*
2907          * If the inode has an LSN in it, recover the inode only if it's less
2908          * than the lsn of the transaction we are replaying. Note: we still
2909          * need to replay an owner change even though the inode is more recent
2910          * than the transaction as there is no guarantee that all the btree
2911          * blocks are more recent than this transaction, too.
2912          */
2913         if (dip->di_version >= 3) {
2914                 xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
2915
2916                 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2917                         trace_xfs_log_recover_inode_skip(log, in_f);
2918                         error = 0;
2919                         goto out_owner_change;
2920                 }
2921         }
2922
2923         /*
2924          * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
2925          * are transactional and if ordering is necessary we can determine that
2926          * more accurately by the LSN field in the V3 inode core. Don't trust
2927          * the inode versions we might be changing them here - use the
2928          * superblock flag to determine whether we need to look at di_flushiter
2929          * to skip replay when the on disk inode is newer than the log one
2930          */
2931         if (!xfs_sb_version_hascrc(&mp->m_sb) &&
2932             dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2933                 /*
2934                  * Deal with the wrap case, DI_MAX_FLUSH is less
2935                  * than smaller numbers
2936                  */
2937                 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2938                     dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2939                         /* do nothing */
2940                 } else {
2941                         trace_xfs_log_recover_inode_skip(log, in_f);
2942                         error = 0;
2943                         goto out_release;
2944                 }
2945         }
2946
2947         /* Take the opportunity to reset the flush iteration count */
2948         dicp->di_flushiter = 0;
2949
2950         if (unlikely(S_ISREG(dicp->di_mode))) {
2951                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2952                     (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2953                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2954                                          XFS_ERRLEVEL_LOW, mp, dicp);
2955                         xfs_alert(mp,
2956                 "%s: Bad regular inode log record, rec ptr 0x%p, "
2957                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2958                                 __func__, item, dip, bp, in_f->ilf_ino);
2959                         error = -EFSCORRUPTED;
2960                         goto out_release;
2961                 }
2962         } else if (unlikely(S_ISDIR(dicp->di_mode))) {
2963                 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2964                     (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2965                     (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2966                         XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2967                                              XFS_ERRLEVEL_LOW, mp, dicp);
2968                         xfs_alert(mp,
2969                 "%s: Bad dir inode log record, rec ptr 0x%p, "
2970                 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2971                                 __func__, item, dip, bp, in_f->ilf_ino);
2972                         error = -EFSCORRUPTED;
2973                         goto out_release;
2974                 }
2975         }
2976         if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2977                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2978                                      XFS_ERRLEVEL_LOW, mp, dicp);
2979                 xfs_alert(mp,
2980         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2981         "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2982                         __func__, item, dip, bp, in_f->ilf_ino,
2983                         dicp->di_nextents + dicp->di_anextents,
2984                         dicp->di_nblocks);
2985                 error = -EFSCORRUPTED;
2986                 goto out_release;
2987         }
2988         if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2989                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2990                                      XFS_ERRLEVEL_LOW, mp, dicp);
2991                 xfs_alert(mp,
2992         "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2993         "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2994                         item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2995                 error = -EFSCORRUPTED;
2996                 goto out_release;
2997         }
2998         isize = xfs_icdinode_size(dicp->di_version);
2999         if (unlikely(item->ri_buf[1].i_len > isize)) {
3000                 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3001                                      XFS_ERRLEVEL_LOW, mp, dicp);
3002                 xfs_alert(mp,
3003                         "%s: Bad inode log record length %d, rec ptr 0x%p",
3004                         __func__, item->ri_buf[1].i_len, item);
3005                 error = -EFSCORRUPTED;
3006                 goto out_release;
3007         }
3008
3009         /* The core is in in-core format */
3010         xfs_dinode_to_disk(dip, dicp);
3011
3012         /* the rest is in on-disk format */
3013         if (item->ri_buf[1].i_len > isize) {
3014                 memcpy((char *)dip + isize,
3015                         item->ri_buf[1].i_addr + isize,
3016                         item->ri_buf[1].i_len - isize);
3017         }
3018
3019         fields = in_f->ilf_fields;
3020         switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
3021         case XFS_ILOG_DEV:
3022                 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3023                 break;
3024         case XFS_ILOG_UUID:
3025                 memcpy(XFS_DFORK_DPTR(dip),
3026                        &in_f->ilf_u.ilfu_uuid,
3027                        sizeof(uuid_t));
3028                 break;
3029         }
3030
3031         if (in_f->ilf_size == 2)
3032                 goto out_owner_change;
3033         len = item->ri_buf[2].i_len;
3034         src = item->ri_buf[2].i_addr;
3035         ASSERT(in_f->ilf_size <= 4);
3036         ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3037         ASSERT(!(fields & XFS_ILOG_DFORK) ||
3038                (len == in_f->ilf_dsize));
3039
3040         switch (fields & XFS_ILOG_DFORK) {
3041         case XFS_ILOG_DDATA:
3042         case XFS_ILOG_DEXT:
3043                 memcpy(XFS_DFORK_DPTR(dip), src, len);
3044                 break;
3045
3046         case XFS_ILOG_DBROOT:
3047                 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3048                                  (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3049                                  XFS_DFORK_DSIZE(dip, mp));
3050                 break;
3051
3052         default:
3053                 /*
3054                  * There are no data fork flags set.
3055                  */
3056                 ASSERT((fields & XFS_ILOG_DFORK) == 0);
3057                 break;
3058         }
3059
3060         /*
3061          * If we logged any attribute data, recover it.  There may or
3062          * may not have been any other non-core data logged in this
3063          * transaction.
3064          */
3065         if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3066                 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3067                         attr_index = 3;
3068                 } else {
3069                         attr_index = 2;
3070                 }
3071                 len = item->ri_buf[attr_index].i_len;
3072                 src = item->ri_buf[attr_index].i_addr;
3073                 ASSERT(len == in_f->ilf_asize);
3074
3075                 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3076                 case XFS_ILOG_ADATA:
3077                 case XFS_ILOG_AEXT:
3078                         dest = XFS_DFORK_APTR(dip);
3079                         ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3080                         memcpy(dest, src, len);
3081                         break;
3082
3083                 case XFS_ILOG_ABROOT:
3084                         dest = XFS_DFORK_APTR(dip);
3085                         xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3086                                          len, (xfs_bmdr_block_t*)dest,
3087                                          XFS_DFORK_ASIZE(dip, mp));
3088                         break;
3089
3090                 default:
3091                         xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3092                         ASSERT(0);
3093                         error = -EIO;
3094                         goto out_release;
3095                 }
3096         }
3097
3098 out_owner_change:
3099         if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
3100                 error = xfs_recover_inode_owner_change(mp, dip, in_f,
3101                                                        buffer_list);
3102         /* re-generate the checksum. */
3103         xfs_dinode_calc_crc(log->l_mp, dip);
3104
3105         ASSERT(bp->b_target->bt_mount == mp);
3106         bp->b_iodone = xlog_recover_iodone;
3107         xfs_buf_delwri_queue(bp, buffer_list);
3108
3109 out_release:
3110         xfs_buf_relse(bp);
3111 error:
3112         if (need_free)
3113                 kmem_free(in_f);
3114         return error;
3115 }
3116
3117 /*
3118  * Recover QUOTAOFF records. We simply make a note of it in the xlog
3119  * structure, so that we know not to do any dquot item or dquot buffer recovery,
3120  * of that type.
3121  */
3122 STATIC int
3123 xlog_recover_quotaoff_pass1(
3124         struct xlog                     *log,
3125         struct xlog_recover_item        *item)
3126 {
3127         xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
3128         ASSERT(qoff_f);
3129
3130         /*
3131          * The logitem format's flag tells us if this was user quotaoff,
3132          * group/project quotaoff or both.
3133          */
3134         if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3135                 log->l_quotaoffs_flag |= XFS_DQ_USER;
3136         if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3137                 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3138         if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3139                 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3140
3141         return 0;
3142 }
3143
3144 /*
3145  * Recover a dquot record
3146  */
3147 STATIC int
3148 xlog_recover_dquot_pass2(
3149         struct xlog                     *log,
3150         struct list_head                *buffer_list,
3151         struct xlog_recover_item        *item,
3152         xfs_lsn_t                       current_lsn)
3153 {
3154         xfs_mount_t             *mp = log->l_mp;
3155         xfs_buf_t               *bp;
3156         struct xfs_disk_dquot   *ddq, *recddq;
3157         int                     error;
3158         xfs_dq_logformat_t      *dq_f;
3159         uint                    type;
3160
3161
3162         /*
3163          * Filesystems are required to send in quota flags at mount time.
3164          */
3165         if (mp->m_qflags == 0)
3166                 return 0;
3167
3168         recddq = item->ri_buf[1].i_addr;
3169         if (recddq == NULL) {
3170                 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3171                 return -EIO;
3172         }
3173         if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
3174                 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3175                         item->ri_buf[1].i_len, __func__);
3176                 return -EIO;
3177         }
3178
3179         /*
3180          * This type of quotas was turned off, so ignore this record.
3181          */
3182         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3183         ASSERT(type);
3184         if (log->l_quotaoffs_flag & type)
3185                 return 0;
3186
3187         /*
3188          * At this point we know that quota was _not_ turned off.
3189          * Since the mount flags are not indicating to us otherwise, this
3190          * must mean that quota is on, and the dquot needs to be replayed.
3191          * Remember that we may not have fully recovered the superblock yet,
3192          * so we can't do the usual trick of looking at the SB quota bits.
3193          *
3194          * The other possibility, of course, is that the quota subsystem was
3195          * removed since the last mount - ENOSYS.
3196          */
3197         dq_f = item->ri_buf[0].i_addr;
3198         ASSERT(dq_f);
3199         error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3200                            "xlog_recover_dquot_pass2 (log copy)");
3201         if (error)
3202                 return -EIO;
3203         ASSERT(dq_f->qlf_len == 1);
3204
3205         /*
3206          * At this point we are assuming that the dquots have been allocated
3207          * and hence the buffer has valid dquots stamped in it. It should,
3208          * therefore, pass verifier validation. If the dquot is bad, then the
3209          * we'll return an error here, so we don't need to specifically check
3210          * the dquot in the buffer after the verifier has run.
3211          */
3212         error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3213                                    XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3214                                    &xfs_dquot_buf_ops);
3215         if (error)
3216                 return error;
3217
3218         ASSERT(bp);
3219         ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3220
3221         /*
3222          * If the dquot has an LSN in it, recover the dquot only if it's less
3223          * than the lsn of the transaction we are replaying.
3224          */
3225         if (xfs_sb_version_hascrc(&mp->m_sb)) {
3226                 struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3227                 xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
3228
3229                 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3230                         goto out_release;
3231                 }
3232         }
3233
3234         memcpy(ddq, recddq, item->ri_buf[1].i_len);
3235         if (xfs_sb_version_hascrc(&mp->m_sb)) {
3236                 xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3237                                  XFS_DQUOT_CRC_OFF);
3238         }
3239
3240         ASSERT(dq_f->qlf_size == 2);
3241         ASSERT(bp->b_target->bt_mount == mp);
3242         bp->b_iodone = xlog_recover_iodone;
3243         xfs_buf_delwri_queue(bp, buffer_list);
3244
3245 out_release:
3246         xfs_buf_relse(bp);
3247         return 0;
3248 }
3249
3250 /*
3251  * This routine is called to create an in-core extent free intent
3252  * item from the efi format structure which was logged on disk.
3253  * It allocates an in-core efi, copies the extents from the format
3254  * structure into it, and adds the efi to the AIL with the given
3255  * LSN.
3256  */
3257 STATIC int
3258 xlog_recover_efi_pass2(
3259         struct xlog                     *log,
3260         struct xlog_recover_item        *item,
3261         xfs_lsn_t                       lsn)
3262 {
3263         int                             error;
3264         struct xfs_mount                *mp = log->l_mp;
3265         struct xfs_efi_log_item         *efip;
3266         struct xfs_efi_log_format       *efi_formatp;
3267
3268         efi_formatp = item->ri_buf[0].i_addr;
3269
3270         efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3271         error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3272         if (error) {
3273                 xfs_efi_item_free(efip);
3274                 return error;
3275         }
3276         atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3277
3278         spin_lock(&log->l_ailp->xa_lock);
3279         /*
3280          * The EFI has two references. One for the EFD and one for EFI to ensure
3281          * it makes it into the AIL. Insert the EFI into the AIL directly and
3282          * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3283          * AIL lock.
3284          */
3285         xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3286         xfs_efi_release(efip);
3287         return 0;
3288 }
3289
3290
3291 /*
3292  * This routine is called when an EFD format structure is found in a committed
3293  * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3294  * was still in the log. To do this it searches the AIL for the EFI with an id
3295  * equal to that in the EFD format structure. If we find it we drop the EFD
3296  * reference, which removes the EFI from the AIL and frees it.
3297  */
3298 STATIC int
3299 xlog_recover_efd_pass2(
3300         struct xlog                     *log,
3301         struct xlog_recover_item        *item)
3302 {
3303         xfs_efd_log_format_t    *efd_formatp;
3304         xfs_efi_log_item_t      *efip = NULL;
3305         xfs_log_item_t          *lip;
3306         __uint64_t              efi_id;
3307         struct xfs_ail_cursor   cur;
3308         struct xfs_ail          *ailp = log->l_ailp;
3309
3310         efd_formatp = item->ri_buf[0].i_addr;
3311         ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3312                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3313                (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3314                 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3315         efi_id = efd_formatp->efd_efi_id;
3316
3317         /*
3318          * Search for the EFI with the id in the EFD format structure in the
3319          * AIL.
3320          */
3321         spin_lock(&ailp->xa_lock);
3322         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3323         while (lip != NULL) {
3324                 if (lip->li_type == XFS_LI_EFI) {
3325                         efip = (xfs_efi_log_item_t *)lip;
3326                         if (efip->efi_format.efi_id == efi_id) {
3327                                 /*
3328                                  * Drop the EFD reference to the EFI. This
3329                                  * removes the EFI from the AIL and frees it.
3330                                  */
3331                                 spin_unlock(&ailp->xa_lock);
3332                                 xfs_efi_release(efip);
3333                                 spin_lock(&ailp->xa_lock);
3334                                 break;
3335                         }
3336                 }
3337                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3338         }
3339
3340         xfs_trans_ail_cursor_done(&cur);
3341         spin_unlock(&ailp->xa_lock);
3342
3343         return 0;
3344 }
3345
3346 /*
3347  * This routine is called when an inode create format structure is found in a
3348  * committed transaction in the log.  It's purpose is to initialise the inodes
3349  * being allocated on disk. This requires us to get inode cluster buffers that
3350  * match the range to be intialised, stamped with inode templates and written
3351  * by delayed write so that subsequent modifications will hit the cached buffer
3352  * and only need writing out at the end of recovery.
3353  */
3354 STATIC int
3355 xlog_recover_do_icreate_pass2(
3356         struct xlog             *log,
3357         struct list_head        *buffer_list,
3358         xlog_recover_item_t     *item)
3359 {
3360         struct xfs_mount        *mp = log->l_mp;
3361         struct xfs_icreate_log  *icl;
3362         xfs_agnumber_t          agno;
3363         xfs_agblock_t           agbno;
3364         unsigned int            count;
3365         unsigned int            isize;
3366         xfs_agblock_t           length;
3367         int                     blks_per_cluster;
3368         int                     bb_per_cluster;
3369         int                     cancel_count;
3370         int                     nbufs;
3371         int                     i;
3372
3373         icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3374         if (icl->icl_type != XFS_LI_ICREATE) {
3375                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3376                 return -EINVAL;
3377         }
3378
3379         if (icl->icl_size != 1) {
3380                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3381                 return -EINVAL;
3382         }
3383
3384         agno = be32_to_cpu(icl->icl_ag);
3385         if (agno >= mp->m_sb.sb_agcount) {
3386                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3387                 return -EINVAL;
3388         }
3389         agbno = be32_to_cpu(icl->icl_agbno);
3390         if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3391                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3392                 return -EINVAL;
3393         }
3394         isize = be32_to_cpu(icl->icl_isize);
3395         if (isize != mp->m_sb.sb_inodesize) {
3396                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3397                 return -EINVAL;
3398         }
3399         count = be32_to_cpu(icl->icl_count);
3400         if (!count) {
3401                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3402                 return -EINVAL;
3403         }
3404         length = be32_to_cpu(icl->icl_length);
3405         if (!length || length >= mp->m_sb.sb_agblocks) {
3406                 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3407                 return -EINVAL;
3408         }
3409
3410         /*
3411          * The inode chunk is either full or sparse and we only support
3412          * m_ialloc_min_blks sized sparse allocations at this time.
3413          */
3414         if (length != mp->m_ialloc_blks &&
3415             length != mp->m_ialloc_min_blks) {
3416                 xfs_warn(log->l_mp,
3417                          "%s: unsupported chunk length", __FUNCTION__);
3418                 return -EINVAL;
3419         }
3420
3421         /* verify inode count is consistent with extent length */
3422         if ((count >> mp->m_sb.sb_inopblog) != length) {
3423                 xfs_warn(log->l_mp,
3424                          "%s: inconsistent inode count and chunk length",
3425                          __FUNCTION__);
3426                 return -EINVAL;
3427         }
3428
3429         /*
3430          * The icreate transaction can cover multiple cluster buffers and these
3431          * buffers could have been freed and reused. Check the individual
3432          * buffers for cancellation so we don't overwrite anything written after
3433          * a cancellation.
3434          */
3435         blks_per_cluster = xfs_icluster_size_fsb(mp);
3436         bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
3437         nbufs = length / blks_per_cluster;
3438         for (i = 0, cancel_count = 0; i < nbufs; i++) {
3439                 xfs_daddr_t     daddr;
3440
3441                 daddr = XFS_AGB_TO_DADDR(mp, agno,
3442                                          agbno + i * blks_per_cluster);
3443                 if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3444                         cancel_count++;
3445         }
3446
3447         /*
3448          * We currently only use icreate for a single allocation at a time. This
3449          * means we should expect either all or none of the buffers to be
3450          * cancelled. Be conservative and skip replay if at least one buffer is
3451          * cancelled, but warn the user that something is awry if the buffers
3452          * are not consistent.
3453          *
3454          * XXX: This must be refined to only skip cancelled clusters once we use
3455          * icreate for multiple chunk allocations.
3456          */
3457         ASSERT(!cancel_count || cancel_count == nbufs);
3458         if (cancel_count) {
3459                 if (cancel_count != nbufs)
3460                         xfs_warn(mp,
3461         "WARNING: partial inode chunk cancellation, skipped icreate.");
3462                 trace_xfs_log_recover_icreate_cancel(log, icl);
3463                 return 0;
3464         }
3465
3466         trace_xfs_log_recover_icreate_recover(log, icl);
3467         return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3468                                      length, be32_to_cpu(icl->icl_gen));
3469 }
3470
3471 STATIC void
3472 xlog_recover_buffer_ra_pass2(
3473         struct xlog                     *log,
3474         struct xlog_recover_item        *item)
3475 {
3476         struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
3477         struct xfs_mount                *mp = log->l_mp;
3478
3479         if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3480                         buf_f->blf_len, buf_f->blf_flags)) {
3481                 return;
3482         }
3483
3484         xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3485                                 buf_f->blf_len, NULL);
3486 }
3487
3488 STATIC void
3489 xlog_recover_inode_ra_pass2(
3490         struct xlog                     *log,
3491         struct xlog_recover_item        *item)
3492 {
3493         struct xfs_inode_log_format     ilf_buf;
3494         struct xfs_inode_log_format     *ilfp;
3495         struct xfs_mount                *mp = log->l_mp;
3496         int                     error;
3497
3498         if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3499                 ilfp = item->ri_buf[0].i_addr;
3500         } else {
3501                 ilfp = &ilf_buf;
3502                 memset(ilfp, 0, sizeof(*ilfp));
3503                 error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3504                 if (error)
3505                         return;
3506         }
3507
3508         if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3509                 return;
3510
3511         xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3512                                 ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3513 }
3514
3515 STATIC void
3516 xlog_recover_dquot_ra_pass2(
3517         struct xlog                     *log,
3518         struct xlog_recover_item        *item)
3519 {
3520         struct xfs_mount        *mp = log->l_mp;
3521         struct xfs_disk_dquot   *recddq;
3522         struct xfs_dq_logformat *dq_f;
3523         uint                    type;
3524         int                     len;
3525
3526
3527         if (mp->m_qflags == 0)
3528                 return;
3529
3530         recddq = item->ri_buf[1].i_addr;
3531         if (recddq == NULL)
3532                 return;
3533         if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3534                 return;
3535
3536         type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3537         ASSERT(type);
3538         if (log->l_quotaoffs_flag & type)
3539                 return;
3540
3541         dq_f = item->ri_buf[0].i_addr;
3542         ASSERT(dq_f);
3543         ASSERT(dq_f->qlf_len == 1);
3544
3545         len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3546         if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3547                 return;
3548
3549         xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3550                           &xfs_dquot_buf_ra_ops);
3551 }
3552
3553 STATIC void
3554 xlog_recover_ra_pass2(
3555         struct xlog                     *log,
3556         struct xlog_recover_item        *item)
3557 {
3558         switch (ITEM_TYPE(item)) {
3559         case XFS_LI_BUF:
3560                 xlog_recover_buffer_ra_pass2(log, item);
3561                 break;
3562         case XFS_LI_INODE:
3563                 xlog_recover_inode_ra_pass2(log, item);
3564                 break;
3565         case XFS_LI_DQUOT:
3566                 xlog_recover_dquot_ra_pass2(log, item);
3567                 break;
3568         case XFS_LI_EFI:
3569         case XFS_LI_EFD:
3570         case XFS_LI_QUOTAOFF:
3571         default:
3572                 break;
3573         }
3574 }
3575
3576 STATIC int
3577 xlog_recover_commit_pass1(
3578         struct xlog                     *log,
3579         struct xlog_recover             *trans,
3580         struct xlog_recover_item        *item)
3581 {
3582         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
3583
3584         switch (ITEM_TYPE(item)) {
3585         case XFS_LI_BUF:
3586                 return xlog_recover_buffer_pass1(log, item);
3587         case XFS_LI_QUOTAOFF:
3588                 return xlog_recover_quotaoff_pass1(log, item);
3589         case XFS_LI_INODE:
3590         case XFS_LI_EFI:
3591         case XFS_LI_EFD:
3592         case XFS_LI_DQUOT:
3593         case XFS_LI_ICREATE:
3594                 /* nothing to do in pass 1 */
3595                 return 0;
3596         default:
3597                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3598                         __func__, ITEM_TYPE(item));
3599                 ASSERT(0);
3600                 return -EIO;
3601         }
3602 }
3603
3604 STATIC int
3605 xlog_recover_commit_pass2(
3606         struct xlog                     *log,
3607         struct xlog_recover             *trans,
3608         struct list_head                *buffer_list,
3609         struct xlog_recover_item        *item)
3610 {
3611         trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
3612
3613         switch (ITEM_TYPE(item)) {
3614         case XFS_LI_BUF:
3615                 return xlog_recover_buffer_pass2(log, buffer_list, item,
3616                                                  trans->r_lsn);
3617         case XFS_LI_INODE:
3618                 return xlog_recover_inode_pass2(log, buffer_list, item,
3619                                                  trans->r_lsn);
3620         case XFS_LI_EFI:
3621                 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3622         case XFS_LI_EFD:
3623                 return xlog_recover_efd_pass2(log, item);
3624         case XFS_LI_DQUOT:
3625                 return xlog_recover_dquot_pass2(log, buffer_list, item,
3626                                                 trans->r_lsn);
3627         case XFS_LI_ICREATE:
3628                 return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3629         case XFS_LI_QUOTAOFF:
3630                 /* nothing to do in pass2 */
3631                 return 0;
3632         default:
3633                 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3634                         __func__, ITEM_TYPE(item));
3635                 ASSERT(0);
3636                 return -EIO;
3637         }
3638 }
3639
3640 STATIC int
3641 xlog_recover_items_pass2(
3642         struct xlog                     *log,
3643         struct xlog_recover             *trans,
3644         struct list_head                *buffer_list,
3645         struct list_head                *item_list)
3646 {
3647         struct xlog_recover_item        *item;
3648         int                             error = 0;
3649
3650         list_for_each_entry(item, item_list, ri_list) {
3651                 error = xlog_recover_commit_pass2(log, trans,
3652                                           buffer_list, item);
3653                 if (error)
3654                         return error;
3655         }
3656
3657         return error;
3658 }
3659
3660 /*
3661  * Perform the transaction.
3662  *
3663  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
3664  * EFIs and EFDs get queued up by adding entries into the AIL for them.
3665  */
3666 STATIC int
3667 xlog_recover_commit_trans(
3668         struct xlog             *log,
3669         struct xlog_recover     *trans,
3670         int                     pass)
3671 {
3672         int                             error = 0;
3673         int                             error2;
3674         int                             items_queued = 0;
3675         struct xlog_recover_item        *item;
3676         struct xlog_recover_item        *next;
3677         LIST_HEAD                       (buffer_list);
3678         LIST_HEAD                       (ra_list);
3679         LIST_HEAD                       (done_list);
3680
3681         #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
3682
3683         hlist_del(&trans->r_list);
3684
3685         error = xlog_recover_reorder_trans(log, trans, pass);
3686         if (error)
3687                 return error;
3688
3689         list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3690                 switch (pass) {
3691                 case XLOG_RECOVER_PASS1:
3692                         error = xlog_recover_commit_pass1(log, trans, item);
3693                         break;
3694                 case XLOG_RECOVER_PASS2:
3695                         xlog_recover_ra_pass2(log, item);
3696                         list_move_tail(&item->ri_list, &ra_list);
3697                         items_queued++;
3698                         if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3699                                 error = xlog_recover_items_pass2(log, trans,
3700                                                 &buffer_list, &ra_list);
3701                                 list_splice_tail_init(&ra_list, &done_list);
3702                                 items_queued = 0;
3703                         }
3704
3705                         break;
3706                 default:
3707                         ASSERT(0);
3708                 }
3709
3710                 if (error)
3711                         goto out;
3712         }
3713
3714 out:
3715         if (!list_empty(&ra_list)) {
3716                 if (!error)
3717                         error = xlog_recover_items_pass2(log, trans,
3718                                         &buffer_list, &ra_list);
3719                 list_splice_tail_init(&ra_list, &done_list);
3720         }
3721
3722         if (!list_empty(&done_list))
3723                 list_splice_init(&done_list, &trans->r_itemq);
3724
3725         error2 = xfs_buf_delwri_submit(&buffer_list);
3726         return error ? error : error2;
3727 }
3728
3729 STATIC void
3730 xlog_recover_add_item(
3731         struct list_head        *head)
3732 {
3733         xlog_recover_item_t     *item;
3734
3735         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
3736         INIT_LIST_HEAD(&item->ri_list);
3737         list_add_tail(&item->ri_list, head);
3738 }
3739
3740 STATIC int
3741 xlog_recover_add_to_cont_trans(
3742         struct xlog             *log,
3743         struct xlog_recover     *trans,
3744         char                    *dp,
3745         int                     len)
3746 {
3747         xlog_recover_item_t     *item;
3748         char                    *ptr, *old_ptr;
3749         int                     old_len;
3750
3751         /*
3752          * If the transaction is empty, the header was split across this and the
3753          * previous record. Copy the rest of the header.
3754          */
3755         if (list_empty(&trans->r_itemq)) {
3756                 ASSERT(len <= sizeof(struct xfs_trans_header));
3757                 if (len > sizeof(struct xfs_trans_header)) {
3758                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
3759                         return -EIO;
3760                 }
3761
3762                 xlog_recover_add_item(&trans->r_itemq);
3763                 ptr = (char *)&trans->r_theader +
3764                                 sizeof(struct xfs_trans_header) - len;
3765                 memcpy(ptr, dp, len);
3766                 return 0;
3767         }
3768
3769         /* take the tail entry */
3770         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3771
3772         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
3773         old_len = item->ri_buf[item->ri_cnt-1].i_len;
3774
3775         ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
3776         memcpy(&ptr[old_len], dp, len);
3777         item->ri_buf[item->ri_cnt-1].i_len += len;
3778         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
3779         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
3780         return 0;
3781 }
3782
3783 /*
3784  * The next region to add is the start of a new region.  It could be
3785  * a whole region or it could be the first part of a new region.  Because
3786  * of this, the assumption here is that the type and size fields of all
3787  * format structures fit into the first 32 bits of the structure.
3788  *
3789  * This works because all regions must be 32 bit aligned.  Therefore, we
3790  * either have both fields or we have neither field.  In the case we have
3791  * neither field, the data part of the region is zero length.  We only have
3792  * a log_op_header and can throw away the header since a new one will appear
3793  * later.  If we have at least 4 bytes, then we can determine how many regions
3794  * will appear in the current log item.
3795  */
3796 STATIC int
3797 xlog_recover_add_to_trans(
3798         struct xlog             *log,
3799         struct xlog_recover     *trans,
3800         char                    *dp,
3801         int                     len)
3802 {
3803         xfs_inode_log_format_t  *in_f;                  /* any will do */
3804         xlog_recover_item_t     *item;
3805         char                    *ptr;
3806
3807         if (!len)
3808                 return 0;
3809         if (list_empty(&trans->r_itemq)) {
3810                 /* we need to catch log corruptions here */
3811                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
3812                         xfs_warn(log->l_mp, "%s: bad header magic number",
3813                                 __func__);
3814                         ASSERT(0);
3815                         return -EIO;
3816                 }
3817
3818                 if (len > sizeof(struct xfs_trans_header)) {
3819                         xfs_warn(log->l_mp, "%s: bad header length", __func__);
3820                         ASSERT(0);
3821                         return -EIO;
3822                 }
3823
3824                 /*
3825                  * The transaction header can be arbitrarily split across op
3826                  * records. If we don't have the whole thing here, copy what we
3827                  * do have and handle the rest in the next record.
3828                  */
3829                 if (len == sizeof(struct xfs_trans_header))
3830                         xlog_recover_add_item(&trans->r_itemq);
3831                 memcpy(&trans->r_theader, dp, len);
3832                 return 0;
3833         }
3834
3835         ptr = kmem_alloc(len, KM_SLEEP);
3836         memcpy(ptr, dp, len);
3837         in_f = (xfs_inode_log_format_t *)ptr;
3838
3839         /* take the tail entry */
3840         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3841         if (item->ri_total != 0 &&
3842              item->ri_total == item->ri_cnt) {
3843                 /* tail item is in use, get a new one */
3844                 xlog_recover_add_item(&trans->r_itemq);
3845                 item = list_entry(trans->r_itemq.prev,
3846                                         xlog_recover_item_t, ri_list);
3847         }
3848
3849         if (item->ri_total == 0) {              /* first region to be added */
3850                 if (in_f->ilf_size == 0 ||
3851                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
3852                         xfs_warn(log->l_mp,
3853                 "bad number of regions (%d) in inode log format",
3854                                   in_f->ilf_size);
3855                         ASSERT(0);
3856                         kmem_free(ptr);
3857                         return -EIO;
3858                 }
3859
3860                 item->ri_total = in_f->ilf_size;
3861                 item->ri_buf =
3862                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
3863                                     KM_SLEEP);
3864         }
3865         ASSERT(item->ri_total > item->ri_cnt);
3866         /* Description region is ri_buf[0] */
3867         item->ri_buf[item->ri_cnt].i_addr = ptr;
3868         item->ri_buf[item->ri_cnt].i_len  = len;
3869         item->ri_cnt++;
3870         trace_xfs_log_recover_item_add(log, trans, item, 0);
3871         return 0;
3872 }
3873
3874 /*
3875  * Free up any resources allocated by the transaction
3876  *
3877  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3878  */
3879 STATIC void
3880 xlog_recover_free_trans(
3881         struct xlog_recover     *trans)
3882 {
3883         xlog_recover_item_t     *item, *n;
3884         int                     i;
3885
3886         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3887                 /* Free the regions in the item. */
3888                 list_del(&item->ri_list);
3889                 for (i = 0; i < item->ri_cnt; i++)
3890                         kmem_free(item->ri_buf[i].i_addr);
3891                 /* Free the item itself */
3892                 kmem_free(item->ri_buf);
3893                 kmem_free(item);
3894         }
3895         /* Free the transaction recover structure */
3896         kmem_free(trans);
3897 }
3898
3899 /*
3900  * On error or completion, trans is freed.
3901  */
3902 STATIC int
3903 xlog_recovery_process_trans(
3904         struct xlog             *log,
3905         struct xlog_recover     *trans,
3906         char                    *dp,
3907         unsigned int            len,
3908         unsigned int            flags,
3909         int                     pass)
3910 {
3911         int                     error = 0;
3912         bool                    freeit = false;
3913
3914         /* mask off ophdr transaction container flags */
3915         flags &= ~XLOG_END_TRANS;
3916         if (flags & XLOG_WAS_CONT_TRANS)
3917                 flags &= ~XLOG_CONTINUE_TRANS;
3918
3919         /*
3920          * Callees must not free the trans structure. We'll decide if we need to
3921          * free it or not based on the operation being done and it's result.
3922          */
3923         switch (flags) {
3924         /* expected flag values */
3925         case 0:
3926         case XLOG_CONTINUE_TRANS:
3927                 error = xlog_recover_add_to_trans(log, trans, dp, len);
3928                 break;
3929         case XLOG_WAS_CONT_TRANS:
3930                 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
3931                 break;
3932         case XLOG_COMMIT_TRANS:
3933                 error = xlog_recover_commit_trans(log, trans, pass);
3934                 /* success or fail, we are now done with this transaction. */
3935                 freeit = true;
3936                 break;
3937
3938         /* unexpected flag values */
3939         case XLOG_UNMOUNT_TRANS:
3940                 /* just skip trans */
3941                 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
3942                 freeit = true;
3943                 break;
3944         case XLOG_START_TRANS:
3945         default:
3946                 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
3947                 ASSERT(0);
3948                 error = -EIO;
3949                 break;
3950         }
3951         if (error || freeit)
3952                 xlog_recover_free_trans(trans);
3953         return error;
3954 }
3955
3956 /*
3957  * Lookup the transaction recovery structure associated with the ID in the
3958  * current ophdr. If the transaction doesn't exist and the start flag is set in
3959  * the ophdr, then allocate a new transaction for future ID matches to find.
3960  * Either way, return what we found during the lookup - an existing transaction
3961  * or nothing.
3962  */
3963 STATIC struct xlog_recover *
3964 xlog_recover_ophdr_to_trans(
3965         struct hlist_head       rhash[],
3966         struct xlog_rec_header  *rhead,
3967         struct xlog_op_header   *ohead)
3968 {
3969         struct xlog_recover     *trans;
3970         xlog_tid_t              tid;
3971         struct hlist_head       *rhp;
3972
3973         tid = be32_to_cpu(ohead->oh_tid);
3974         rhp = &rhash[XLOG_RHASH(tid)];
3975         hlist_for_each_entry(trans, rhp, r_list) {
3976                 if (trans->r_log_tid == tid)
3977                         return trans;
3978         }
3979
3980         /*
3981          * skip over non-start transaction headers - we could be
3982          * processing slack space before the next transaction starts
3983          */
3984         if (!(ohead->oh_flags & XLOG_START_TRANS))
3985                 return NULL;
3986
3987         ASSERT(be32_to_cpu(ohead->oh_len) == 0);
3988
3989         /*
3990          * This is a new transaction so allocate a new recovery container to
3991          * hold the recovery ops that will follow.
3992          */
3993         trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
3994         trans->r_log_tid = tid;
3995         trans->r_lsn = be64_to_cpu(rhead->h_lsn);
3996         INIT_LIST_HEAD(&trans->r_itemq);
3997         INIT_HLIST_NODE(&trans->r_list);
3998         hlist_add_head(&trans->r_list, rhp);
3999
4000         /*
4001          * Nothing more to do for this ophdr. Items to be added to this new
4002          * transaction will be in subsequent ophdr containers.
4003          */
4004         return NULL;
4005 }
4006
4007 STATIC int
4008 xlog_recover_process_ophdr(
4009         struct xlog             *log,
4010         struct hlist_head       rhash[],
4011         struct xlog_rec_header  *rhead,
4012         struct xlog_op_header   *ohead,
4013         char                    *dp,
4014         char                    *end,
4015         int                     pass)
4016 {
4017         struct xlog_recover     *trans;
4018         unsigned int            len;
4019
4020         /* Do we understand who wrote this op? */
4021         if (ohead->oh_clientid != XFS_TRANSACTION &&
4022             ohead->oh_clientid != XFS_LOG) {
4023                 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
4024                         __func__, ohead->oh_clientid);
4025                 ASSERT(0);
4026                 return -EIO;
4027         }
4028
4029         /*
4030          * Check the ophdr contains all the data it is supposed to contain.
4031          */
4032         len = be32_to_cpu(ohead->oh_len);
4033         if (dp + len > end) {
4034                 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
4035                 WARN_ON(1);
4036                 return -EIO;
4037         }
4038
4039         trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
4040         if (!trans) {
4041                 /* nothing to do, so skip over this ophdr */
4042                 return 0;
4043         }
4044
4045         return xlog_recovery_process_trans(log, trans, dp, len,
4046                                            ohead->oh_flags, pass);
4047 }
4048
4049 /*
4050  * There are two valid states of the r_state field.  0 indicates that the
4051  * transaction structure is in a normal state.  We have either seen the
4052  * start of the transaction or the last operation we added was not a partial
4053  * operation.  If the last operation we added to the transaction was a
4054  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
4055  *
4056  * NOTE: skip LRs with 0 data length.
4057  */
4058 STATIC int
4059 xlog_recover_process_data(
4060         struct xlog             *log,
4061         struct hlist_head       rhash[],
4062         struct xlog_rec_header  *rhead,
4063         char                    *dp,
4064         int                     pass)
4065 {
4066         struct xlog_op_header   *ohead;
4067         char                    *end;
4068         int                     num_logops;
4069         int                     error;
4070
4071         end = dp + be32_to_cpu(rhead->h_len);
4072         num_logops = be32_to_cpu(rhead->h_num_logops);
4073
4074         /* check the log format matches our own - else we can't recover */
4075         if (xlog_header_check_recover(log->l_mp, rhead))
4076                 return -EIO;
4077
4078         while ((dp < end) && num_logops) {
4079
4080                 ohead = (struct xlog_op_header *)dp;
4081                 dp += sizeof(*ohead);
4082                 ASSERT(dp <= end);
4083
4084                 /* errors will abort recovery */
4085                 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4086                                                     dp, end, pass);
4087                 if (error)
4088                         return error;
4089
4090                 dp += be32_to_cpu(ohead->oh_len);
4091                 num_logops--;
4092         }
4093         return 0;
4094 }
4095
4096 /*
4097  * Process an extent free intent item that was recovered from
4098  * the log.  We need to free the extents that it describes.
4099  */
4100 STATIC int
4101 xlog_recover_process_efi(
4102         xfs_mount_t             *mp,
4103         xfs_efi_log_item_t      *efip)
4104 {
4105         xfs_efd_log_item_t      *efdp;
4106         xfs_trans_t             *tp;
4107         int                     i;
4108         int                     error = 0;
4109         xfs_extent_t            *extp;
4110         xfs_fsblock_t           startblock_fsb;
4111
4112         ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
4113
4114         /*
4115          * First check the validity of the extents described by the
4116          * EFI.  If any are bad, then assume that all are bad and
4117          * just toss the EFI.
4118          */
4119         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
4120                 extp = &(efip->efi_format.efi_extents[i]);
4121                 startblock_fsb = XFS_BB_TO_FSB(mp,
4122                                    XFS_FSB_TO_DADDR(mp, extp->ext_start));
4123                 if ((startblock_fsb == 0) ||
4124                     (extp->ext_len == 0) ||
4125                     (startblock_fsb >= mp->m_sb.sb_dblocks) ||
4126                     (extp->ext_len >= mp->m_sb.sb_agblocks)) {
4127                         /*
4128                          * This will pull the EFI from the AIL and
4129                          * free the memory associated with it.
4130                          */
4131                         set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
4132                         xfs_efi_release(efip);
4133                         return -EIO;
4134                 }
4135         }
4136
4137         tp = xfs_trans_alloc(mp, 0);
4138         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
4139         if (error)
4140                 goto abort_error;
4141         efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
4142
4143         for (i = 0; i < efip->efi_format.efi_nextents; i++) {
4144                 extp = &(efip->efi_format.efi_extents[i]);
4145                 error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
4146                                               extp->ext_len);
4147                 if (error)
4148                         goto abort_error;
4149
4150         }
4151
4152         set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
4153         error = xfs_trans_commit(tp);
4154         return error;
4155
4156 abort_error:
4157         xfs_trans_cancel(tp);
4158         return error;
4159 }
4160
4161 /*
4162  * When this is called, all of the EFIs which did not have
4163  * corresponding EFDs should be in the AIL.  What we do now
4164  * is free the extents associated with each one.
4165  *
4166  * Since we process the EFIs in normal transactions, they
4167  * will be removed at some point after the commit.  This prevents
4168  * us from just walking down the list processing each one.
4169  * We'll use a flag in the EFI to skip those that we've already
4170  * processed and use the AIL iteration mechanism's generation
4171  * count to try to speed this up at least a bit.
4172  *
4173  * When we start, we know that the EFIs are the only things in
4174  * the AIL.  As we process them, however, other items are added
4175  * to the AIL.  Since everything added to the AIL must come after
4176  * everything already in the AIL, we stop processing as soon as
4177  * we see something other than an EFI in the AIL.
4178  */
4179 STATIC int
4180 xlog_recover_process_efis(
4181         struct xlog             *log)
4182 {
4183         struct xfs_log_item     *lip;
4184         struct xfs_efi_log_item *efip;
4185         int                     error = 0;
4186         struct xfs_ail_cursor   cur;
4187         struct xfs_ail          *ailp;
4188
4189         ailp = log->l_ailp;
4190         spin_lock(&ailp->xa_lock);
4191         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4192         while (lip != NULL) {
4193                 /*
4194                  * We're done when we see something other than an EFI.
4195                  * There should be no EFIs left in the AIL now.
4196                  */
4197                 if (lip->li_type != XFS_LI_EFI) {
4198 #ifdef DEBUG
4199                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4200                                 ASSERT(lip->li_type != XFS_LI_EFI);
4201 #endif
4202                         break;
4203                 }
4204
4205                 /*
4206                  * Skip EFIs that we've already processed.
4207                  */
4208                 efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4209                 if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
4210                         lip = xfs_trans_ail_cursor_next(ailp, &cur);
4211                         continue;
4212                 }
4213
4214                 spin_unlock(&ailp->xa_lock);
4215                 error = xlog_recover_process_efi(log->l_mp, efip);
4216                 spin_lock(&ailp->xa_lock);
4217                 if (error)
4218                         goto out;
4219                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
4220         }
4221 out:
4222         xfs_trans_ail_cursor_done(&cur);
4223         spin_unlock(&ailp->xa_lock);
4224         return error;
4225 }
4226
4227 /*
4228  * A cancel occurs when the mount has failed and we're bailing out. Release all
4229  * pending EFIs so they don't pin the AIL.
4230  */
4231 STATIC int
4232 xlog_recover_cancel_efis(
4233         struct xlog             *log)
4234 {
4235         struct xfs_log_item     *lip;
4236         struct xfs_efi_log_item *efip;
4237         int                     error = 0;
4238         struct xfs_ail_cursor   cur;
4239         struct xfs_ail          *ailp;
4240
4241         ailp = log->l_ailp;
4242         spin_lock(&ailp->xa_lock);
4243         lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4244         while (lip != NULL) {
4245                 /*
4246                  * We're done when we see something other than an EFI.
4247                  * There should be no EFIs left in the AIL now.
4248                  */
4249                 if (lip->li_type != XFS_LI_EFI) {
4250 #ifdef DEBUG
4251                         for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4252                                 ASSERT(lip->li_type != XFS_LI_EFI);
4253 #endif
4254                         break;
4255                 }
4256
4257                 efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4258
4259                 spin_unlock(&ailp->xa_lock);
4260                 xfs_efi_release(efip);
4261                 spin_lock(&ailp->xa_lock);
4262
4263                 lip = xfs_trans_ail_cursor_next(ailp, &cur);
4264         }
4265
4266         xfs_trans_ail_cursor_done(&cur);
4267         spin_unlock(&ailp->xa_lock);
4268         return error;
4269 }
4270
4271 /*
4272  * This routine performs a transaction to null out a bad inode pointer
4273  * in an agi unlinked inode hash bucket.
4274  */
4275 STATIC void
4276 xlog_recover_clear_agi_bucket(
4277         xfs_mount_t     *mp,
4278         xfs_agnumber_t  agno,
4279         int             bucket)
4280 {
4281         xfs_trans_t     *tp;
4282         xfs_agi_t       *agi;
4283         xfs_buf_t       *agibp;
4284         int             offset;
4285         int             error;
4286
4287         tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
4288         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
4289         if (error)
4290                 goto out_abort;
4291
4292         error = xfs_read_agi(mp, tp, agno, &agibp);
4293         if (error)
4294                 goto out_abort;
4295
4296         agi = XFS_BUF_TO_AGI(agibp);
4297         agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
4298         offset = offsetof(xfs_agi_t, agi_unlinked) +
4299                  (sizeof(xfs_agino_t) * bucket);
4300         xfs_trans_log_buf(tp, agibp, offset,
4301                           (offset + sizeof(xfs_agino_t) - 1));
4302
4303         error = xfs_trans_commit(tp);
4304         if (error)
4305                 goto out_error;
4306         return;
4307
4308 out_abort:
4309         xfs_trans_cancel(tp);
4310 out_error:
4311         xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
4312         return;
4313 }
4314
4315 STATIC xfs_agino_t
4316 xlog_recover_process_one_iunlink(
4317         struct xfs_mount                *mp,
4318         xfs_agnumber_t                  agno,
4319         xfs_agino_t                     agino,
4320         int                             bucket)
4321 {
4322         struct xfs_buf                  *ibp;
4323         struct xfs_dinode               *dip;
4324         struct xfs_inode                *ip;
4325         xfs_ino_t                       ino;
4326         int                             error;
4327
4328         ino = XFS_AGINO_TO_INO(mp, agno, agino);
4329         error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
4330         if (error)
4331                 goto fail;
4332
4333         /*
4334          * Get the on disk inode to find the next inode in the bucket.
4335          */
4336         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
4337         if (error)
4338                 goto fail_iput;
4339
4340         ASSERT(ip->i_d.di_nlink == 0);
4341         ASSERT(ip->i_d.di_mode != 0);
4342
4343         /* setup for the next pass */
4344         agino = be32_to_cpu(dip->di_next_unlinked);
4345         xfs_buf_relse(ibp);
4346
4347         /*
4348          * Prevent any DMAPI event from being sent when the reference on
4349          * the inode is dropped.
4350          */
4351         ip->i_d.di_dmevmask = 0;
4352
4353         IRELE(ip);
4354         return agino;
4355
4356  fail_iput:
4357         IRELE(ip);
4358  fail:
4359         /*
4360          * We can't read in the inode this bucket points to, or this inode
4361          * is messed up.  Just ditch this bucket of inodes.  We will lose
4362          * some inodes and space, but at least we won't hang.
4363          *
4364          * Call xlog_recover_clear_agi_bucket() to perform a transaction to
4365          * clear the inode pointer in the bucket.
4366          */
4367         xlog_recover_clear_agi_bucket(mp, agno, bucket);
4368         return NULLAGINO;
4369 }
4370
4371 /*
4372  * xlog_iunlink_recover
4373  *
4374  * This is called during recovery to process any inodes which
4375  * we unlinked but not freed when the system crashed.  These
4376  * inodes will be on the lists in the AGI blocks.  What we do
4377  * here is scan all the AGIs and fully truncate and free any
4378  * inodes found on the lists.  Each inode is removed from the
4379  * lists when it has been fully truncated and is freed.  The
4380  * freeing of the inode and its removal from the list must be
4381  * atomic.
4382  */
4383 STATIC void
4384 xlog_recover_process_iunlinks(
4385         struct xlog     *log)
4386 {
4387         xfs_mount_t     *mp;
4388         xfs_agnumber_t  agno;
4389         xfs_agi_t       *agi;
4390         xfs_buf_t       *agibp;
4391         xfs_agino_t     agino;
4392         int             bucket;
4393         int             error;
4394         uint            mp_dmevmask;
4395
4396         mp = log->l_mp;
4397
4398         /*
4399          * Prevent any DMAPI event from being sent while in this function.
4400          */
4401         mp_dmevmask = mp->m_dmevmask;
4402         mp->m_dmevmask = 0;
4403
4404         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4405                 /*
4406                  * Find the agi for this ag.
4407                  */
4408                 error = xfs_read_agi(mp, NULL, agno, &agibp);
4409                 if (error) {
4410                         /*
4411                          * AGI is b0rked. Don't process it.
4412                          *
4413                          * We should probably mark the filesystem as corrupt
4414                          * after we've recovered all the ag's we can....
4415                          */
4416                         continue;
4417                 }
4418                 /*
4419                  * Unlock the buffer so that it can be acquired in the normal
4420                  * course of the transaction to truncate and free each inode.
4421                  * Because we are not racing with anyone else here for the AGI
4422                  * buffer, we don't even need to hold it locked to read the
4423                  * initial unlinked bucket entries out of the buffer. We keep
4424                  * buffer reference though, so that it stays pinned in memory
4425                  * while we need the buffer.
4426                  */
4427                 agi = XFS_BUF_TO_AGI(agibp);
4428                 xfs_buf_unlock(agibp);
4429
4430                 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
4431                         agino = be32_to_cpu(agi->agi_unlinked[bucket]);
4432                         while (agino != NULLAGINO) {
4433                                 agino = xlog_recover_process_one_iunlink(mp,
4434                                                         agno, agino, bucket);
4435                         }
4436                 }
4437                 xfs_buf_rele(agibp);
4438         }
4439
4440         mp->m_dmevmask = mp_dmevmask;
4441 }
4442
4443 STATIC int
4444 xlog_unpack_data(
4445         struct xlog_rec_header  *rhead,
4446         char                    *dp,
4447         struct xlog             *log)
4448 {
4449         int                     i, j, k;
4450
4451         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
4452                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
4453                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
4454                 dp += BBSIZE;
4455         }
4456
4457         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4458                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
4459                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
4460                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4461                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4462                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
4463                         dp += BBSIZE;
4464                 }
4465         }
4466
4467         return 0;
4468 }
4469
4470 /*
4471  * CRC check, unpack and process a log record.
4472  */
4473 STATIC int
4474 xlog_recover_process(
4475         struct xlog             *log,
4476         struct hlist_head       rhash[],
4477         struct xlog_rec_header  *rhead,
4478         char                    *dp,
4479         int                     pass)
4480 {
4481         int                     error;
4482         __le32                  crc;
4483
4484         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
4485
4486         /*
4487          * Nothing else to do if this is a CRC verification pass. Just return
4488          * if this a record with a non-zero crc. Unfortunately, mkfs always
4489          * sets h_crc to 0 so we must consider this valid even on v5 supers.
4490          * Otherwise, return EFSBADCRC on failure so the callers up the stack
4491          * know precisely what failed.
4492          */
4493         if (pass == XLOG_RECOVER_CRCPASS) {
4494                 if (rhead->h_crc && crc != rhead->h_crc)
4495                         return -EFSBADCRC;
4496                 return 0;
4497         }
4498
4499         /*
4500          * We're in the normal recovery path. Issue a warning if and only if the
4501          * CRC in the header is non-zero. This is an advisory warning and the
4502          * zero CRC check prevents warnings from being emitted when upgrading
4503          * the kernel from one that does not add CRCs by default.
4504          */
4505         if (crc != rhead->h_crc) {
4506                 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
4507                         xfs_alert(log->l_mp,
4508                 "log record CRC mismatch: found 0x%x, expected 0x%x.",
4509                                         le32_to_cpu(rhead->h_crc),
4510                                         le32_to_cpu(crc));
4511                         xfs_hex_dump(dp, 32);
4512                 }
4513
4514                 /*
4515                  * If the filesystem is CRC enabled, this mismatch becomes a
4516                  * fatal log corruption failure.
4517                  */
4518                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
4519                         return -EFSCORRUPTED;
4520         }
4521
4522         error = xlog_unpack_data(rhead, dp, log);
4523         if (error)
4524                 return error;
4525
4526         return xlog_recover_process_data(log, rhash, rhead, dp, pass);
4527 }
4528
4529 STATIC int
4530 xlog_valid_rec_header(
4531         struct xlog             *log,
4532         struct xlog_rec_header  *rhead,
4533         xfs_daddr_t             blkno)
4534 {
4535         int                     hlen;
4536
4537         if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
4538                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
4539                                 XFS_ERRLEVEL_LOW, log->l_mp);
4540                 return -EFSCORRUPTED;
4541         }
4542         if (unlikely(
4543             (!rhead->h_version ||
4544             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
4545                 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
4546                         __func__, be32_to_cpu(rhead->h_version));
4547                 return -EIO;
4548         }
4549
4550         /* LR body must have data or it wouldn't have been written */
4551         hlen = be32_to_cpu(rhead->h_len);
4552         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
4553                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
4554                                 XFS_ERRLEVEL_LOW, log->l_mp);
4555                 return -EFSCORRUPTED;
4556         }
4557         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
4558                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
4559                                 XFS_ERRLEVEL_LOW, log->l_mp);
4560                 return -EFSCORRUPTED;
4561         }
4562         return 0;
4563 }
4564
4565 /*
4566  * Read the log from tail to head and process the log records found.
4567  * Handle the two cases where the tail and head are in the same cycle
4568  * and where the active portion of the log wraps around the end of
4569  * the physical log separately.  The pass parameter is passed through
4570  * to the routines called to process the data and is not looked at
4571  * here.
4572  */
4573 STATIC int
4574 xlog_do_recovery_pass(
4575         struct xlog             *log,
4576         xfs_daddr_t             head_blk,
4577         xfs_daddr_t             tail_blk,
4578         int                     pass,
4579         xfs_daddr_t             *first_bad)     /* out: first bad log rec */
4580 {
4581         xlog_rec_header_t       *rhead;
4582         xfs_daddr_t             blk_no;
4583         xfs_daddr_t             rhead_blk;
4584         char                    *offset;
4585         xfs_buf_t               *hbp, *dbp;
4586         int                     error = 0, h_size, h_len;
4587         int                     bblks, split_bblks;
4588         int                     hblks, split_hblks, wrapped_hblks;
4589         struct hlist_head       rhash[XLOG_RHASH_SIZE];
4590
4591         ASSERT(head_blk != tail_blk);
4592         rhead_blk = 0;
4593
4594         /*
4595          * Read the header of the tail block and get the iclog buffer size from
4596          * h_size.  Use this to tell how many sectors make up the log header.
4597          */
4598         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4599                 /*
4600                  * When using variable length iclogs, read first sector of
4601                  * iclog header and extract the header size from it.  Get a
4602                  * new hbp that is the correct size.
4603                  */
4604                 hbp = xlog_get_bp(log, 1);
4605                 if (!hbp)
4606                         return -ENOMEM;
4607
4608                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
4609                 if (error)
4610                         goto bread_err1;
4611
4612                 rhead = (xlog_rec_header_t *)offset;
4613                 error = xlog_valid_rec_header(log, rhead, tail_blk);
4614                 if (error)
4615                         goto bread_err1;
4616
4617                 /*
4618                  * xfsprogs has a bug where record length is based on lsunit but
4619                  * h_size (iclog size) is hardcoded to 32k. Now that we
4620                  * unconditionally CRC verify the unmount record, this means the
4621                  * log buffer can be too small for the record and cause an
4622                  * overrun.
4623                  *
4624                  * Detect this condition here. Use lsunit for the buffer size as
4625                  * long as this looks like the mkfs case. Otherwise, return an
4626                  * error to avoid a buffer overrun.
4627                  */
4628                 h_size = be32_to_cpu(rhead->h_size);
4629                 h_len = be32_to_cpu(rhead->h_len);
4630                 if (h_len > h_size) {
4631                         if (h_len <= log->l_mp->m_logbsize &&
4632                             be32_to_cpu(rhead->h_num_logops) == 1) {
4633                                 xfs_warn(log->l_mp,
4634                 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
4635                                          h_size, log->l_mp->m_logbsize);
4636                                 h_size = log->l_mp->m_logbsize;
4637                         } else
4638                                 return -EFSCORRUPTED;
4639                 }
4640
4641                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
4642                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
4643                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
4644                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
4645                                 hblks++;
4646                         xlog_put_bp(hbp);
4647                         hbp = xlog_get_bp(log, hblks);
4648                 } else {
4649                         hblks = 1;
4650                 }
4651         } else {
4652                 ASSERT(log->l_sectBBsize == 1);
4653                 hblks = 1;
4654                 hbp = xlog_get_bp(log, 1);
4655                 h_size = XLOG_BIG_RECORD_BSIZE;
4656         }
4657
4658         if (!hbp)
4659                 return -ENOMEM;
4660         dbp = xlog_get_bp(log, BTOBB(h_size));
4661         if (!dbp) {
4662                 xlog_put_bp(hbp);
4663                 return -ENOMEM;
4664         }
4665
4666         memset(rhash, 0, sizeof(rhash));
4667         blk_no = rhead_blk = tail_blk;
4668         if (tail_blk > head_blk) {
4669                 /*
4670                  * Perform recovery around the end of the physical log.
4671                  * When the head is not on the same cycle number as the tail,
4672                  * we can't do a sequential recovery.
4673                  */
4674                 while (blk_no < log->l_logBBsize) {
4675                         /*
4676                          * Check for header wrapping around physical end-of-log
4677                          */
4678                         offset = hbp->b_addr;
4679                         split_hblks = 0;
4680                         wrapped_hblks = 0;
4681                         if (blk_no + hblks <= log->l_logBBsize) {
4682                                 /* Read header in one read */
4683                                 error = xlog_bread(log, blk_no, hblks, hbp,
4684                                                    &offset);
4685                                 if (error)
4686                                         goto bread_err2;
4687                         } else {
4688                                 /* This LR is split across physical log end */
4689                                 if (blk_no != log->l_logBBsize) {
4690                                         /* some data before physical log end */
4691                                         ASSERT(blk_no <= INT_MAX);
4692                                         split_hblks = log->l_logBBsize - (int)blk_no;
4693                                         ASSERT(split_hblks > 0);
4694                                         error = xlog_bread(log, blk_no,
4695                                                            split_hblks, hbp,
4696                                                            &offset);
4697                                         if (error)
4698                                                 goto bread_err2;
4699                                 }
4700
4701                                 /*
4702                                  * Note: this black magic still works with
4703                                  * large sector sizes (non-512) only because:
4704                                  * - we increased the buffer size originally
4705                                  *   by 1 sector giving us enough extra space
4706                                  *   for the second read;
4707                                  * - the log start is guaranteed to be sector
4708                                  *   aligned;
4709                                  * - we read the log end (LR header start)
4710                                  *   _first_, then the log start (LR header end)
4711                                  *   - order is important.
4712                                  */
4713                                 wrapped_hblks = hblks - split_hblks;
4714                                 error = xlog_bread_offset(log, 0,
4715                                                 wrapped_hblks, hbp,
4716                                                 offset + BBTOB(split_hblks));
4717                                 if (error)
4718                                         goto bread_err2;
4719                         }
4720                         rhead = (xlog_rec_header_t *)offset;
4721                         error = xlog_valid_rec_header(log, rhead,
4722                                                 split_hblks ? blk_no : 0);
4723                         if (error)
4724                                 goto bread_err2;
4725
4726                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4727                         blk_no += hblks;
4728
4729                         /* Read in data for log record */
4730                         if (blk_no + bblks <= log->l_logBBsize) {
4731                                 error = xlog_bread(log, blk_no, bblks, dbp,
4732                                                    &offset);
4733                                 if (error)
4734                                         goto bread_err2;
4735                         } else {
4736                                 /* This log record is split across the
4737                                  * physical end of log */
4738                                 offset = dbp->b_addr;
4739                                 split_bblks = 0;
4740                                 if (blk_no != log->l_logBBsize) {
4741                                         /* some data is before the physical
4742                                          * end of log */
4743                                         ASSERT(!wrapped_hblks);
4744                                         ASSERT(blk_no <= INT_MAX);
4745                                         split_bblks =
4746                                                 log->l_logBBsize - (int)blk_no;
4747                                         ASSERT(split_bblks > 0);
4748                                         error = xlog_bread(log, blk_no,
4749                                                         split_bblks, dbp,
4750                                                         &offset);
4751                                         if (error)
4752                                                 goto bread_err2;
4753                                 }
4754
4755                                 /*
4756                                  * Note: this black magic still works with
4757                                  * large sector sizes (non-512) only because:
4758                                  * - we increased the buffer size originally
4759                                  *   by 1 sector giving us enough extra space
4760                                  *   for the second read;
4761                                  * - the log start is guaranteed to be sector
4762                                  *   aligned;
4763                                  * - we read the log end (LR header start)
4764                                  *   _first_, then the log start (LR header end)
4765                                  *   - order is important.
4766                                  */
4767                                 error = xlog_bread_offset(log, 0,
4768                                                 bblks - split_bblks, dbp,
4769                                                 offset + BBTOB(split_bblks));
4770                                 if (error)
4771                                         goto bread_err2;
4772                         }
4773
4774                         error = xlog_recover_process(log, rhash, rhead, offset,
4775                                                      pass);
4776                         if (error)
4777                                 goto bread_err2;
4778
4779                         blk_no += bblks;
4780                         rhead_blk = blk_no;
4781                 }
4782
4783                 ASSERT(blk_no >= log->l_logBBsize);
4784                 blk_no -= log->l_logBBsize;
4785                 rhead_blk = blk_no;
4786         }
4787
4788         /* read first part of physical log */
4789         while (blk_no < head_blk) {
4790                 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4791                 if (error)
4792                         goto bread_err2;
4793
4794                 rhead = (xlog_rec_header_t *)offset;
4795                 error = xlog_valid_rec_header(log, rhead, blk_no);
4796                 if (error)
4797                         goto bread_err2;
4798
4799                 /* blocks in data section */
4800                 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4801                 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
4802                                    &offset);
4803                 if (error)
4804                         goto bread_err2;
4805
4806                 error = xlog_recover_process(log, rhash, rhead, offset, pass);
4807                 if (error)
4808                         goto bread_err2;
4809
4810                 blk_no += bblks + hblks;
4811                 rhead_blk = blk_no;
4812         }
4813
4814  bread_err2:
4815         xlog_put_bp(dbp);
4816  bread_err1:
4817         xlog_put_bp(hbp);
4818
4819         if (error && first_bad)
4820                 *first_bad = rhead_blk;
4821
4822         return error;
4823 }
4824
4825 /*
4826  * Do the recovery of the log.  We actually do this in two phases.
4827  * The two passes are necessary in order to implement the function
4828  * of cancelling a record written into the log.  The first pass
4829  * determines those things which have been cancelled, and the
4830  * second pass replays log items normally except for those which
4831  * have been cancelled.  The handling of the replay and cancellations
4832  * takes place in the log item type specific routines.
4833  *
4834  * The table of items which have cancel records in the log is allocated
4835  * and freed at this level, since only here do we know when all of
4836  * the log recovery has been completed.
4837  */
4838 STATIC int
4839 xlog_do_log_recovery(
4840         struct xlog     *log,
4841         xfs_daddr_t     head_blk,
4842         xfs_daddr_t     tail_blk)
4843 {
4844         int             error, i;
4845
4846         ASSERT(head_blk != tail_blk);
4847
4848         /*
4849          * First do a pass to find all of the cancelled buf log items.
4850          * Store them in the buf_cancel_table for use in the second pass.
4851          */
4852         log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
4853                                                  sizeof(struct list_head),
4854                                                  KM_SLEEP);
4855         for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4856                 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
4857
4858         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4859                                       XLOG_RECOVER_PASS1, NULL);
4860         if (error != 0) {
4861                 kmem_free(log->l_buf_cancel_table);
4862                 log->l_buf_cancel_table = NULL;
4863                 return error;
4864         }
4865         /*
4866          * Then do a second pass to actually recover the items in the log.
4867          * When it is complete free the table of buf cancel items.
4868          */
4869         error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4870                                       XLOG_RECOVER_PASS2, NULL);
4871 #ifdef DEBUG
4872         if (!error) {
4873                 int     i;
4874
4875                 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4876                         ASSERT(list_empty(&log->l_buf_cancel_table[i]));
4877         }
4878 #endif  /* DEBUG */
4879
4880         kmem_free(log->l_buf_cancel_table);
4881         log->l_buf_cancel_table = NULL;
4882
4883         return error;
4884 }
4885
4886 /*
4887  * Do the actual recovery
4888  */
4889 STATIC int
4890 xlog_do_recover(
4891         struct xlog     *log,
4892         xfs_daddr_t     head_blk,
4893         xfs_daddr_t     tail_blk)
4894 {
4895         int             error;
4896         xfs_buf_t       *bp;
4897         xfs_sb_t        *sbp;
4898
4899         /*
4900          * First replay the images in the log.
4901          */
4902         error = xlog_do_log_recovery(log, head_blk, tail_blk);
4903         if (error)
4904                 return error;
4905
4906         /*
4907          * If IO errors happened during recovery, bail out.
4908          */
4909         if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4910                 return -EIO;
4911         }
4912
4913         /*
4914          * We now update the tail_lsn since much of the recovery has completed
4915          * and there may be space available to use.  If there were no extent
4916          * or iunlinks, we can free up the entire log and set the tail_lsn to
4917          * be the last_sync_lsn.  This was set in xlog_find_tail to be the
4918          * lsn of the last known good LR on disk.  If there are extent frees
4919          * or iunlinks they will have some entries in the AIL; so we look at
4920          * the AIL to determine how to set the tail_lsn.
4921          */
4922         xlog_assign_tail_lsn(log->l_mp);
4923
4924         /*
4925          * Now that we've finished replaying all buffer and inode
4926          * updates, re-read in the superblock and reverify it.
4927          */
4928         bp = xfs_getsb(log->l_mp, 0);
4929         XFS_BUF_UNDONE(bp);
4930         ASSERT(!(XFS_BUF_ISWRITE(bp)));
4931         XFS_BUF_READ(bp);
4932         XFS_BUF_UNASYNC(bp);
4933         bp->b_ops = &xfs_sb_buf_ops;
4934
4935         error = xfs_buf_submit_wait(bp);
4936         if (error) {
4937                 if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
4938                         xfs_buf_ioerror_alert(bp, __func__);
4939                         ASSERT(0);
4940                 }
4941                 xfs_buf_relse(bp);
4942                 return error;
4943         }
4944
4945         /* Convert superblock from on-disk format */
4946         sbp = &log->l_mp->m_sb;
4947         xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4948         ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4949         ASSERT(xfs_sb_good_version(sbp));
4950         xfs_reinit_percpu_counters(log->l_mp);
4951
4952         xfs_buf_relse(bp);
4953
4954
4955         xlog_recover_check_summary(log);
4956
4957         /* Normal transactions can now occur */
4958         log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
4959         return 0;
4960 }
4961
4962 /*
4963  * Perform recovery and re-initialize some log variables in xlog_find_tail.
4964  *
4965  * Return error or zero.
4966  */
4967 int
4968 xlog_recover(
4969         struct xlog     *log)
4970 {
4971         xfs_daddr_t     head_blk, tail_blk;
4972         int             error;
4973
4974         /* find the tail of the log */
4975         error = xlog_find_tail(log, &head_blk, &tail_blk);
4976         if (error)
4977                 return error;
4978
4979         /*
4980          * The superblock was read before the log was available and thus the LSN
4981          * could not be verified. Check the superblock LSN against the current
4982          * LSN now that it's known.
4983          */
4984         if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
4985             !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
4986                 return -EINVAL;
4987
4988         if (tail_blk != head_blk) {
4989                 /* There used to be a comment here:
4990                  *
4991                  * disallow recovery on read-only mounts.  note -- mount
4992                  * checks for ENOSPC and turns it into an intelligent
4993                  * error message.
4994                  * ...but this is no longer true.  Now, unless you specify
4995                  * NORECOVERY (in which case this function would never be
4996                  * called), we just go ahead and recover.  We do this all
4997                  * under the vfs layer, so we can get away with it unless
4998                  * the device itself is read-only, in which case we fail.
4999                  */
5000                 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
5001                         return error;
5002                 }
5003
5004                 /*
5005                  * Version 5 superblock log feature mask validation. We know the
5006                  * log is dirty so check if there are any unknown log features
5007                  * in what we need to recover. If there are unknown features
5008                  * (e.g. unsupported transactions, then simply reject the
5009                  * attempt at recovery before touching anything.
5010                  */
5011                 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
5012                     xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
5013                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
5014                         xfs_warn(log->l_mp,
5015 "Superblock has unknown incompatible log features (0x%x) enabled.",
5016                                 (log->l_mp->m_sb.sb_features_log_incompat &
5017                                         XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
5018                         xfs_warn(log->l_mp,
5019 "The log can not be fully and/or safely recovered by this kernel.");
5020                         xfs_warn(log->l_mp,
5021 "Please recover the log on a kernel that supports the unknown features.");
5022                         return -EINVAL;
5023                 }
5024
5025                 /*
5026                  * Delay log recovery if the debug hook is set. This is debug
5027                  * instrumention to coordinate simulation of I/O failures with
5028                  * log recovery.
5029                  */
5030                 if (xfs_globals.log_recovery_delay) {
5031                         xfs_notice(log->l_mp,
5032                                 "Delaying log recovery for %d seconds.",
5033                                 xfs_globals.log_recovery_delay);
5034                         msleep(xfs_globals.log_recovery_delay * 1000);
5035                 }
5036
5037                 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
5038                                 log->l_mp->m_logname ? log->l_mp->m_logname
5039                                                      : "internal");
5040
5041                 error = xlog_do_recover(log, head_blk, tail_blk);
5042                 log->l_flags |= XLOG_RECOVERY_NEEDED;
5043         }
5044         return error;
5045 }
5046
5047 /*
5048  * In the first part of recovery we replay inodes and buffers and build
5049  * up the list of extent free items which need to be processed.  Here
5050  * we process the extent free items and clean up the on disk unlinked
5051  * inode lists.  This is separated from the first part of recovery so
5052  * that the root and real-time bitmap inodes can be read in from disk in
5053  * between the two stages.  This is necessary so that we can free space
5054  * in the real-time portion of the file system.
5055  */
5056 int
5057 xlog_recover_finish(
5058         struct xlog     *log)
5059 {
5060         /*
5061          * Now we're ready to do the transactions needed for the
5062          * rest of recovery.  Start with completing all the extent
5063          * free intent records and then process the unlinked inode
5064          * lists.  At this point, we essentially run in normal mode
5065          * except that we're still performing recovery actions
5066          * rather than accepting new requests.
5067          */
5068         if (log->l_flags & XLOG_RECOVERY_NEEDED) {
5069                 int     error;
5070                 error = xlog_recover_process_efis(log);
5071                 if (error) {
5072                         xfs_alert(log->l_mp, "Failed to recover EFIs");
5073                         return error;
5074                 }
5075                 /*
5076                  * Sync the log to get all the EFIs out of the AIL.
5077                  * This isn't absolutely necessary, but it helps in
5078                  * case the unlink transactions would have problems
5079                  * pushing the EFIs out of the way.
5080                  */
5081                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
5082
5083                 xlog_recover_process_iunlinks(log);
5084
5085                 xlog_recover_check_summary(log);
5086
5087                 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
5088                                 log->l_mp->m_logname ? log->l_mp->m_logname
5089                                                      : "internal");
5090                 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
5091         } else {
5092                 xfs_info(log->l_mp, "Ending clean mount");
5093         }
5094         return 0;
5095 }
5096
5097 int
5098 xlog_recover_cancel(
5099         struct xlog     *log)
5100 {
5101         int             error = 0;
5102
5103         if (log->l_flags & XLOG_RECOVERY_NEEDED)
5104                 error = xlog_recover_cancel_efis(log);
5105
5106         return error;
5107 }
5108
#if defined(DEBUG)
/*
 * Walk every allocation group, read its AGF and AGI headers and add up the
 * free block, free list, inode and free inode counts found there.  A header
 * that cannot be read is reported with xfs_alert() and skipped.
 *
 * NOTE(review): the totals are only accumulated here; nothing in this
 * function compares them against the superblock counters.
 */
void
xlog_recover_check_summary(
	struct xlog	*log)
{
	struct xfs_mount	*mp = log->l_mp;
	struct xfs_buf		*agfbp;
	struct xfs_buf		*agibp;
	xfs_agnumber_t		agno;
	__uint64_t		freeblks = 0;
	__uint64_t		itotal = 0;
	__uint64_t		ifree = 0;
	int			error;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/* Free space: blocks in the free space trees plus free list. */
		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
		if (!error) {
			struct xfs_agf	*agf = XFS_BUF_TO_AGF(agfbp);

			freeblks += be32_to_cpu(agf->agf_freeblks) +
				    be32_to_cpu(agf->agf_flcount);
			xfs_buf_relse(agfbp);
		} else {
			xfs_alert(mp, "%s agf read failed agno %d error %d",
						__func__, agno, error);
		}

		/* Inodes: allocated and free counts. */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (!error) {
			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);

			itotal += be32_to_cpu(agi->agi_count);
			ifree += be32_to_cpu(agi->agi_freecount);
			xfs_buf_relse(agibp);
		} else {
			xfs_alert(mp, "%s agi read failed agno %d error %d",
						__func__, agno, error);
		}
	}
}
#endif /* DEBUG */