xfs: refactor recovered RUI log item playback
[linux-2.6-block.git] / fs / xfs / xfs_log_recover.c
0b61f8a4 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
87c199c2 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
7b718769 4 * All Rights Reserved.
1da177e4 5 */
1da177e4 6#include "xfs.h"
a844f451 7#include "xfs_fs.h"
70a9883c 8#include "xfs_shared.h"
239880ef
DC
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
a844f451 12#include "xfs_bit.h"
a844f451 13#include "xfs_sb.h"
1da177e4 14#include "xfs_mount.h"
50995582 15#include "xfs_defer.h"
1da177e4 16#include "xfs_inode.h"
239880ef 17#include "xfs_trans.h"
239880ef 18#include "xfs_log.h"
1da177e4 19#include "xfs_log_priv.h"
1da177e4 20#include "xfs_log_recover.h"
a4fbe6ab 21#include "xfs_inode_item.h"
1da177e4
LT
22#include "xfs_extfree_item.h"
23#include "xfs_trans_priv.h"
a4fbe6ab
DC
24#include "xfs_alloc.h"
25#include "xfs_ialloc.h"
1da177e4 26#include "xfs_quota.h"
0b1b213f 27#include "xfs_trace.h"
33479e05 28#include "xfs_icache.h"
a4fbe6ab 29#include "xfs_bmap_btree.h"
a4fbe6ab 30#include "xfs_error.h"
2b9ab5ab 31#include "xfs_dir2.h"
9e88b5d8 32#include "xfs_rmap_item.h"
60a4a222 33#include "xfs_buf_item.h"
f997ee21 34#include "xfs_refcount_item.h"
77d61fe4 35#include "xfs_bmap_item.h"
1da177e4 36
fc06c6d0
DC
37#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
38
9a8d2fdb
MT
39STATIC int
40xlog_find_zeroed(
41 struct xlog *,
42 xfs_daddr_t *);
43STATIC int
44xlog_clear_stale_blocks(
45 struct xlog *,
46 xfs_lsn_t);
1da177e4 47#if defined(DEBUG)
9a8d2fdb
MT
48STATIC void
49xlog_recover_check_summary(
50 struct xlog *);
1da177e4
LT
51#else
52#define xlog_recover_check_summary(log)
1da177e4 53#endif
7088c413
BF
54STATIC int
55xlog_do_recovery_pass(
56 struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
1da177e4 57
d5689eaa
CH
58/*
59 * This structure is used during recovery to record the buf log items which
60 * have been canceled and should not be replayed.
61 */
62struct xfs_buf_cancel {
63 xfs_daddr_t bc_blkno;
64 uint bc_len;
65 int bc_refcount;
66 struct list_head bc_list;
67};
68
1da177e4
LT
69/*
70 * Sector aligned buffer routines for buffer create/read/write/access
71 */
72
ff30a622 73/*
99c26595
BF
74 * Verify the log-relative block number and length in basic blocks are valid for
75 * an operation involving the given XFS log buffer. Returns true if the fields
76 * are valid, false otherwise.
ff30a622 77 */
99c26595 78static inline bool
6e9b3dd8 79xlog_verify_bno(
9a8d2fdb 80 struct xlog *log,
99c26595 81 xfs_daddr_t blk_no,
ff30a622
AE
82 int bbcount)
83{
99c26595
BF
84 if (blk_no < 0 || blk_no >= log->l_logBBsize)
85 return false;
86 if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
87 return false;
88 return true;
ff30a622
AE
89}
90
36adecff 91/*
6ad5b325
CH
92 * Allocate a buffer to hold log data. The buffer needs to be able to map to
93 * a range of nbblks basic blocks at any valid offset within the log.
36adecff 94 */
6ad5b325 95static char *
6e9b3dd8 96xlog_alloc_buffer(
9a8d2fdb 97 struct xlog *log,
3228149c 98 int nbblks)
1da177e4 99{
f8f9ee47
DC
100 int align_mask = xfs_buftarg_dma_alignment(log->l_targ);
101
99c26595
BF
102 /*
103 * Pass log block 0 since we don't have an addr yet, buffer will be
104 * verified on read.
105 */
a71895c5 106 if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
a0fa2b67 107 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
ff30a622 108 nbblks);
3228149c
DC
109 return NULL;
110 }
1da177e4 111
36adecff 112 /*
6ad5b325
CH
113 * We do log I/O in units of log sectors (a power-of-2 multiple of the
114 * basic block size), so we round up the requested size to accommodate
115 * the basic blocks required for complete log sectors.
36adecff 116 *
6ad5b325
CH
117 * In addition, the buffer may be used for a non-sector-aligned block
118 * offset, in which case an I/O of the requested size could extend
119 * beyond the end of the buffer. If the requested size is only 1 basic
120 * block it will never straddle a sector boundary, so this won't be an
121 * issue. Nor will this be a problem if the log I/O is done in basic
122 * blocks (sector size 1). But otherwise we extend the buffer by one
123 * extra log sector to ensure there's space to accommodate this
124 * possibility.
36adecff 125 */
69ce58f0
AE
126 if (nbblks > 1 && log->l_sectBBsize > 1)
127 nbblks += log->l_sectBBsize;
128 nbblks = round_up(nbblks, log->l_sectBBsize);
3219e8cf 129 return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO);
1da177e4
LT
130}
131
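/*
 * Editor's illustration, not part of xfs_log_recover.c: a minimal userspace
 * sketch of the buffer sizing rule above, assuming 512-byte basic blocks and
 * a power-of-two log sector size expressed in basic blocks (sect_bb).
 */
#include <stdio.h>

/* Round x up to the next multiple of y; y must be a power of two here. */
static unsigned int round_up_pow2(unsigned int x, unsigned int y)
{
	return (x + y - 1) & ~(y - 1);
}

static unsigned int log_buf_bblocks(unsigned int nbblks, unsigned int sect_bb)
{
	/* A multi-block read may start mid-sector, so reserve one extra sector. */
	if (nbblks > 1 && sect_bb > 1)
		nbblks += sect_bb;
	return round_up_pow2(nbblks, sect_bb);
}

int main(void)
{
	printf("%u\n", log_buf_bblocks(5, 8));	/* 5-block read, 4k sectors: 16 */
	printf("%u\n", log_buf_bblocks(1, 8));	/* single block: 8 (one sector) */
	return 0;
}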
48389ef1
AE
132/*
133 * Return the address of the start of the given block number's data
134 * in a log buffer. The buffer covers a log sector-aligned region.
135 */
18ffb8c3 136static inline unsigned int
076e6acb 137xlog_align(
9a8d2fdb 138 struct xlog *log,
18ffb8c3 139 xfs_daddr_t blk_no)
076e6acb 140{
18ffb8c3 141 return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
076e6acb
CH
142}
143
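/*
 * Editor's illustration, not part of xfs_log_recover.c: where a block's data
 * lands inside a sector-aligned buffer, assuming 512-byte basic blocks
 * (BBTOB(x) == x << 9) and a power-of-two sector size in basic blocks.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int align_offset(uint64_t blk_no, unsigned int sect_bb)
{
	/* byte offset of blk_no from the start of its log sector */
	return (unsigned int)((blk_no & (sect_bb - 1)) << 9);
}

int main(void)
{
	/* block 11 with 8-block sectors sits 3 blocks (1536 bytes) into its sector */
	printf("%u\n", align_offset(11, 8));
	return 0;
}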
6ad5b325
CH
144static int
145xlog_do_io(
146 struct xlog *log,
147 xfs_daddr_t blk_no,
148 unsigned int nbblks,
149 char *data,
150 unsigned int op)
1da177e4 151{
6ad5b325 152 int error;
1da177e4 153
a71895c5 154 if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
99c26595
BF
155 xfs_warn(log->l_mp,
156 "Invalid log block/length (0x%llx, 0x%x) for buffer",
157 blk_no, nbblks);
2451337d 158 return -EFSCORRUPTED;
3228149c
DC
159 }
160
69ce58f0
AE
161 blk_no = round_down(blk_no, log->l_sectBBsize);
162 nbblks = round_up(nbblks, log->l_sectBBsize);
1da177e4 163 ASSERT(nbblks > 0);
1da177e4 164
6ad5b325
CH
165 error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
166 BBTOB(nbblks), data, op);
167 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) {
168 xfs_alert(log->l_mp,
169 "log recovery %s I/O error at daddr 0x%llx len %d error %d",
170 op == REQ_OP_WRITE ? "write" : "read",
171 blk_no, nbblks, error);
172 }
1da177e4
LT
173 return error;
174}
175
076e6acb 176STATIC int
6ad5b325 177xlog_bread_noalign(
9a8d2fdb 178 struct xlog *log,
076e6acb
CH
179 xfs_daddr_t blk_no,
180 int nbblks,
6ad5b325 181 char *data)
076e6acb 182{
6ad5b325 183 return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
076e6acb
CH
184}
185
44396476 186STATIC int
6ad5b325 187xlog_bread(
9a8d2fdb 188 struct xlog *log,
6ad5b325
CH
189 xfs_daddr_t blk_no,
190 int nbblks,
191 char *data,
192 char **offset)
44396476 193{
6ad5b325 194 int error;
44396476 195
6ad5b325
CH
196 error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
197 if (!error)
198 *offset = data + xlog_align(log, blk_no);
199 return error;
44396476
DC
200}
201
ba0f32d4 202STATIC int
1da177e4 203xlog_bwrite(
9a8d2fdb 204 struct xlog *log,
1da177e4
LT
205 xfs_daddr_t blk_no,
206 int nbblks,
6ad5b325 207 char *data)
1da177e4 208{
6ad5b325 209 return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
1da177e4
LT
210}
211
1da177e4
LT
212#ifdef DEBUG
213/*
214 * dump debug superblock and log record information
215 */
216STATIC void
217xlog_header_check_dump(
218 xfs_mount_t *mp,
219 xlog_rec_header_t *head)
220{
08e96e1a 221 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
03daa57c 222 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
08e96e1a 223 xfs_debug(mp, " log : uuid = %pU, fmt = %d",
03daa57c 224 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
1da177e4
LT
225}
226#else
227#define xlog_header_check_dump(mp, head)
228#endif
229
230/*
231 * check log record header for recovery
232 */
233STATIC int
234xlog_header_check_recover(
235 xfs_mount_t *mp,
236 xlog_rec_header_t *head)
237{
69ef921b 238 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
1da177e4
LT
239
240 /*
241 * IRIX doesn't write the h_fmt field and leaves it zeroed
242 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
243 * a dirty log created in IRIX.
244 */
a71895c5 245 if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
a0fa2b67
DC
246 xfs_warn(mp,
247 "dirty log written in incompatible format - can't recover");
1da177e4 248 xlog_header_check_dump(mp, head);
2451337d 249 return -EFSCORRUPTED;
a71895c5
DW
250 }
251 if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
252 &head->h_fs_uuid))) {
a0fa2b67
DC
253 xfs_warn(mp,
254 "dirty log entry has mismatched uuid - can't recover");
1da177e4 255 xlog_header_check_dump(mp, head);
2451337d 256 return -EFSCORRUPTED;
1da177e4
LT
257 }
258 return 0;
259}
260
261/*
262 * check a log record header read from the head of the log against this mount
263 */
264STATIC int
265xlog_header_check_mount(
266 xfs_mount_t *mp,
267 xlog_rec_header_t *head)
268{
69ef921b 269 ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
1da177e4 270
d905fdaa 271 if (uuid_is_null(&head->h_fs_uuid)) {
1da177e4
LT
272 /*
273 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
d905fdaa 274 * h_fs_uuid is null, we assume this log was last mounted
1da177e4
LT
275 * by IRIX and continue.
276 */
d905fdaa 277 xfs_warn(mp, "null uuid in log - IRIX style log");
a71895c5
DW
278 } else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
279 &head->h_fs_uuid))) {
a0fa2b67 280 xfs_warn(mp, "log has mismatched uuid - can't recover");
1da177e4 281 xlog_header_check_dump(mp, head);
2451337d 282 return -EFSCORRUPTED;
1da177e4
LT
283 }
284 return 0;
285}
286
1094d3f1 287void
1da177e4
LT
288xlog_recover_iodone(
289 struct xfs_buf *bp)
290{
5a52c2a5 291 if (bp->b_error) {
1da177e4
LT
292 /*
293 * We're not going to bother about retrying
294 * this during recovery. One strike!
295 */
dbd329f1 296 if (!XFS_FORCED_SHUTDOWN(bp->b_mount)) {
cdbcf82b 297 xfs_buf_ioerror_alert(bp, __this_address);
dbd329f1 298 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
595bff75 299 }
1da177e4 300 }
60a4a222
BF
301
302 /*
303 * On v5 supers, a bli could be attached to update the metadata LSN.
304 * Clean it up.
305 */
fb1755a6 306 if (bp->b_log_item)
60a4a222 307 xfs_buf_item_relse(bp);
fb1755a6 308 ASSERT(bp->b_log_item == NULL);
60a4a222 309
cb669ca5 310 bp->b_iodone = NULL;
e8aaba9a 311 xfs_buf_ioend(bp);
1da177e4
LT
312}
313
314/*
315 * This routine finds (to an approximation) the first block in the physical
316 * log which contains the given cycle. It uses a binary search algorithm.
317 * Note that the algorithm cannot be perfect because the disk will not
318 * necessarily be perfect.
319 */
a8272ce0 320STATIC int
1da177e4 321xlog_find_cycle_start(
9a8d2fdb 322 struct xlog *log,
6e9b3dd8 323 char *buffer,
1da177e4
LT
324 xfs_daddr_t first_blk,
325 xfs_daddr_t *last_blk,
326 uint cycle)
327{
b2a922cd 328 char *offset;
1da177e4 329 xfs_daddr_t mid_blk;
e3bb2e30 330 xfs_daddr_t end_blk;
1da177e4
LT
331 uint mid_cycle;
332 int error;
333
e3bb2e30
AE
334 end_blk = *last_blk;
335 mid_blk = BLK_AVG(first_blk, end_blk);
336 while (mid_blk != first_blk && mid_blk != end_blk) {
6e9b3dd8 337 error = xlog_bread(log, mid_blk, 1, buffer, &offset);
076e6acb 338 if (error)
1da177e4 339 return error;
03bea6fe 340 mid_cycle = xlog_get_cycle(offset);
e3bb2e30
AE
341 if (mid_cycle == cycle)
342 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
343 else
344 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
345 mid_blk = BLK_AVG(first_blk, end_blk);
1da177e4 346 }
e3bb2e30
AE
347 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
348 (mid_blk == end_blk && mid_blk-1 == first_blk));
349
350 *last_blk = end_blk;
1da177e4
LT
351
352 return 0;
353}
354
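/*
 * Editor's illustration, not part of xfs_log_recover.c: the binary search
 * above on an in-memory "log" holding one cycle number per block. On entry
 * the first block must not carry the target cycle and the last block must;
 * the search narrows the range until the two ends are adjacent.
 */
#include <stdio.h>

#define BLK_AVG(b1, b2)	(((b1) + (b2)) >> 1)

static long find_cycle_start(const unsigned int *cycles, long first_blk,
			     long last_blk, unsigned int cycle)
{
	long mid_blk = BLK_AVG(first_blk, last_blk);

	while (mid_blk != first_blk && mid_blk != last_blk) {
		if (cycles[mid_blk] == cycle)
			last_blk = mid_blk;	/* change point is at or before mid */
		else
			first_blk = mid_blk;	/* change point is after mid */
		mid_blk = BLK_AVG(first_blk, last_blk);
	}
	return last_blk;	/* first block holding the target cycle */
}

int main(void)
{
	/* cycle 5 has wrapped into the first three blocks; 4 is the older cycle */
	unsigned int cycles[] = { 5, 5, 5, 4, 4, 4, 4, 4 };

	printf("%ld\n", find_cycle_start(cycles, 0, 7, 4));	/* prints 3 */
	return 0;
}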
355/*
3f943d85
AE
356 * Check that a range of blocks does not contain stop_on_cycle_no.
357 * Fill in *new_blk with the block offset where such a block is
358 * found, or with -1 (an invalid block number) if there is no such
359 * block in the range. The scan needs to occur from front to back
360 * and the pointer into the region must be updated since a later
361 * routine will need to perform another test.
1da177e4
LT
362 */
363STATIC int
364xlog_find_verify_cycle(
9a8d2fdb 365 struct xlog *log,
1da177e4
LT
366 xfs_daddr_t start_blk,
367 int nbblks,
368 uint stop_on_cycle_no,
369 xfs_daddr_t *new_blk)
370{
371 xfs_daddr_t i, j;
372 uint cycle;
6e9b3dd8 373 char *buffer;
1da177e4 374 xfs_daddr_t bufblks;
b2a922cd 375 char *buf = NULL;
1da177e4
LT
376 int error = 0;
377
6881a229
AE
378 /*
379 * Greedily allocate a buffer big enough to handle the full
380 * range of basic blocks we'll be examining. If that fails,
381 * try a smaller size. We need to be able to read at least
382 * a log sector, or we're out of luck.
383 */
1da177e4 384 bufblks = 1 << ffs(nbblks);
81158e0c
DC
385 while (bufblks > log->l_logBBsize)
386 bufblks >>= 1;
6e9b3dd8 387 while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1da177e4 388 bufblks >>= 1;
69ce58f0 389 if (bufblks < log->l_sectBBsize)
2451337d 390 return -ENOMEM;
1da177e4
LT
391 }
392
393 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
394 int bcount;
395
396 bcount = min(bufblks, (start_blk + nbblks - i));
397
6e9b3dd8 398 error = xlog_bread(log, i, bcount, buffer, &buf);
076e6acb 399 if (error)
1da177e4
LT
400 goto out;
401
1da177e4 402 for (j = 0; j < bcount; j++) {
03bea6fe 403 cycle = xlog_get_cycle(buf);
1da177e4
LT
404 if (cycle == stop_on_cycle_no) {
405 *new_blk = i+j;
406 goto out;
407 }
408
409 buf += BBSIZE;
410 }
411 }
412
413 *new_blk = -1;
414
415out:
6e9b3dd8 416 kmem_free(buffer);
1da177e4
LT
417 return error;
418}
419
420/*
421 * Potentially backup over partial log record write.
422 *
423 * In the typical case, last_blk is the number of the block directly after
424 * a good log record. Therefore, we subtract one to get the block number
425 * of the last block in the given buffer. extra_bblks contains the number
426 * of blocks we would have read on a previous read. This happens when the
427 * last log record is split over the end of the physical log.
428 *
429 * extra_bblks is the number of blocks potentially verified on a previous
430 * call to this routine.
431 */
432STATIC int
433xlog_find_verify_log_record(
9a8d2fdb 434 struct xlog *log,
1da177e4
LT
435 xfs_daddr_t start_blk,
436 xfs_daddr_t *last_blk,
437 int extra_bblks)
438{
439 xfs_daddr_t i;
6e9b3dd8 440 char *buffer;
b2a922cd 441 char *offset = NULL;
1da177e4
LT
442 xlog_rec_header_t *head = NULL;
443 int error = 0;
444 int smallmem = 0;
445 int num_blks = *last_blk - start_blk;
446 int xhdrs;
447
448 ASSERT(start_blk != 0 || *last_blk != start_blk);
449
6e9b3dd8
CH
450 buffer = xlog_alloc_buffer(log, num_blks);
451 if (!buffer) {
452 buffer = xlog_alloc_buffer(log, 1);
453 if (!buffer)
2451337d 454 return -ENOMEM;
1da177e4
LT
455 smallmem = 1;
456 } else {
6e9b3dd8 457 error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
076e6acb 458 if (error)
1da177e4 459 goto out;
1da177e4
LT
460 offset += ((num_blks - 1) << BBSHIFT);
461 }
462
463 for (i = (*last_blk) - 1; i >= 0; i--) {
464 if (i < start_blk) {
465 /* valid log record not found */
a0fa2b67
DC
466 xfs_warn(log->l_mp,
467 "Log inconsistent (didn't find previous header)");
1da177e4 468 ASSERT(0);
895e196f 469 error = -EFSCORRUPTED;
1da177e4
LT
470 goto out;
471 }
472
473 if (smallmem) {
6e9b3dd8 474 error = xlog_bread(log, i, 1, buffer, &offset);
076e6acb 475 if (error)
1da177e4 476 goto out;
1da177e4
LT
477 }
478
479 head = (xlog_rec_header_t *)offset;
480
69ef921b 481 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
1da177e4
LT
482 break;
483
484 if (!smallmem)
485 offset -= BBSIZE;
486 }
487
488 /*
489 * We hit the beginning of the physical log & still no header. Return
490 * to caller. If caller can handle a return of -1, then this routine
491 * will be called again for the end of the physical log.
492 */
493 if (i == -1) {
2451337d 494 error = 1;
1da177e4
LT
495 goto out;
496 }
497
498 /*
499 * We have the final block of the good log (the first block
500 * of the log record _before_ the head). So we check the uuid.
501 */
502 if ((error = xlog_header_check_mount(log->l_mp, head)))
503 goto out;
504
505 /*
506 * We may have found a log record header before we expected one.
507 * last_blk will be the 1st block # with a given cycle #. We may end
508 * up reading an entire log record. In this case, we don't want to
509 * reset last_blk. Only when last_blk points in the middle of a log
510 * record do we update last_blk.
511 */
62118709 512 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
b53e675d 513 uint h_size = be32_to_cpu(head->h_size);
1da177e4
LT
514
515 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
516 if (h_size % XLOG_HEADER_CYCLE_SIZE)
517 xhdrs++;
518 } else {
519 xhdrs = 1;
520 }
521
b53e675d
CH
522 if (*last_blk - i + extra_bblks !=
523 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
1da177e4
LT
524 *last_blk = i;
525
526out:
6e9b3dd8 527 kmem_free(buffer);
1da177e4
LT
528 return error;
529}
530
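/*
 * Editor's illustration, not part of xfs_log_recover.c: how many header
 * blocks a record carries, assuming one v2 header covers 32k of record data
 * (XLOG_HEADER_CYCLE_SIZE) and that v1 logs always use one header block.
 */
#include <stdio.h>

#define HDR_CYCLE_SIZE	(32 * 1024)

static int record_header_blocks(unsigned int h_size, int logv2)
{
	int xhdrs;

	if (!logv2)
		return 1;
	xhdrs = h_size / HDR_CYCLE_SIZE;
	if (h_size % HDR_CYCLE_SIZE)
		xhdrs++;	/* a partial 32k chunk still needs its own header */
	return xhdrs;
}

int main(void)
{
	printf("%d\n", record_header_blocks(256 * 1024, 1));	/* prints 8 */
	printf("%d\n", record_header_blocks(32 * 1024, 1));	/* prints 1 */
	return 0;
}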
531/*
532 * Head is defined to be the point of the log where the next log write
0a94da24 533 * could go. This means that incomplete LR writes at the end are
1da177e4
LT
534 * eliminated when calculating the head. We aren't guaranteed that previous
535 * LRs have complete transactions. We only know that a cycle number of
536 * current cycle number -1 won't be present in the log if we start writing
537 * from our current block number.
538 *
539 * last_blk contains the block number of the first block with a given
540 * cycle number.
541 *
542 * Return: zero if normal, non-zero if error.
543 */
ba0f32d4 544STATIC int
1da177e4 545xlog_find_head(
9a8d2fdb 546 struct xlog *log,
1da177e4
LT
547 xfs_daddr_t *return_head_blk)
548{
6e9b3dd8 549 char *buffer;
b2a922cd 550 char *offset;
1da177e4
LT
551 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
552 int num_scan_bblks;
553 uint first_half_cycle, last_half_cycle;
554 uint stop_on_cycle;
555 int error, log_bbnum = log->l_logBBsize;
556
557 /* Is the end of the log device zeroed? */
2451337d
DC
558 error = xlog_find_zeroed(log, &first_blk);
559 if (error < 0) {
560 xfs_warn(log->l_mp, "empty log check failed");
561 return error;
562 }
563 if (error == 1) {
1da177e4
LT
564 *return_head_blk = first_blk;
565
566 /* Is the whole lot zeroed? */
567 if (!first_blk) {
568 /* Linux XFS shouldn't generate totally zeroed logs -
569 * mkfs etc write a dummy unmount record to a fresh
570 * log so we can store the uuid in there
571 */
a0fa2b67 572 xfs_warn(log->l_mp, "totally zeroed log");
1da177e4
LT
573 }
574
575 return 0;
1da177e4
LT
576 }
577
578 first_blk = 0; /* get cycle # of 1st block */
6e9b3dd8
CH
579 buffer = xlog_alloc_buffer(log, 1);
580 if (!buffer)
2451337d 581 return -ENOMEM;
076e6acb 582
6e9b3dd8 583 error = xlog_bread(log, 0, 1, buffer, &offset);
076e6acb 584 if (error)
6e9b3dd8 585 goto out_free_buffer;
076e6acb 586
03bea6fe 587 first_half_cycle = xlog_get_cycle(offset);
1da177e4
LT
588
589 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
6e9b3dd8 590 error = xlog_bread(log, last_blk, 1, buffer, &offset);
076e6acb 591 if (error)
6e9b3dd8 592 goto out_free_buffer;
076e6acb 593
03bea6fe 594 last_half_cycle = xlog_get_cycle(offset);
1da177e4
LT
595 ASSERT(last_half_cycle != 0);
596
597 /*
598 * If the 1st half cycle number is equal to the last half cycle number,
599 * then the entire log is stamped with the same cycle number. In this
600 * case, head_blk can't be set to zero (which makes sense). The below
601 * math doesn't work out properly with head_blk equal to zero. Instead,
602 * we set it to log_bbnum which is an invalid block number, but this
604 * value makes the math correct. If head_blk doesn't change through
604 * all the tests below, *head_blk is set to zero at the very end rather
605 * than log_bbnum. In a sense, log_bbnum and zero are the same block
606 * in a circular file.
607 */
608 if (first_half_cycle == last_half_cycle) {
609 /*
610 * In this case we believe that the entire log should have
611 * cycle number last_half_cycle. We need to scan backwards
612 * from the end verifying that there are no holes still
613 * containing last_half_cycle - 1. If we find such a hole,
614 * then the start of that hole will be the new head. The
615 * simple case looks like
616 * x | x ... | x - 1 | x
617 * Another case that fits this picture would be
618 * x | x + 1 | x ... | x
c41564b5 619 * In this case the head really is somewhere at the end of the
1da177e4
LT
620 * log, as one of the latest writes at the beginning was
621 * incomplete.
622 * One more case is
623 * x | x + 1 | x ... | x - 1 | x
624 * This is really the combination of the above two cases, and
625 * the head has to end up at the start of the x-1 hole at the
626 * end of the log.
627 *
628 * In the 256k log case, we will read from the beginning to the
629 * end of the log and search for cycle numbers equal to x-1.
630 * We don't worry about the x+1 blocks that we encounter,
631 * because we know that they cannot be the head since the log
632 * started with x.
633 */
634 head_blk = log_bbnum;
635 stop_on_cycle = last_half_cycle - 1;
636 } else {
637 /*
638 * In this case we want to find the first block with cycle
639 * number matching last_half_cycle. We expect the log to be
640 * some variation on
3f943d85 641 * x + 1 ... | x ... | x
1da177e4
LT
642 * The first block with cycle number x (last_half_cycle) will
643 * be where the new head belongs. First we do a binary search
644 * for the first occurrence of last_half_cycle. The binary
645 * search may not be totally accurate, so then we scan back
646 * from there looking for occurrences of last_half_cycle before
647 * us. If that backwards scan wraps around the beginning of
648 * the log, then we look for occurrences of last_half_cycle - 1
649 * at the end of the log. The cases we're looking for look
650 * like
3f943d85
AE
651 * v binary search stopped here
652 * x + 1 ... | x | x + 1 | x ... | x
653 * ^ but we want to locate this spot
1da177e4 654 * or
1da177e4 655 * <---------> less than scan distance
3f943d85
AE
656 * x + 1 ... | x ... | x - 1 | x
657 * ^ we want to locate this spot
1da177e4
LT
658 */
659 stop_on_cycle = last_half_cycle;
6e9b3dd8
CH
660 error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
661 last_half_cycle);
662 if (error)
663 goto out_free_buffer;
1da177e4
LT
664 }
665
666 /*
667 * Now validate the answer. Scan back some number of maximum possible
668 * blocks and make sure each one has the expected cycle number. The
669 * maximum is determined by the total possible amount of buffering
670 * in the in-core log. The following number can be made tighter if
671 * we actually look at the block size of the filesystem.
672 */
9f2a4505 673 num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
1da177e4
LT
674 if (head_blk >= num_scan_bblks) {
675 /*
676 * We are guaranteed that the entire check can be performed
677 * in one buffer.
678 */
679 start_blk = head_blk - num_scan_bblks;
680 if ((error = xlog_find_verify_cycle(log,
681 start_blk, num_scan_bblks,
682 stop_on_cycle, &new_blk)))
6e9b3dd8 683 goto out_free_buffer;
1da177e4
LT
684 if (new_blk != -1)
685 head_blk = new_blk;
686 } else { /* need to read 2 parts of log */
687 /*
688 * We are going to scan backwards in the log in two parts.
689 * First we scan the physical end of the log. In this part
690 * of the log, we are looking for blocks with cycle number
691 * last_half_cycle - 1.
692 * If we find one, then we know that the log starts there, as
693 * we've found a hole that didn't get written in going around
694 * the end of the physical log. The simple case for this is
695 * x + 1 ... | x ... | x - 1 | x
696 * <---------> less than scan distance
697 * If all of the blocks at the end of the log have cycle number
698 * last_half_cycle, then we check the blocks at the start of
699 * the log looking for occurrences of last_half_cycle. If we
700 * find one, then our current estimate for the location of the
701 * first occurrence of last_half_cycle is wrong and we move
702 * back to the hole we've found. This case looks like
703 * x + 1 ... | x | x + 1 | x ...
704 * ^ binary search stopped here
705 * Another case we need to handle that only occurs in 256k
706 * logs is
707 * x + 1 ... | x ... | x+1 | x ...
708 * ^ binary search stops here
709 * In a 256k log, the scan at the end of the log will see the
710 * x + 1 blocks. We need to skip past those since that is
711 * certainly not the head of the log. By searching for
712 * last_half_cycle-1 we accomplish that.
713 */
1da177e4 714 ASSERT(head_blk <= INT_MAX &&
3f943d85
AE
715 (xfs_daddr_t) num_scan_bblks >= head_blk);
716 start_blk = log_bbnum - (num_scan_bblks - head_blk);
1da177e4
LT
717 if ((error = xlog_find_verify_cycle(log, start_blk,
718 num_scan_bblks - (int)head_blk,
719 (stop_on_cycle - 1), &new_blk)))
6e9b3dd8 720 goto out_free_buffer;
1da177e4
LT
721 if (new_blk != -1) {
722 head_blk = new_blk;
9db127ed 723 goto validate_head;
1da177e4
LT
724 }
725
726 /*
727 * Scan beginning of log now. The last part of the physical
728 * log is good. This scan needs to verify that it doesn't find
729 * the last_half_cycle.
730 */
731 start_blk = 0;
732 ASSERT(head_blk <= INT_MAX);
733 if ((error = xlog_find_verify_cycle(log,
734 start_blk, (int)head_blk,
735 stop_on_cycle, &new_blk)))
6e9b3dd8 736 goto out_free_buffer;
1da177e4
LT
737 if (new_blk != -1)
738 head_blk = new_blk;
739 }
740
9db127ed 741validate_head:
1da177e4
LT
742 /*
743 * Now we need to make sure head_blk is not pointing to a block in
744 * the middle of a log record.
745 */
746 num_scan_bblks = XLOG_REC_SHIFT(log);
747 if (head_blk >= num_scan_bblks) {
748 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
749
750 /* start ptr at last block ptr before head_blk */
2451337d
DC
751 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
752 if (error == 1)
753 error = -EIO;
754 if (error)
6e9b3dd8 755 goto out_free_buffer;
1da177e4
LT
756 } else {
757 start_blk = 0;
758 ASSERT(head_blk <= INT_MAX);
2451337d
DC
759 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
760 if (error < 0)
6e9b3dd8 761 goto out_free_buffer;
2451337d 762 if (error == 1) {
1da177e4 763 /* We hit the beginning of the log during our search */
3f943d85 764 start_blk = log_bbnum - (num_scan_bblks - head_blk);
1da177e4
LT
765 new_blk = log_bbnum;
766 ASSERT(start_blk <= INT_MAX &&
767 (xfs_daddr_t) log_bbnum-start_blk >= 0);
768 ASSERT(head_blk <= INT_MAX);
2451337d
DC
769 error = xlog_find_verify_log_record(log, start_blk,
770 &new_blk, (int)head_blk);
771 if (error == 1)
772 error = -EIO;
773 if (error)
6e9b3dd8 774 goto out_free_buffer;
1da177e4
LT
775 if (new_blk != log_bbnum)
776 head_blk = new_blk;
777 } else if (error)
6e9b3dd8 778 goto out_free_buffer;
1da177e4
LT
779 }
780
6e9b3dd8 781 kmem_free(buffer);
1da177e4
LT
782 if (head_blk == log_bbnum)
783 *return_head_blk = 0;
784 else
785 *return_head_blk = head_blk;
786 /*
787 * When returning here, we have a good block number. Bad block
788 * means that during a previous crash, we didn't have a clean break
789 * from cycle number N to cycle number N-1. In this case, we need
790 * to find the first block with cycle number N-1.
791 */
792 return 0;
793
6e9b3dd8
CH
794out_free_buffer:
795 kmem_free(buffer);
1da177e4 796 if (error)
a0fa2b67 797 xfs_warn(log->l_mp, "failed to find log head");
1da177e4
LT
798 return error;
799}
800
eed6b462
BF
801/*
802 * Seek backwards in the log for log record headers.
803 *
804 * Given a starting log block, walk backwards until we find the provided number
805 * of records or hit the provided tail block. The return value is the number of
806 * records encountered or a negative error code. The log block and buffer
807 * pointer of the last record seen are returned in rblk and rhead respectively.
808 */
809STATIC int
810xlog_rseek_logrec_hdr(
811 struct xlog *log,
812 xfs_daddr_t head_blk,
813 xfs_daddr_t tail_blk,
814 int count,
6e9b3dd8 815 char *buffer,
eed6b462
BF
816 xfs_daddr_t *rblk,
817 struct xlog_rec_header **rhead,
818 bool *wrapped)
819{
820 int i;
821 int error;
822 int found = 0;
823 char *offset = NULL;
824 xfs_daddr_t end_blk;
825
826 *wrapped = false;
827
828 /*
829 * Walk backwards from the head block until we hit the tail or the first
830 * block in the log.
831 */
832 end_blk = head_blk > tail_blk ? tail_blk : 0;
833 for (i = (int) head_blk - 1; i >= end_blk; i--) {
6e9b3dd8 834 error = xlog_bread(log, i, 1, buffer, &offset);
eed6b462
BF
835 if (error)
836 goto out_error;
837
838 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
839 *rblk = i;
840 *rhead = (struct xlog_rec_header *) offset;
841 if (++found == count)
842 break;
843 }
844 }
845
846 /*
847 * If we haven't hit the tail block or the log record header count,
848 * start looking again from the end of the physical log. Note that
849 * callers can pass head == tail if the tail is not yet known.
850 */
851 if (tail_blk >= head_blk && found != count) {
852 for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
6e9b3dd8 853 error = xlog_bread(log, i, 1, buffer, &offset);
eed6b462
BF
854 if (error)
855 goto out_error;
856
857 if (*(__be32 *)offset ==
858 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
859 *wrapped = true;
860 *rblk = i;
861 *rhead = (struct xlog_rec_header *) offset;
862 if (++found == count)
863 break;
864 }
865 }
866 }
867
868 return found;
869
870out_error:
871 return error;
872}
873
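/*
 * Editor's illustration, not part of xfs_log_recover.c: the backwards scan
 * above, simplified to locate a single record header. The "log" is an array
 * holding the first word of each block; the scan runs from head-1 down to the
 * tail (or block 0), then wraps to the physical end of the log if the search
 * region itself wraps. REC_MAGIC stands in for XLOG_HEADER_MAGIC_NUM.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define REC_MAGIC	0xFEEDBABEu

static long rseek_record(const uint32_t *blk0, long log_size,
			 long head_blk, long tail_blk, bool *wrapped)
{
	long i, end_blk = head_blk > tail_blk ? tail_blk : 0;

	*wrapped = false;
	for (i = head_blk - 1; i >= end_blk; i--)
		if (blk0[i] == REC_MAGIC)
			return i;
	if (tail_blk >= head_blk) {		/* search region wraps the log end */
		for (i = log_size - 1; i >= tail_blk; i--)
			if (blk0[i] == REC_MAGIC) {
				*wrapped = true;
				return i;
			}
	}
	return -1;				/* no record header found */
}

int main(void)
{
	/* head at block 2, tail at block 5: the last header sits at block 6 */
	uint32_t blk0[8] = { 0, 0, 0, 0, 0, 0, REC_MAGIC, 0 };
	bool wrapped;

	printf("%ld %d\n", rseek_record(blk0, 8, 2, 5, &wrapped), wrapped);
	return 0;
}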
7088c413
BF
874/*
875 * Seek forward in the log for log record headers.
876 *
877 * Given head and tail blocks, walk forward from the tail block until we find
878 * the provided number of records or hit the head block. The return value is the
879 * number of records encountered or a negative error code. The log block and
880 * buffer pointer of the last record seen are returned in rblk and rhead
881 * respectively.
882 */
883STATIC int
884xlog_seek_logrec_hdr(
885 struct xlog *log,
886 xfs_daddr_t head_blk,
887 xfs_daddr_t tail_blk,
888 int count,
6e9b3dd8 889 char *buffer,
7088c413
BF
890 xfs_daddr_t *rblk,
891 struct xlog_rec_header **rhead,
892 bool *wrapped)
893{
894 int i;
895 int error;
896 int found = 0;
897 char *offset = NULL;
898 xfs_daddr_t end_blk;
899
900 *wrapped = false;
901
902 /*
903 * Walk forward from the tail block until we hit the head or the last
904 * block in the log.
905 */
906 end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
907 for (i = (int) tail_blk; i <= end_blk; i++) {
6e9b3dd8 908 error = xlog_bread(log, i, 1, buffer, &offset);
7088c413
BF
909 if (error)
910 goto out_error;
911
912 if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
913 *rblk = i;
914 *rhead = (struct xlog_rec_header *) offset;
915 if (++found == count)
916 break;
917 }
918 }
919
920 /*
921 * If we haven't hit the head block or the log record header count,
922 * start looking again from the start of the physical log.
923 */
924 if (tail_blk > head_blk && found != count) {
925 for (i = 0; i < (int) head_blk; i++) {
6e9b3dd8 926 error = xlog_bread(log, i, 1, buffer, &offset);
7088c413
BF
927 if (error)
928 goto out_error;
929
930 if (*(__be32 *)offset ==
931 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
932 *wrapped = true;
933 *rblk = i;
934 *rhead = (struct xlog_rec_header *) offset;
935 if (++found == count)
936 break;
937 }
938 }
939 }
940
941 return found;
942
943out_error:
944 return error;
945}
946
947/*
4a4f66ea
BF
948 * Calculate distance from head to tail (i.e., unused space in the log).
949 */
950static inline int
951xlog_tail_distance(
952 struct xlog *log,
953 xfs_daddr_t head_blk,
954 xfs_daddr_t tail_blk)
955{
956 if (head_blk < tail_blk)
957 return tail_blk - head_blk;
958
959 return tail_blk + (log->l_logBBsize - head_blk);
960}
961
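/*
 * Editor's illustration, not part of xfs_log_recover.c: unused space between
 * head and tail in a circular log of log_size basic blocks.
 */
#include <stdio.h>

static long tail_distance(long log_size, long head_blk, long tail_blk)
{
	if (head_blk < tail_blk)
		return tail_blk - head_blk;		/* head has not wrapped past the tail */
	return tail_blk + (log_size - head_blk);	/* distance wraps through block 0 */
}

int main(void)
{
	printf("%ld\n", tail_distance(1024, 100, 700));	/* prints 600 */
	printf("%ld\n", tail_distance(1024, 900, 200));	/* prints 324 */
	return 0;
}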
962/*
963 * Verify the log tail. This is particularly important when torn or incomplete
964 * writes have been detected near the front of the log and the head has been
965 * walked back accordingly.
966 *
967 * We also have to handle the case where the tail was pinned and the head
968 * blocked behind the tail right before a crash. If the tail had been pushed
969 * immediately prior to the crash and the subsequent checkpoint was only
970 * partially written, it's possible it overwrote the last referenced tail in the
971 * log with garbage. This is not a coherency problem because the tail must have
972 * been pushed before it can be overwritten, but appears as log corruption to
973 * recovery because we have no way to know the tail was updated if the
974 * subsequent checkpoint didn't write successfully.
7088c413 975 *
4a4f66ea
BF
976 * Therefore, CRC check the log from tail to head. If a failure occurs and the
977 * offending record is within max iclog bufs from the head, walk the tail
978 * forward and retry until a valid tail is found or corruption is detected out
979 * of the range of a possible overwrite.
7088c413
BF
980 */
981STATIC int
982xlog_verify_tail(
983 struct xlog *log,
984 xfs_daddr_t head_blk,
4a4f66ea
BF
985 xfs_daddr_t *tail_blk,
986 int hsize)
7088c413
BF
987{
988 struct xlog_rec_header *thead;
6e9b3dd8 989 char *buffer;
7088c413 990 xfs_daddr_t first_bad;
7088c413
BF
991 int error = 0;
992 bool wrapped;
4a4f66ea
BF
993 xfs_daddr_t tmp_tail;
994 xfs_daddr_t orig_tail = *tail_blk;
7088c413 995
6e9b3dd8
CH
996 buffer = xlog_alloc_buffer(log, 1);
997 if (!buffer)
7088c413
BF
998 return -ENOMEM;
999
1000 /*
4a4f66ea
BF
1001 * Make sure the tail points to a record (returns positive count on
1002 * success).
7088c413 1003 */
6e9b3dd8 1004 error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
4a4f66ea
BF
1005 &tmp_tail, &thead, &wrapped);
1006 if (error < 0)
7088c413 1007 goto out;
4a4f66ea
BF
1008 if (*tail_blk != tmp_tail)
1009 *tail_blk = tmp_tail;
7088c413
BF
1010
1011 /*
4a4f66ea
BF
1012 * Run a CRC check from the tail to the head. We can't just check
1013 * MAX_ICLOGS records past the tail because the tail may point to stale
1014 * blocks cleared during the search for the head/tail. These blocks are
1015 * overwritten with zero-length records and thus record count is not a
1016 * reliable indicator of the iclog state before a crash.
7088c413 1017 */
4a4f66ea
BF
1018 first_bad = 0;
1019 error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
7088c413 1020 XLOG_RECOVER_CRCPASS, &first_bad);
a4c9b34d 1021 while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
4a4f66ea
BF
1022 int tail_distance;
1023
1024 /*
1025 * Is corruption within range of the head? If so, retry from
1026 * the next record. Otherwise return an error.
1027 */
1028 tail_distance = xlog_tail_distance(log, head_blk, first_bad);
1029 if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
1030 break;
7088c413 1031
4a4f66ea 1032 /* skip to the next record; returns positive count on success */
6e9b3dd8
CH
1033 error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
1034 buffer, &tmp_tail, &thead, &wrapped);
4a4f66ea
BF
1035 if (error < 0)
1036 goto out;
1037
1038 *tail_blk = tmp_tail;
1039 first_bad = 0;
1040 error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
1041 XLOG_RECOVER_CRCPASS, &first_bad);
1042 }
1043
1044 if (!error && *tail_blk != orig_tail)
1045 xfs_warn(log->l_mp,
1046 "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
1047 orig_tail, *tail_blk);
7088c413 1048out:
6e9b3dd8 1049 kmem_free(buffer);
7088c413
BF
1050 return error;
1051}
1052
1053/*
1054 * Detect and trim torn writes from the head of the log.
1055 *
1056 * Storage without sector atomicity guarantees can result in torn writes in the
1057 * log in the event of a crash. Our only means to detect this scenario is via
1058 * CRC verification. While we can't always be certain that CRC verification
1059 * failure is due to a torn write vs. an unrelated corruption, we do know that
1060 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1061 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1062 * the log and treat failures in this range as torn writes as a matter of
1063 * policy. In the event of CRC failure, the head is walked back to the last good
1064 * record in the log and the tail is updated from that record and verified.
1065 */
1066STATIC int
1067xlog_verify_head(
1068 struct xlog *log,
1069 xfs_daddr_t *head_blk, /* in/out: unverified head */
1070 xfs_daddr_t *tail_blk, /* out: tail block */
6e9b3dd8 1071 char *buffer,
7088c413
BF
1072 xfs_daddr_t *rhead_blk, /* start blk of last record */
1073 struct xlog_rec_header **rhead, /* ptr to last record */
1074 bool *wrapped) /* last rec. wraps phys. log */
1075{
1076 struct xlog_rec_header *tmp_rhead;
6e9b3dd8 1077 char *tmp_buffer;
7088c413
BF
1078 xfs_daddr_t first_bad;
1079 xfs_daddr_t tmp_rhead_blk;
1080 int found;
1081 int error;
1082 bool tmp_wrapped;
1083
1084 /*
82ff6cc2
BF
1085 * Check the head of the log for torn writes. Search backwards from the
1086 * head until we hit the tail or the maximum number of log record I/Os
1087 * that could have been in flight at one time. Use a temporary buffer so
6e9b3dd8 1088 * we don't trash the rhead/buffer pointers from the caller.
7088c413 1089 */
6e9b3dd8
CH
1090 tmp_buffer = xlog_alloc_buffer(log, 1);
1091 if (!tmp_buffer)
7088c413
BF
1092 return -ENOMEM;
1093 error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
6e9b3dd8
CH
1094 XLOG_MAX_ICLOGS, tmp_buffer,
1095 &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
1096 kmem_free(tmp_buffer);
7088c413
BF
1097 if (error < 0)
1098 return error;
1099
1100 /*
1101 * Now run a CRC verification pass over the records starting at the
1102 * block found above to the current head. If a CRC failure occurs, the
1103 * log block of the first bad record is saved in first_bad.
1104 */
1105 error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1106 XLOG_RECOVER_CRCPASS, &first_bad);
a4c9b34d 1107 if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
7088c413
BF
1108 /*
1109 * We've hit a potential torn write. Reset the error and warn
1110 * about it.
1111 */
1112 error = 0;
1113 xfs_warn(log->l_mp,
1114"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1115 first_bad, *head_blk);
1116
1117 /*
1118 * Get the header block and buffer pointer for the last good
1119 * record before the bad record.
1120 *
1121 * Note that xlog_find_tail() clears the blocks at the new head
1122 * (i.e., the records with invalid CRC) if the cycle number
1123 * matches the current cycle.
1124 */
6e9b3dd8
CH
1125 found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
1126 buffer, rhead_blk, rhead, wrapped);
7088c413
BF
1127 if (found < 0)
1128 return found;
1129 if (found == 0) /* XXX: right thing to do here? */
1130 return -EIO;
1131
1132 /*
1133 * Reset the head block to the starting block of the first bad
1134 * log record and set the tail block based on the last good
1135 * record.
1136 *
1137 * Bail out if the updated head/tail match as this indicates
1138 * possible corruption outside of the acceptable
1139 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1140 */
1141 *head_blk = first_bad;
1142 *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1143 if (*head_blk == *tail_blk) {
1144 ASSERT(0);
1145 return 0;
1146 }
7088c413 1147 }
5297ac1f
BF
1148 if (error)
1149 return error;
7088c413 1150
4a4f66ea
BF
1151 return xlog_verify_tail(log, *head_blk, tail_blk,
1152 be32_to_cpu((*rhead)->h_size));
7088c413
BF
1153}
1154
0703a8e1
DC
1155/*
1156 * We need to make sure we handle log wrapping properly, so we can't use the
1157 * calculated logbno directly. Make sure it wraps to the correct bno inside the
1158 * log.
1159 *
1160 * The log is limited to 32 bit sizes, so we use the appropriate modulus
1161 * operation here and cast it back to a 64 bit daddr on return.
1162 */
1163static inline xfs_daddr_t
1164xlog_wrap_logbno(
1165 struct xlog *log,
1166 xfs_daddr_t bno)
1167{
1168 int mod;
1169
1170 div_s64_rem(bno, log->l_logBBsize, &mod);
1171 return mod;
1172}
1173
65b99a08
BF
1174/*
1175 * Check whether the head of the log points to an unmount record. In other
1176 * words, determine whether the log is clean. If so, update the in-core state
1177 * appropriately.
1178 */
1179static int
1180xlog_check_unmount_rec(
1181 struct xlog *log,
1182 xfs_daddr_t *head_blk,
1183 xfs_daddr_t *tail_blk,
1184 struct xlog_rec_header *rhead,
1185 xfs_daddr_t rhead_blk,
6e9b3dd8 1186 char *buffer,
65b99a08
BF
1187 bool *clean)
1188{
1189 struct xlog_op_header *op_head;
1190 xfs_daddr_t umount_data_blk;
1191 xfs_daddr_t after_umount_blk;
1192 int hblks;
1193 int error;
1194 char *offset;
1195
1196 *clean = false;
1197
1198 /*
1199 * Look for unmount record. If we find it, then we know there was a
1200 * clean unmount. Since 'i' could be the last block in the physical
1201 * log, we convert to a log block before comparing to the head_blk.
1202 *
1203 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1204 * below. We won't want to clear the unmount record if there is one, so
1205 * we pass the lsn of the unmount record rather than the block after it.
1206 */
1207 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1208 int h_size = be32_to_cpu(rhead->h_size);
1209 int h_version = be32_to_cpu(rhead->h_version);
1210
1211 if ((h_version & XLOG_VERSION_2) &&
1212 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1213 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1214 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1215 hblks++;
1216 } else {
1217 hblks = 1;
1218 }
1219 } else {
1220 hblks = 1;
1221 }
0703a8e1
DC
1222
1223 after_umount_blk = xlog_wrap_logbno(log,
1224 rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
1225
65b99a08
BF
1226 if (*head_blk == after_umount_blk &&
1227 be32_to_cpu(rhead->h_num_logops) == 1) {
0703a8e1 1228 umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
6e9b3dd8 1229 error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
65b99a08
BF
1230 if (error)
1231 return error;
1232
1233 op_head = (struct xlog_op_header *)offset;
1234 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1235 /*
1236 * Set tail and last sync so that newly written log
1237 * records will point recovery to after the current
1238 * unmount record.
1239 */
1240 xlog_assign_atomic_lsn(&log->l_tail_lsn,
1241 log->l_curr_cycle, after_umount_blk);
1242 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1243 log->l_curr_cycle, after_umount_blk);
1244 *tail_blk = after_umount_blk;
1245
1246 *clean = true;
1247 }
1248 }
1249
1250 return 0;
1251}
1252
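/*
 * Editor's illustration, not part of xfs_log_recover.c: where the unmount
 * record would sit relative to the last record header, and the "head points
 * just past it" test above. The real code additionally reads the op header at
 * umount_data_blk and checks XLOG_UNMOUNT_TRANS; this sketch assumes 512-byte
 * basic blocks.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define BTOBB(bytes)	(((bytes) + 511) >> 9)	/* bytes -> basic blocks, rounded up */

static int64_t wrap_bno(int64_t bno, int32_t log_size)
{
	return bno % log_size;	/* wrap a computed block number back into the log */
}

/* Would a log with this head be clean, given the last record header found? */
static bool log_is_clean(int32_t log_size, int64_t head_blk, int64_t rhead_blk,
			 int hblks, uint32_t h_len, uint32_t num_logops)
{
	int64_t after_umount_blk =
		wrap_bno(rhead_blk + hblks + BTOBB(h_len), log_size);

	/* clean iff the head sits right after a single-op (unmount) record */
	return head_blk == after_umount_blk && num_logops == 1;
}

int main(void)
{
	/* header at block 1020, one header block, 512-byte body: wraps to block 0 */
	printf("%d\n", log_is_clean(1022, 0, 1020, 1, 512, 1));	/* prints 1 */
	return 0;
}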
717bc0eb
BF
1253static void
1254xlog_set_state(
1255 struct xlog *log,
1256 xfs_daddr_t head_blk,
1257 struct xlog_rec_header *rhead,
1258 xfs_daddr_t rhead_blk,
1259 bool bump_cycle)
1260{
1261 /*
1262 * Reset log values according to the state of the log when we
1263 * crashed. In the case where head_blk == 0, we bump curr_cycle
1264 * one because the next write starts a new cycle rather than
1265 * continuing the cycle of the last good log record. At this
1266 * point we have guaranteed that all partial log records have been
1267 * accounted for. Therefore, we know that the last good log record
1268 * written was complete and ended exactly on the end boundary
1269 * of the physical log.
1270 */
1271 log->l_prev_block = rhead_blk;
1272 log->l_curr_block = (int)head_blk;
1273 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1274 if (bump_cycle)
1275 log->l_curr_cycle++;
1276 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1277 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1278 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1279 BBTOB(log->l_curr_block));
1280 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1281 BBTOB(log->l_curr_block));
1282}
1283
1da177e4
LT
1284/*
1285 * Find the sync block number or the tail of the log.
1286 *
1287 * This will be the block number of the last record to have its
1288 * associated buffers synced to disk. Every log record header has
1289 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
1290 * to get a sync block number. The only concern is to figure out which
1291 * log record header to believe.
1292 *
1293 * The following algorithm uses the log record header with the largest
1294 * lsn. The entire log record does not need to be valid. We only care
1295 * that the header is valid.
1296 *
1297 * We could speed up the search by using the current head_blk buffer, but
1298 * it is not available.
1299 */
5d77c0dc 1300STATIC int
1da177e4 1301xlog_find_tail(
9a8d2fdb 1302 struct xlog *log,
1da177e4 1303 xfs_daddr_t *head_blk,
65be6054 1304 xfs_daddr_t *tail_blk)
1da177e4
LT
1305{
1306 xlog_rec_header_t *rhead;
b2a922cd 1307 char *offset = NULL;
6e9b3dd8 1308 char *buffer;
7088c413 1309 int error;
7088c413 1310 xfs_daddr_t rhead_blk;
1da177e4 1311 xfs_lsn_t tail_lsn;
eed6b462 1312 bool wrapped = false;
65b99a08 1313 bool clean = false;
1da177e4
LT
1314
1315 /*
1316 * Find previous log record
1317 */
1318 if ((error = xlog_find_head(log, head_blk)))
1319 return error;
82ff6cc2 1320 ASSERT(*head_blk < INT_MAX);
1da177e4 1321
6e9b3dd8
CH
1322 buffer = xlog_alloc_buffer(log, 1);
1323 if (!buffer)
2451337d 1324 return -ENOMEM;
1da177e4 1325 if (*head_blk == 0) { /* special case */
6e9b3dd8 1326 error = xlog_bread(log, 0, 1, buffer, &offset);
076e6acb 1327 if (error)
9db127ed 1328 goto done;
076e6acb 1329
03bea6fe 1330 if (xlog_get_cycle(offset) == 0) {
1da177e4
LT
1331 *tail_blk = 0;
1332 /* leave all other log inited values alone */
9db127ed 1333 goto done;
1da177e4
LT
1334 }
1335 }
1336
1337 /*
82ff6cc2
BF
1338 * Search backwards through the log looking for the log record header
1339 * block. This wraps all the way back around to the head so something is
1340 * seriously wrong if we can't find it.
1da177e4 1341 */
6e9b3dd8 1342 error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
82ff6cc2
BF
1343 &rhead_blk, &rhead, &wrapped);
1344 if (error < 0)
050552cb 1345 goto done;
82ff6cc2
BF
1346 if (!error) {
1347 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
050552cb
DW
1348 error = -EFSCORRUPTED;
1349 goto done;
82ff6cc2
BF
1350 }
1351 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1da177e4
LT
1352
1353 /*
717bc0eb 1354 * Set the log state based on the current head record.
1da177e4 1355 */
717bc0eb 1356 xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
65b99a08 1357 tail_lsn = atomic64_read(&log->l_tail_lsn);
1da177e4
LT
1358
1359 /*
65b99a08
BF
1360 * Look for an unmount record at the head of the log. This sets the log
1361 * state to determine whether recovery is necessary.
1da177e4 1362 */
65b99a08 1363 error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
6e9b3dd8 1364 rhead_blk, buffer, &clean);
65b99a08
BF
1365 if (error)
1366 goto done;
1da177e4
LT
1367
1368 /*
7f6aff3a
BF
1369 * Verify the log head if the log is not clean (e.g., we have anything
1370 * but an unmount record at the head). This uses CRC verification to
1371 * detect and trim torn writes. If discovered, CRC failures are
1372 * considered torn writes and the log head is trimmed accordingly.
1da177e4 1373 *
7f6aff3a
BF
1374 * Note that we can only run CRC verification when the log is dirty
1375 * because there's no guarantee that the log data behind an unmount
1376 * record is compatible with the current architecture.
1da177e4 1377 */
7f6aff3a
BF
1378 if (!clean) {
1379 xfs_daddr_t orig_head = *head_blk;
1da177e4 1380
6e9b3dd8 1381 error = xlog_verify_head(log, head_blk, tail_blk, buffer,
7f6aff3a 1382 &rhead_blk, &rhead, &wrapped);
076e6acb 1383 if (error)
9db127ed 1384 goto done;
076e6acb 1385
7f6aff3a
BF
1386 /* update in-core state again if the head changed */
1387 if (*head_blk != orig_head) {
1388 xlog_set_state(log, *head_blk, rhead, rhead_blk,
1389 wrapped);
1390 tail_lsn = atomic64_read(&log->l_tail_lsn);
1391 error = xlog_check_unmount_rec(log, head_blk, tail_blk,
6e9b3dd8 1392 rhead, rhead_blk, buffer,
7f6aff3a
BF
1393 &clean);
1394 if (error)
1395 goto done;
1da177e4
LT
1396 }
1397 }
1398
65b99a08
BF
1399 /*
1400 * Note that the unmount was clean. If the unmount was not clean, we
1401 * need to know this to rebuild the superblock counters from the perag
1402 * headers if we have a filesystem using non-persistent counters.
1403 */
1404 if (clean)
1405 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1da177e4
LT
1406
1407 /*
1408 * Make sure that there are no blocks in front of the head
1409 * with the same cycle number as the head. This can happen
1410 * because we allow multiple outstanding log writes concurrently,
1411 * and the later writes might make it out before earlier ones.
1412 *
1413 * We use the lsn from before modifying it so that we'll never
1414 * overwrite the unmount record after a clean unmount.
1415 *
1416 * Do this only if we are going to recover the filesystem
1417 *
1418 * NOTE: This used to say "if (!readonly)"
1419 * However on Linux, we can & do recover a read-only filesystem.
1420 * We only skip recovery if NORECOVERY is specified on mount,
1421 * in which case we would not be here.
1422 *
1423 * But... if the -device- itself is readonly, just skip this.
1424 * We can't recover this device anyway, so it won't matter.
1425 */
2d15d2c0 1426 if (!xfs_readonly_buftarg(log->l_targ))
1da177e4 1427 error = xlog_clear_stale_blocks(log, tail_lsn);
1da177e4 1428
9db127ed 1429done:
6e9b3dd8 1430 kmem_free(buffer);
1da177e4
LT
1431
1432 if (error)
a0fa2b67 1433 xfs_warn(log->l_mp, "failed to locate log tail");
1da177e4
LT
1434 return error;
1435}
1436
1437/*
1438 * Is the log zeroed at all?
1439 *
1440 * The last binary search should be changed to perform an X block read
1441 * once X becomes small enough. You can then search linearly through
1442 * the X blocks. This will cut down on the number of reads we need to do.
1443 *
1444 * If the log is partially zeroed, this routine will pass back the blkno
1445 * of the first block with cycle number 0. It won't have a complete LR
1446 * preceding it.
1447 *
1448 * Return:
1449 * 0 => the log is completely written to
2451337d
DC
1450 * 1 => use *blk_no as the first block of the log
1451 * <0 => error has occurred
1da177e4 1452 */
a8272ce0 1453STATIC int
1da177e4 1454xlog_find_zeroed(
9a8d2fdb 1455 struct xlog *log,
1da177e4
LT
1456 xfs_daddr_t *blk_no)
1457{
6e9b3dd8 1458 char *buffer;
b2a922cd 1459 char *offset;
1da177e4
LT
1460 uint first_cycle, last_cycle;
1461 xfs_daddr_t new_blk, last_blk, start_blk;
1462 xfs_daddr_t num_scan_bblks;
1463 int error, log_bbnum = log->l_logBBsize;
1464
6fdf8ccc
NS
1465 *blk_no = 0;
1466
1da177e4 1467 /* check totally zeroed log */
6e9b3dd8
CH
1468 buffer = xlog_alloc_buffer(log, 1);
1469 if (!buffer)
2451337d 1470 return -ENOMEM;
6e9b3dd8 1471 error = xlog_bread(log, 0, 1, buffer, &offset);
076e6acb 1472 if (error)
6e9b3dd8 1473 goto out_free_buffer;
076e6acb 1474
03bea6fe 1475 first_cycle = xlog_get_cycle(offset);
1da177e4
LT
1476 if (first_cycle == 0) { /* completely zeroed log */
1477 *blk_no = 0;
6e9b3dd8 1478 kmem_free(buffer);
2451337d 1479 return 1;
1da177e4
LT
1480 }
1481
1482 /* check partially zeroed log */
6e9b3dd8 1483 error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
076e6acb 1484 if (error)
6e9b3dd8 1485 goto out_free_buffer;
076e6acb 1486
03bea6fe 1487 last_cycle = xlog_get_cycle(offset);
1da177e4 1488 if (last_cycle != 0) { /* log completely written to */
6e9b3dd8 1489 kmem_free(buffer);
1da177e4 1490 return 0;
1da177e4
LT
1491 }
1492
1493 /* we have a partially zeroed log */
1494 last_blk = log_bbnum-1;
6e9b3dd8
CH
1495 error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
1496 if (error)
1497 goto out_free_buffer;
1da177e4
LT
1498
1499 /*
1500 * Validate the answer. Because there is no way to guarantee that
1501 * the entire log is made up of log records which are the same size,
1502 * we scan over the defined maximum blocks. At this point, the maximum
1503 * is not chosen to mean anything special. XXXmiken
1504 */
1505 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1506 ASSERT(num_scan_bblks <= INT_MAX);
1507
1508 if (last_blk < num_scan_bblks)
1509 num_scan_bblks = last_blk;
1510 start_blk = last_blk - num_scan_bblks;
1511
1512 /*
1513 * We search for any instances of cycle number 0 that occur before
1514 * our current estimate of the head. What we're trying to detect is
1515 * 1 ... | 0 | 1 | 0...
1516 * ^ binary search ends here
1517 */
1518 if ((error = xlog_find_verify_cycle(log, start_blk,
1519 (int)num_scan_bblks, 0, &new_blk)))
6e9b3dd8 1520 goto out_free_buffer;
1da177e4
LT
1521 if (new_blk != -1)
1522 last_blk = new_blk;
1523
1524 /*
1525 * Potentially backup over partial log record write. We don't need
1526 * to search the end of the log because we know it is zero.
1527 */
2451337d
DC
1528 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1529 if (error == 1)
1530 error = -EIO;
1531 if (error)
6e9b3dd8 1532 goto out_free_buffer;
1da177e4
LT
1533
1534 *blk_no = last_blk;
6e9b3dd8
CH
1535out_free_buffer:
1536 kmem_free(buffer);
1da177e4
LT
1537 if (error)
1538 return error;
2451337d 1539 return 1;
1da177e4
LT
1540}
1541
1542/*
1543 * These are simple subroutines used by xlog_clear_stale_blocks() below
1544 * to initialize a buffer full of empty log record headers and write
1545 * them into the log.
1546 */
1547STATIC void
1548xlog_add_record(
9a8d2fdb 1549 struct xlog *log,
b2a922cd 1550 char *buf,
1da177e4
LT
1551 int cycle,
1552 int block,
1553 int tail_cycle,
1554 int tail_block)
1555{
1556 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1557
1558 memset(buf, 0, BBSIZE);
b53e675d
CH
1559 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1560 recp->h_cycle = cpu_to_be32(cycle);
1561 recp->h_version = cpu_to_be32(
62118709 1562 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
b53e675d
CH
1563 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1564 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1565 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1da177e4
LT
1566 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1567}
1568
1569STATIC int
1570xlog_write_log_records(
9a8d2fdb 1571 struct xlog *log,
1da177e4
LT
1572 int cycle,
1573 int start_block,
1574 int blocks,
1575 int tail_cycle,
1576 int tail_block)
1577{
b2a922cd 1578 char *offset;
6e9b3dd8 1579 char *buffer;
1da177e4 1580 int balign, ealign;
69ce58f0 1581 int sectbb = log->l_sectBBsize;
1da177e4
LT
1582 int end_block = start_block + blocks;
1583 int bufblks;
1584 int error = 0;
1585 int i, j = 0;
1586
6881a229
AE
1587 /*
1588 * Greedily allocate a buffer big enough to handle the full
1589 * range of basic blocks to be written. If that fails, try
1590 * a smaller size. We need to be able to write at least a
1591 * log sector, or we're out of luck.
1592 */
1da177e4 1593 bufblks = 1 << ffs(blocks);
81158e0c
DC
1594 while (bufblks > log->l_logBBsize)
1595 bufblks >>= 1;
6e9b3dd8 1596 while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
1da177e4 1597 bufblks >>= 1;
69ce58f0 1598 if (bufblks < sectbb)
2451337d 1599 return -ENOMEM;
1da177e4
LT
1600 }
1601
1602 /* We may need to do a read at the start to fill in part of
1603 * the buffer in the starting sector not covered by the first
1604 * write below.
1605 */
5c17f533 1606 balign = round_down(start_block, sectbb);
1da177e4 1607 if (balign != start_block) {
6e9b3dd8 1608 error = xlog_bread_noalign(log, start_block, 1, buffer);
076e6acb 1609 if (error)
6e9b3dd8 1610 goto out_free_buffer;
076e6acb 1611
1da177e4
LT
1612 j = start_block - balign;
1613 }
1614
1615 for (i = start_block; i < end_block; i += bufblks) {
1616 int bcount, endcount;
1617
1618 bcount = min(bufblks, end_block - start_block);
1619 endcount = bcount - j;
1620
1621 /* We may need to do a read at the end to fill in part of
1622 * the buffer in the final sector not covered by the write.
1623 * If this is the same sector as the above read, skip it.
1624 */
5c17f533 1625 ealign = round_down(end_block, sectbb);
1da177e4 1626 if (j == 0 && (start_block + endcount > ealign)) {
6ad5b325 1627 error = xlog_bread_noalign(log, ealign, sectbb,
6e9b3dd8 1628 buffer + BBTOB(ealign - start_block));
076e6acb
CH
1629 if (error)
1630 break;
1631
1da177e4
LT
1632 }
1633
6e9b3dd8 1634 offset = buffer + xlog_align(log, start_block);
1da177e4
LT
1635 for (; j < endcount; j++) {
1636 xlog_add_record(log, offset, cycle, i+j,
1637 tail_cycle, tail_block);
1638 offset += BBSIZE;
1639 }
6e9b3dd8 1640 error = xlog_bwrite(log, start_block, endcount, buffer);
1da177e4
LT
1641 if (error)
1642 break;
1643 start_block += endcount;
1644 j = 0;
1645 }
076e6acb 1646
6e9b3dd8
CH
1647out_free_buffer:
1648 kmem_free(buffer);
1da177e4
LT
1649 return error;
1650}
1651
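/*
 * Editor's illustration, not part of xfs_log_recover.c: the edge alignment
 * above. Log writes must cover whole sectors, so the first and last sectors
 * of the range may need to be read first to preserve the blocks that fall
 * outside [start_block, end_block).
 */
#include <stdio.h>

static long round_down_sect(long blk, long sectbb)
{
	return blk - (blk % sectbb);
}

int main(void)
{
	long start_block = 11, blocks = 10, sectbb = 8;
	long end_block = start_block + blocks;
	long balign = round_down_sect(start_block, sectbb);	/* 8: merge needed at the front */
	long ealign = round_down_sect(end_block, sectbb);	/* 16: merge needed at the back */

	printf("read-merge %ld blocks at the front; trailing sector begins at block %ld\n",
	       start_block - balign, ealign);
	return 0;
}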
1652/*
1653 * This routine is called to blow away any incomplete log writes out
1654 * in front of the log head. We do this so that we won't become confused
1655 * if we come up, write only a little bit more, and then crash again.
1656 * If we leave the partial log records out there, this situation could
1657 * cause us to think those partial writes are valid blocks since they
1658 * have the current cycle number. We get rid of them by overwriting them
1659 * with empty log records with the old cycle number rather than the
1660 * current one.
1661 *
1662 * The tail lsn is passed in rather than taken from
1663 * the log so that we will not write over the unmount record after a
1664 * clean unmount in a 512 block log. Doing so would leave the log without
1665 * any valid log records in it until a new one was written. If we crashed
1666 * during that time we would not be able to recover.
1667 */
1668STATIC int
1669xlog_clear_stale_blocks(
9a8d2fdb 1670 struct xlog *log,
1da177e4
LT
1671 xfs_lsn_t tail_lsn)
1672{
1673 int tail_cycle, head_cycle;
1674 int tail_block, head_block;
1675 int tail_distance, max_distance;
1676 int distance;
1677 int error;
1678
1679 tail_cycle = CYCLE_LSN(tail_lsn);
1680 tail_block = BLOCK_LSN(tail_lsn);
1681 head_cycle = log->l_curr_cycle;
1682 head_block = log->l_curr_block;
1683
1684 /*
1685 * Figure out the distance between the new head of the log
1686 * and the tail. We want to write over any blocks beyond the
1687 * head that we may have written just before the crash, but
1688 * we don't want to overwrite the tail of the log.
1689 */
1690 if (head_cycle == tail_cycle) {
1691 /*
1692 * The tail is behind the head in the physical log,
1693 * so the distance from the head to the tail is the
1694 * distance from the head to the end of the log plus
1695 * the distance from the beginning of the log to the
1696 * tail.
1697 */
a71895c5
DW
1698 if (XFS_IS_CORRUPT(log->l_mp,
1699 head_block < tail_block ||
1700 head_block >= log->l_logBBsize))
2451337d 1701 return -EFSCORRUPTED;
1da177e4
LT
1702 tail_distance = tail_block + (log->l_logBBsize - head_block);
1703 } else {
1704 /*
1705 * The head is behind the tail in the physical log,
1706 * so the distance from the head to the tail is just
1707 * the tail block minus the head block.
1708 */
a71895c5
DW
1709 if (XFS_IS_CORRUPT(log->l_mp,
1710 head_block >= tail_block ||
1711 head_cycle != tail_cycle + 1))
2451337d 1712 return -EFSCORRUPTED;
1da177e4
LT
1713 tail_distance = tail_block - head_block;
1714 }
1715
1716 /*
1717 * If the head is right up against the tail, we can't clear
1718 * anything.
1719 */
1720 if (tail_distance <= 0) {
1721 ASSERT(tail_distance == 0);
1722 return 0;
1723 }
1724
1725 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1726 /*
1727 * Take the smaller of the maximum amount of outstanding I/O
1728 * we could have and the distance to the tail to clear out.
1729 * We take the smaller so that we don't overwrite the tail and
1730 * we don't waste all day writing from the head to the tail
1731 * for no reason.
1732 */
9bb54cb5 1733 max_distance = min(max_distance, tail_distance);
1da177e4
LT
1734
1735 if ((head_block + max_distance) <= log->l_logBBsize) {
1736 /*
1737 * We can stomp all the blocks we need to without
1738 * wrapping around the end of the log. Just do it
1739 * in a single write. Use the cycle number of the
1740 * current cycle minus one so that the log will look like:
1741 * n ... | n - 1 ...
1742 */
1743 error = xlog_write_log_records(log, (head_cycle - 1),
1744 head_block, max_distance, tail_cycle,
1745 tail_block);
1746 if (error)
1747 return error;
1748 } else {
1749 /*
1750 * We need to wrap around the end of the physical log in
1751 * order to clear all the blocks. Do it in two separate
1752 * I/Os. The first write should be from the head to the
1753 * end of the physical log, and it should use the current
1754 * cycle number minus one just like above.
1755 */
1756 distance = log->l_logBBsize - head_block;
1757 error = xlog_write_log_records(log, (head_cycle - 1),
1758 head_block, distance, tail_cycle,
1759 tail_block);
1760
1761 if (error)
1762 return error;
1763
1764 /*
1765 * Now write the blocks at the start of the physical log.
1766 * This writes the remainder of the blocks we want to clear.
1767 * It uses the current cycle number since we're now on the
1768 * same cycle as the head so that we get:
1769 * n ... n ... | n - 1 ...
1770 * ^^^^^ blocks we're writing
1771 */
1772 distance = max_distance - (log->l_logBBsize - head_block);
1773 error = xlog_write_log_records(log, head_cycle, 0, distance,
1774 tail_cycle, tail_block);
1775 if (error)
1776 return error;
1777 }
1778
1779 return 0;
1780}
1781
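/*
 * Worked example of the wrap-around case above (hypothetical numbers):
 * with l_logBBsize = 1000, head_cycle = 7, head_block = 990 and a
 * max_distance of 50 blocks, head_block + max_distance overruns the end
 * of the log, so the stomp is issued as two writes:
 *
 *	blocks 990..999 stamped with cycle 6 (head_cycle - 1)
 *	blocks   0..39  stamped with cycle 7 (head_cycle)
 *
 * leaving the physical log looking like "7 ... 7 ... | 6 ...", i.e. the
 * "n ... n ... | n - 1 ..." layout described in the comments above.
 */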
1782/******************************************************************************
1783 *
1784 * Log recover routines
1785 *
1786 ******************************************************************************
1787 */
86ffa471
DW
1788static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
1789 &xlog_buf_item_ops,
1790 &xlog_inode_item_ops,
1791 &xlog_dquot_item_ops,
1792 &xlog_quotaoff_item_ops,
1793 &xlog_icreate_item_ops,
1794 &xlog_efi_item_ops,
1795 &xlog_efd_item_ops,
1796 &xlog_rui_item_ops,
1797 &xlog_rud_item_ops,
1798 &xlog_cui_item_ops,
1799 &xlog_cud_item_ops,
1800 &xlog_bui_item_ops,
1801 &xlog_bud_item_ops,
1802};
1803
1804static const struct xlog_recover_item_ops *
1805xlog_find_item_ops(
1806 struct xlog_recover_item *item)
1807{
1808 unsigned int i;
1809
1810 for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
1811 if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
1812 return xlog_recover_item_ops[i];
1813
1814 return NULL;
1815}
1da177e4 1816
f0a76953 1817/*
a775ad77
DC
1818 * Sort the log items in the transaction.
1819 *
1820 * The ordering constraints are defined by the inode allocation and unlink
1821 * behaviour. The rules are:
1822 *
1823 * 1. Every item is only logged once in a given transaction. Hence it
1824 * represents the last logged state of the item. Hence ordering is
1825 * dependent on the order in which operations need to be performed so
1826 * required initial conditions are always met.
1827 *
1828 * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1829 * there's nothing to replay from them so we can simply cull them
1830 * from the transaction. However, we can't do that until after we've
1831 * replayed all the other items because they may be dependent on the
1832 * cancelled buffer and replaying the cancelled buffer can remove it
1833 * from the cancelled buffer table. Hence they have to be done last.
1834 *
1835 * 3. Inode allocation buffers must be replayed before inode items that
28c8e41a
DC
1836 * read the buffer and replay changes into it. For filesystems using the
1837 * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1838 * treated the same as inode allocation buffers as they create and
1839 * initialise the buffers directly.
a775ad77
DC
1840 *
1841 * 4. Inode unlink buffers must be replayed after inode items are replayed.
1842 * This ensures that inodes are completely flushed to the inode buffer
1843 * in a "free" state before we remove the unlinked inode list pointer.
1844 *
1845 * Hence the ordering needs to be inode allocation buffers first, inode items
1846 * second, inode unlink buffers third and cancelled buffers last.
1847 *
1848 * But there's a problem with that - we can't tell an inode allocation buffer
1849 * apart from a regular buffer, so we can't separate them. We can, however,
1850 * tell an inode unlink buffer from the others, and so we can separate them out
1851 * from all the other buffers and move them to last.
1852 *
1853 * Hence, 4 lists, in order from head to tail:
28c8e41a
DC
1854 * - buffer_list for all buffers except cancelled/inode unlink buffers
1855 * - item_list for all non-buffer items
1856 * - inode_buffer_list for inode unlink buffers
1857 * - cancel_list for the cancelled buffers
1858 *
1859 * Note that we add objects to the tail of the lists so that first-to-last
1860 * ordering is preserved within the lists. Adding objects to the head of the
1861 * list means when we traverse from the head we walk them in last-to-first
1862 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1863 * but for all other items there may be specific ordering that we need to
1864 * preserve.
f0a76953 1865 */
1da177e4
LT
1866STATIC int
1867xlog_recover_reorder_trans(
ad223e60
MT
1868 struct xlog *log,
1869 struct xlog_recover *trans,
9abbc539 1870 int pass)
1da177e4 1871{
35f4521f 1872 struct xlog_recover_item *item, *n;
2a84108f 1873 int error = 0;
f0a76953 1874 LIST_HEAD(sort_list);
a775ad77
DC
1875 LIST_HEAD(cancel_list);
1876 LIST_HEAD(buffer_list);
1877 LIST_HEAD(inode_buffer_list);
5ce70b77 1878 LIST_HEAD(item_list);
f0a76953
DC
1879
1880 list_splice_init(&trans->r_itemq, &sort_list);
1881 list_for_each_entry_safe(item, n, &sort_list, ri_list) {
86ffa471 1882 enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
1da177e4 1883
86ffa471
DW
1884 item->ri_ops = xlog_find_item_ops(item);
1885 if (!item->ri_ops) {
a0fa2b67 1886 xfs_warn(log->l_mp,
0d2d35a3
DW
1887 "%s: unrecognized type of log operation (%d)",
1888 __func__, ITEM_TYPE(item));
1da177e4 1889 ASSERT(0);
2a84108f
MT
1890 /*
1891 * return the remaining items back to the transaction
1892 * item list so they can be freed in caller.
1893 */
1894 if (!list_empty(&sort_list))
1895 list_splice_init(&sort_list, &trans->r_itemq);
86ffa471
DW
1896 error = -EFSCORRUPTED;
1897 break;
1898 }
1899
1900 if (item->ri_ops->reorder)
1901 fate = item->ri_ops->reorder(item);
1902
1903 switch (fate) {
1904 case XLOG_REORDER_BUFFER_LIST:
1905 list_move_tail(&item->ri_list, &buffer_list);
1906 break;
1907 case XLOG_REORDER_CANCEL_LIST:
1908 trace_xfs_log_recover_item_reorder_head(log,
1909 trans, item, pass);
1910 list_move(&item->ri_list, &cancel_list);
1911 break;
1912 case XLOG_REORDER_INODE_BUFFER_LIST:
1913 list_move(&item->ri_list, &inode_buffer_list);
1914 break;
1915 case XLOG_REORDER_ITEM_LIST:
1916 trace_xfs_log_recover_item_reorder_tail(log,
1917 trans, item, pass);
1918 list_move_tail(&item->ri_list, &item_list);
1919 break;
1da177e4 1920 }
f0a76953 1921 }
86ffa471 1922
f0a76953 1923 ASSERT(list_empty(&sort_list));
a775ad77
DC
1924 if (!list_empty(&buffer_list))
1925 list_splice(&buffer_list, &trans->r_itemq);
5ce70b77
CH
1926 if (!list_empty(&item_list))
1927 list_splice_tail(&item_list, &trans->r_itemq);
a775ad77
DC
1928 if (!list_empty(&inode_buffer_list))
1929 list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1930 if (!list_empty(&cancel_list))
1931 list_splice_tail(&cancel_list, &trans->r_itemq);
2a84108f 1932 return error;
1da177e4
LT
1933}
1934
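/*
 * For illustration: if a recovered transaction's items arrive in the order
 *
 *	inode item, regular buffer, cancelled buffer, inode unlink buffer
 *
 * the splices at the end of xlog_recover_reorder_trans() rebuild
 * trans->r_itemq as
 *
 *	regular buffer, inode item, inode unlink buffer, cancelled buffer
 *
 * i.e. buffer_list first, then item_list, inode_buffer_list and finally
 * cancel_list, which is the replay ordering justified in the big comment
 * above the function.
 */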
e968350a
CH
1935static struct xfs_buf_cancel *
1936xlog_find_buffer_cancelled(
ad223e60 1937 struct xlog *log,
1da177e4 1938 xfs_daddr_t blkno,
e968350a 1939 uint len)
1da177e4 1940{
d5689eaa
CH
1941 struct list_head *bucket;
1942 struct xfs_buf_cancel *bcp;
1da177e4 1943
e968350a 1944 if (!log->l_buf_cancel_table)
84a5b730 1945 return NULL;
1da177e4 1946
d5689eaa
CH
1947 bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1948 list_for_each_entry(bcp, bucket, bc_list) {
1949 if (bcp->bc_blkno == blkno && bcp->bc_len == len)
84a5b730 1950 return bcp;
1da177e4 1951 }
d5689eaa 1952
84a5b730
DC
1953 return NULL;
1954}
1955
3304a4fa 1956bool
98b69b12
CH
1957xlog_add_buffer_cancelled(
1958 struct xlog *log,
1959 xfs_daddr_t blkno,
1960 uint len)
1961{
1962 struct xfs_buf_cancel *bcp;
1963
1964 /*
1965 * If we find an existing cancel record, this indicates that the buffer
1966 * was cancelled multiple times. To ensure that during pass 2 we keep
1967 * the record in the table until we reach its last occurrence in the
1968 * log, a reference count is kept to tell how many times we expect to
1969 * see this record during the second pass.
1970 */
1971 bcp = xlog_find_buffer_cancelled(log, blkno, len);
1972 if (bcp) {
1973 bcp->bc_refcount++;
1974 return false;
1975 }
1976
1977 bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
1978 bcp->bc_blkno = blkno;
1979 bcp->bc_len = len;
1980 bcp->bc_refcount = 1;
1981 list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno));
1982 return true;
1983}
1984
84a5b730 1985/*
e968350a
CH
1986 * Check if there is an entry for blkno, len in the buffer cancel record table.
1987 */
1094d3f1 1988bool
e968350a
CH
1989xlog_is_buffer_cancelled(
1990 struct xlog *log,
1991 xfs_daddr_t blkno,
1992 uint len)
1993{
1994 return xlog_find_buffer_cancelled(log, blkno, len) != NULL;
1995}
1996
1997/*
1998 * Check if there is an entry for blkno, len in the buffer cancel record table,
1999 * and decrement the reference count on it if there is one.
84a5b730 2000 *
e968350a
CH
2001 * Remove the cancel record once the refcount hits zero, so that if the same
2002 * buffer is re-used again after its last cancellation we actually replay the
2003 * changes made at that point.
84a5b730 2004 */
1094d3f1 2005bool
e968350a 2006xlog_put_buffer_cancelled(
84a5b730
DC
2007 struct xlog *log,
2008 xfs_daddr_t blkno,
e968350a 2009 uint len)
84a5b730
DC
2010{
2011 struct xfs_buf_cancel *bcp;
2012
e968350a
CH
2013 bcp = xlog_find_buffer_cancelled(log, blkno, len);
2014 if (!bcp) {
2015 ASSERT(0);
2016 return false;
2017 }
d5689eaa 2018
e968350a
CH
2019 if (--bcp->bc_refcount == 0) {
2020 list_del(&bcp->bc_list);
2021 kmem_free(bcp);
d5689eaa 2022 }
e968350a 2023 return true;
1da177e4
LT
2024}
2025
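/*
 * For illustration, the lifetime of a cancel record for a buffer that is
 * cancelled twice in the log (hypothetical block numbers): pass 1 calls
 * xlog_add_buffer_cancelled(log, 64, 8) twice, leaving one record with
 * bc_refcount = 2.  In pass 2 each occurrence of the cancelled buffer
 * calls xlog_put_buffer_cancelled(log, 64, 8); only the second call drops
 * the refcount to zero and frees the record, so changes logged against
 * that disk range after its last cancellation are still replayed.
 */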
8ea5682d 2026void
7d4894b4
CH
2027xlog_buf_readahead(
2028 struct xlog *log,
2029 xfs_daddr_t blkno,
2030 uint len,
2031 const struct xfs_buf_ops *ops)
2032{
2033 if (!xlog_is_buffer_cancelled(log, blkno, len))
2034 xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
2035}
2036
00574da1
ZYW
2037STATIC int
2038xlog_recover_items_pass2(
2039 struct xlog *log,
2040 struct xlog_recover *trans,
2041 struct list_head *buffer_list,
2042 struct list_head *item_list)
2043{
2044 struct xlog_recover_item *item;
2045 int error = 0;
2046
2047 list_for_each_entry(item, item_list, ri_list) {
2565a11b
DW
2048 trace_xfs_log_recover_item_recover(log, trans, item,
2049 XLOG_RECOVER_PASS2);
2050
2051 if (item->ri_ops->commit_pass2)
2052 error = item->ri_ops->commit_pass2(log, buffer_list,
2053 item, trans->r_lsn);
00574da1
ZYW
2054 if (error)
2055 return error;
2056 }
2057
2058 return error;
2059}
2060
d0450948
CH
2061/*
2062 * Perform the transaction.
2063 *
2064 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2065 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2066 */
1da177e4
LT
2067STATIC int
2068xlog_recover_commit_trans(
ad223e60 2069 struct xlog *log,
d0450948 2070 struct xlog_recover *trans,
12818d24
BF
2071 int pass,
2072 struct list_head *buffer_list)
1da177e4 2073{
00574da1 2074 int error = 0;
00574da1
ZYW
2075 int items_queued = 0;
2076 struct xlog_recover_item *item;
2077 struct xlog_recover_item *next;
00574da1
ZYW
2078 LIST_HEAD (ra_list);
2079 LIST_HEAD (done_list);
2080
2081 #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
1da177e4 2082
39775431 2083 hlist_del_init(&trans->r_list);
d0450948
CH
2084
2085 error = xlog_recover_reorder_trans(log, trans, pass);
2086 if (error)
1da177e4 2087 return error;
d0450948 2088
00574da1 2089 list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3304a4fa
DW
2090 trace_xfs_log_recover_item_recover(log, trans, item, pass);
2091
43ff2122
CH
2092 switch (pass) {
2093 case XLOG_RECOVER_PASS1:
3304a4fa
DW
2094 if (item->ri_ops->commit_pass1)
2095 error = item->ri_ops->commit_pass1(log, item);
43ff2122
CH
2096 break;
2097 case XLOG_RECOVER_PASS2:
8ea5682d
DW
2098 if (item->ri_ops->ra_pass2)
2099 item->ri_ops->ra_pass2(log, item);
00574da1
ZYW
2100 list_move_tail(&item->ri_list, &ra_list);
2101 items_queued++;
2102 if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
2103 error = xlog_recover_items_pass2(log, trans,
12818d24 2104 buffer_list, &ra_list);
00574da1
ZYW
2105 list_splice_tail_init(&ra_list, &done_list);
2106 items_queued = 0;
2107 }
2108
43ff2122
CH
2109 break;
2110 default:
2111 ASSERT(0);
2112 }
2113
d0450948 2114 if (error)
43ff2122 2115 goto out;
d0450948
CH
2116 }
2117
00574da1
ZYW
2118out:
2119 if (!list_empty(&ra_list)) {
2120 if (!error)
2121 error = xlog_recover_items_pass2(log, trans,
12818d24 2122 buffer_list, &ra_list);
00574da1
ZYW
2123 list_splice_tail_init(&ra_list, &done_list);
2124 }
2125
2126 if (!list_empty(&done_list))
2127 list_splice_init(&done_list, &trans->r_itemq);
2128
12818d24 2129 return error;
1da177e4
LT
2130}
2131
76560669
DC
2132STATIC void
2133xlog_recover_add_item(
2134 struct list_head *head)
2135{
35f4521f 2136 struct xlog_recover_item *item;
76560669 2137
35f4521f 2138 item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
76560669
DC
2139 INIT_LIST_HEAD(&item->ri_list);
2140 list_add_tail(&item->ri_list, head);
2141}
2142
1da177e4 2143STATIC int
76560669
DC
2144xlog_recover_add_to_cont_trans(
2145 struct xlog *log,
2146 struct xlog_recover *trans,
b2a922cd 2147 char *dp,
76560669 2148 int len)
1da177e4 2149{
35f4521f 2150 struct xlog_recover_item *item;
b2a922cd 2151 char *ptr, *old_ptr;
76560669
DC
2152 int old_len;
2153
89cebc84
BF
2154 /*
2155 * If the transaction is empty, the header was split across this and the
2156 * previous record. Copy the rest of the header.
2157 */
76560669 2158 if (list_empty(&trans->r_itemq)) {
848ccfc8 2159 ASSERT(len <= sizeof(struct xfs_trans_header));
89cebc84
BF
2160 if (len > sizeof(struct xfs_trans_header)) {
2161 xfs_warn(log->l_mp, "%s: bad header length", __func__);
895e196f 2162 return -EFSCORRUPTED;
89cebc84
BF
2163 }
2164
76560669 2165 xlog_recover_add_item(&trans->r_itemq);
b2a922cd 2166 ptr = (char *)&trans->r_theader +
89cebc84 2167 sizeof(struct xfs_trans_header) - len;
76560669
DC
2168 memcpy(ptr, dp, len);
2169 return 0;
2170 }
89cebc84 2171
76560669 2172 /* take the tail entry */
35f4521f
DW
2173 item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2174 ri_list);
76560669
DC
2175
2176 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
2177 old_len = item->ri_buf[item->ri_cnt-1].i_len;
2178
707e0dda 2179 ptr = kmem_realloc(old_ptr, len + old_len, 0);
76560669
DC
2180 memcpy(&ptr[old_len], dp, len);
2181 item->ri_buf[item->ri_cnt-1].i_len += len;
2182 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
2183 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1da177e4
LT
2184 return 0;
2185}
2186
76560669
DC
2187/*
2188 * The next region to add is the start of a new region. It could be
2189 * a whole region or it could be the first part of a new region. Because
2190 * of this, the assumption here is that the type and size fields of all
2191 * format structures fit into the first 32 bits of the structure.
2192 *
2193 * This works because all regions must be 32 bit aligned. Therefore, we
2194 * either have both fields or we have neither field. In the case we have
2195 * neither field, the data part of the region is zero length. We only have
2196 * a log_op_header and can throw away the header since a new one will appear
2197 * later. If we have at least 4 bytes, then we can determine how many regions
2198 * will appear in the current log item.
2199 */
2200STATIC int
2201xlog_recover_add_to_trans(
2202 struct xlog *log,
2203 struct xlog_recover *trans,
b2a922cd 2204 char *dp,
76560669
DC
2205 int len)
2206{
06b11321 2207 struct xfs_inode_log_format *in_f; /* any will do */
35f4521f 2208 struct xlog_recover_item *item;
b2a922cd 2209 char *ptr;
76560669
DC
2210
2211 if (!len)
2212 return 0;
2213 if (list_empty(&trans->r_itemq)) {
2214 /* we need to catch log corruptions here */
2215 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
2216 xfs_warn(log->l_mp, "%s: bad header magic number",
2217 __func__);
2218 ASSERT(0);
895e196f 2219 return -EFSCORRUPTED;
76560669 2220 }
89cebc84
BF
2221
2222 if (len > sizeof(struct xfs_trans_header)) {
2223 xfs_warn(log->l_mp, "%s: bad header length", __func__);
2224 ASSERT(0);
895e196f 2225 return -EFSCORRUPTED;
89cebc84
BF
2226 }
2227
2228 /*
2229 * The transaction header can be arbitrarily split across op
2230 * records. If we don't have the whole thing here, copy what we
2231 * do have and handle the rest in the next record.
2232 */
2233 if (len == sizeof(struct xfs_trans_header))
76560669
DC
2234 xlog_recover_add_item(&trans->r_itemq);
2235 memcpy(&trans->r_theader, dp, len);
2236 return 0;
2237 }
2238
707e0dda 2239 ptr = kmem_alloc(len, 0);
76560669 2240 memcpy(ptr, dp, len);
06b11321 2241 in_f = (struct xfs_inode_log_format *)ptr;
76560669
DC
2242
2243 /* take the tail entry */
35f4521f
DW
2244 item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
2245 ri_list);
76560669
DC
2246 if (item->ri_total != 0 &&
2247 item->ri_total == item->ri_cnt) {
2248 /* tail item is in use, get a new one */
2249 xlog_recover_add_item(&trans->r_itemq);
2250 item = list_entry(trans->r_itemq.prev,
35f4521f 2251 struct xlog_recover_item, ri_list);
76560669
DC
2252 }
2253
2254 if (item->ri_total == 0) { /* first region to be added */
2255 if (in_f->ilf_size == 0 ||
2256 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
2257 xfs_warn(log->l_mp,
2258 "bad number of regions (%d) in inode log format",
2259 in_f->ilf_size);
2260 ASSERT(0);
2261 kmem_free(ptr);
895e196f 2262 return -EFSCORRUPTED;
76560669
DC
2263 }
2264
2265 item->ri_total = in_f->ilf_size;
2266 item->ri_buf =
2267 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
707e0dda 2268 0);
76560669 2269 }
d6abecb8
DW
2270
2271 if (item->ri_total <= item->ri_cnt) {
2272 xfs_warn(log->l_mp,
2273 "log item region count (%d) overflowed size (%d)",
2274 item->ri_cnt, item->ri_total);
2275 ASSERT(0);
2276 kmem_free(ptr);
2277 return -EFSCORRUPTED;
2278 }
2279
76560669
DC
2280 /* Description region is ri_buf[0] */
2281 item->ri_buf[item->ri_cnt].i_addr = ptr;
2282 item->ri_buf[item->ri_cnt].i_len = len;
2283 item->ri_cnt++;
2284 trace_xfs_log_recover_item_add(log, trans, item, 0);
2285 return 0;
2286}
b818cca1 2287
76560669
DC
2288/*
2289 * Free up any resources allocated by the transaction
2290 *
2291 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2292 */
2293STATIC void
2294xlog_recover_free_trans(
2295 struct xlog_recover *trans)
2296{
35f4521f 2297 struct xlog_recover_item *item, *n;
76560669
DC
2298 int i;
2299
39775431
BF
2300 hlist_del_init(&trans->r_list);
2301
76560669
DC
2302 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2303 /* Free the regions in the item. */
2304 list_del(&item->ri_list);
2305 for (i = 0; i < item->ri_cnt; i++)
2306 kmem_free(item->ri_buf[i].i_addr);
2307 /* Free the item itself */
2308 kmem_free(item->ri_buf);
2309 kmem_free(item);
2310 }
2311 /* Free the transaction recover structure */
2312 kmem_free(trans);
2313}
2314
e9131e50
DC
2315/*
2316 * On error or completion, trans is freed.
2317 */
1da177e4 2318STATIC int
eeb11688
DC
2319xlog_recovery_process_trans(
2320 struct xlog *log,
2321 struct xlog_recover *trans,
b2a922cd 2322 char *dp,
eeb11688
DC
2323 unsigned int len,
2324 unsigned int flags,
12818d24
BF
2325 int pass,
2326 struct list_head *buffer_list)
1da177e4 2327{
e9131e50
DC
2328 int error = 0;
2329 bool freeit = false;
eeb11688
DC
2330
2331 /* mask off ophdr transaction container flags */
2332 flags &= ~XLOG_END_TRANS;
2333 if (flags & XLOG_WAS_CONT_TRANS)
2334 flags &= ~XLOG_CONTINUE_TRANS;
2335
88b863db
DC
2336 /*
2337 * Callees must not free the trans structure. We'll decide if we need to
2338 * free it or not based on the operation being done and its result.
2339 */
eeb11688
DC
2340 switch (flags) {
2341 /* expected flag values */
2342 case 0:
2343 case XLOG_CONTINUE_TRANS:
2344 error = xlog_recover_add_to_trans(log, trans, dp, len);
2345 break;
2346 case XLOG_WAS_CONT_TRANS:
2347 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2348 break;
2349 case XLOG_COMMIT_TRANS:
12818d24
BF
2350 error = xlog_recover_commit_trans(log, trans, pass,
2351 buffer_list);
88b863db
DC
2352 /* success or fail, we are now done with this transaction. */
2353 freeit = true;
eeb11688
DC
2354 break;
2355
2356 /* unexpected flag values */
2357 case XLOG_UNMOUNT_TRANS:
e9131e50 2358 /* just skip trans */
eeb11688 2359 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
e9131e50 2360 freeit = true;
eeb11688
DC
2361 break;
2362 case XLOG_START_TRANS:
eeb11688
DC
2363 default:
2364 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2365 ASSERT(0);
895e196f 2366 error = -EFSCORRUPTED;
eeb11688
DC
2367 break;
2368 }
e9131e50
DC
2369 if (error || freeit)
2370 xlog_recover_free_trans(trans);
eeb11688
DC
2371 return error;
2372}
2373
b818cca1
DC
2374/*
2375 * Lookup the transaction recovery structure associated with the ID in the
2376 * current ophdr. If the transaction doesn't exist and the start flag is set in
2377 * the ophdr, then allocate a new transaction for future ID matches to find.
2378 * Either way, return what we found during the lookup - an existing transaction
2379 * or nothing.
2380 */
eeb11688
DC
2381STATIC struct xlog_recover *
2382xlog_recover_ophdr_to_trans(
2383 struct hlist_head rhash[],
2384 struct xlog_rec_header *rhead,
2385 struct xlog_op_header *ohead)
2386{
2387 struct xlog_recover *trans;
2388 xlog_tid_t tid;
2389 struct hlist_head *rhp;
2390
2391 tid = be32_to_cpu(ohead->oh_tid);
2392 rhp = &rhash[XLOG_RHASH(tid)];
b818cca1
DC
2393 hlist_for_each_entry(trans, rhp, r_list) {
2394 if (trans->r_log_tid == tid)
2395 return trans;
2396 }
eeb11688
DC
2397
2398 /*
b818cca1
DC
2399 * skip over non-start transaction headers - we could be
2400 * processing slack space before the next transaction starts
2401 */
2402 if (!(ohead->oh_flags & XLOG_START_TRANS))
2403 return NULL;
2404
2405 ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2406
2407 /*
2408 * This is a new transaction so allocate a new recovery container to
2409 * hold the recovery ops that will follow.
2410 */
707e0dda 2411 trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
b818cca1
DC
2412 trans->r_log_tid = tid;
2413 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2414 INIT_LIST_HEAD(&trans->r_itemq);
2415 INIT_HLIST_NODE(&trans->r_list);
2416 hlist_add_head(&trans->r_list, rhp);
2417
2418 /*
2419 * Nothing more to do for this ophdr. Items to be added to this new
2420 * transaction will be in subsequent ophdr containers.
eeb11688 2421 */
eeb11688
DC
2422 return NULL;
2423}
2424
2425STATIC int
2426xlog_recover_process_ophdr(
2427 struct xlog *log,
2428 struct hlist_head rhash[],
2429 struct xlog_rec_header *rhead,
2430 struct xlog_op_header *ohead,
b2a922cd
CH
2431 char *dp,
2432 char *end,
12818d24
BF
2433 int pass,
2434 struct list_head *buffer_list)
eeb11688
DC
2435{
2436 struct xlog_recover *trans;
eeb11688 2437 unsigned int len;
12818d24 2438 int error;
eeb11688
DC
2439
2440 /* Do we understand who wrote this op? */
2441 if (ohead->oh_clientid != XFS_TRANSACTION &&
2442 ohead->oh_clientid != XFS_LOG) {
2443 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2444 __func__, ohead->oh_clientid);
2445 ASSERT(0);
895e196f 2446 return -EFSCORRUPTED;
eeb11688
DC
2447 }
2448
2449 /*
2450 * Check the ophdr contains all the data it is supposed to contain.
2451 */
2452 len = be32_to_cpu(ohead->oh_len);
2453 if (dp + len > end) {
2454 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2455 WARN_ON(1);
895e196f 2456 return -EFSCORRUPTED;
eeb11688
DC
2457 }
2458
2459 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2460 if (!trans) {
2461 /* nothing to do, so skip over this ophdr */
2462 return 0;
2463 }
2464
12818d24
BF
2465 /*
2466 * The recovered buffer queue is drained only once we know that all
2467 * recovery items for the current LSN have been processed. This is
2468 * required because:
2469 *
2470 * - Buffer write submission updates the metadata LSN of the buffer.
2471 * - Log recovery skips items with a metadata LSN >= the current LSN of
2472 * the recovery item.
2473 * - Separate recovery items against the same metadata buffer can share
2474 * a current LSN. I.e., consider that the LSN of a recovery item is
2475 * defined as the starting LSN of the first record in which its
2476 * transaction appears, that a record can hold multiple transactions,
2477 * and/or that a transaction can span multiple records.
2478 *
2479 * In other words, we are allowed to submit a buffer from log recovery
2480 * once per current LSN. Otherwise, we may incorrectly skip recovery
2481 * items and cause corruption.
2482 *
2483 * We don't know up front whether buffers are updated multiple times per
2484 * LSN. Therefore, track the current LSN of each commit log record as it
2485 * is processed and drain the queue when it changes. Use commit records
2486 * because they are ordered correctly by the logging code.
2487 */
2488 if (log->l_recovery_lsn != trans->r_lsn &&
2489 ohead->oh_flags & XLOG_COMMIT_TRANS) {
2490 error = xfs_buf_delwri_submit(buffer_list);
2491 if (error)
2492 return error;
2493 log->l_recovery_lsn = trans->r_lsn;
2494 }
2495
e9131e50 2496 return xlog_recovery_process_trans(log, trans, dp, len,
12818d24 2497 ohead->oh_flags, pass, buffer_list);
1da177e4
LT
2498}
2499
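/*
 * For illustration of the draining rule above: suppose two transactions
 * share commit record LSN 100 and both modify the same buffer, and a third
 * transaction at LSN 200 modifies it again.  buffer_list is not submitted
 * between the two LSN 100 transactions, since that would stamp the buffer
 * with metadata LSN 100 and make the second transaction's updates look
 * already applied (metadata LSN >= item LSN).  It is drained when the
 * LSN 200 commit record is seen, and replaying the LSN 200 items is still
 * correct because the buffer's on-disk metadata LSN (100) is older.
 */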
2500/*
2501 * There are two valid states of the r_state field. 0 indicates that the
2502 * transaction structure is in a normal state. We have either seen the
2503 * start of the transaction or the last operation we added was not a partial
2504 * operation. If the last operation we added to the transaction was a
2505 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2506 *
2507 * NOTE: skip LRs with 0 data length.
2508 */
2509STATIC int
2510xlog_recover_process_data(
9a8d2fdb 2511 struct xlog *log,
f0a76953 2512 struct hlist_head rhash[],
9a8d2fdb 2513 struct xlog_rec_header *rhead,
b2a922cd 2514 char *dp,
12818d24
BF
2515 int pass,
2516 struct list_head *buffer_list)
1da177e4 2517{
eeb11688 2518 struct xlog_op_header *ohead;
b2a922cd 2519 char *end;
1da177e4 2520 int num_logops;
1da177e4 2521 int error;
1da177e4 2522
eeb11688 2523 end = dp + be32_to_cpu(rhead->h_len);
b53e675d 2524 num_logops = be32_to_cpu(rhead->h_num_logops);
1da177e4
LT
2525
2526 /* check the log format matches our own - else we can't recover */
2527 if (xlog_header_check_recover(log->l_mp, rhead))
2451337d 2528 return -EIO;
1da177e4 2529
5cd9cee9 2530 trace_xfs_log_recover_record(log, rhead, pass);
eeb11688
DC
2531 while ((dp < end) && num_logops) {
2532
2533 ohead = (struct xlog_op_header *)dp;
2534 dp += sizeof(*ohead);
2535 ASSERT(dp <= end);
2536
2537 /* errors will abort recovery */
2538 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
12818d24 2539 dp, end, pass, buffer_list);
eeb11688
DC
2540 if (error)
2541 return error;
2542
67fcb7bf 2543 dp += be32_to_cpu(ohead->oh_len);
1da177e4
LT
2544 num_logops--;
2545 }
2546 return 0;
2547}
2548
f997ee21
DW
2549/* Recover the CUI if necessary. */
2550STATIC int
2551xlog_recover_process_cui(
fbfa977d 2552 struct xfs_trans *parent_tp,
f997ee21 2553 struct xfs_ail *ailp,
fbfa977d 2554 struct xfs_log_item *lip)
f997ee21
DW
2555{
2556 struct xfs_cui_log_item *cuip;
2557 int error;
2558
2559 /*
2560 * Skip CUIs that we've already processed.
2561 */
2562 cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
2563 if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
2564 return 0;
2565
57e80956 2566 spin_unlock(&ailp->ail_lock);
fbfa977d 2567 error = xfs_cui_recover(parent_tp, cuip);
57e80956 2568 spin_lock(&ailp->ail_lock);
f997ee21
DW
2569
2570 return error;
2571}
2572
2573/* Release the CUI since we're cancelling everything. */
2574STATIC void
2575xlog_recover_cancel_cui(
2576 struct xfs_mount *mp,
2577 struct xfs_ail *ailp,
2578 struct xfs_log_item *lip)
2579{
2580 struct xfs_cui_log_item *cuip;
2581
2582 cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
2583
57e80956 2584 spin_unlock(&ailp->ail_lock);
f997ee21 2585 xfs_cui_release(cuip);
57e80956 2586 spin_lock(&ailp->ail_lock);
f997ee21
DW
2587}
2588
77d61fe4
DW
2589/* Recover the BUI if necessary. */
2590STATIC int
2591xlog_recover_process_bui(
fbfa977d 2592 struct xfs_trans *parent_tp,
77d61fe4 2593 struct xfs_ail *ailp,
fbfa977d 2594 struct xfs_log_item *lip)
77d61fe4
DW
2595{
2596 struct xfs_bui_log_item *buip;
2597 int error;
2598
2599 /*
2600 * Skip BUIs that we've already processed.
2601 */
2602 buip = container_of(lip, struct xfs_bui_log_item, bui_item);
2603 if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
2604 return 0;
2605
57e80956 2606 spin_unlock(&ailp->ail_lock);
fbfa977d 2607 error = xfs_bui_recover(parent_tp, buip);
57e80956 2608 spin_lock(&ailp->ail_lock);
77d61fe4
DW
2609
2610 return error;
2611}
2612
2613/* Release the BUI since we're cancelling everything. */
2614STATIC void
2615xlog_recover_cancel_bui(
2616 struct xfs_mount *mp,
2617 struct xfs_ail *ailp,
2618 struct xfs_log_item *lip)
2619{
2620 struct xfs_bui_log_item *buip;
2621
2622 buip = container_of(lip, struct xfs_bui_log_item, bui_item);
2623
57e80956 2624 spin_unlock(&ailp->ail_lock);
77d61fe4 2625 xfs_bui_release(buip);
57e80956 2626 spin_lock(&ailp->ail_lock);
77d61fe4
DW
2627}
2628
dc42375d
DW
2629/* Is this log item a deferred action intent? */
2630static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
2631{
2632 switch (lip->li_type) {
2633 case XFS_LI_EFI:
9e88b5d8 2634 case XFS_LI_RUI:
f997ee21 2635 case XFS_LI_CUI:
77d61fe4 2636 case XFS_LI_BUI:
dc42375d
DW
2637 return true;
2638 default:
2639 return false;
2640 }
1da177e4
LT
2641}
2642
50995582
DW
2643/* Take all the collected deferred ops and finish them in order. */
2644static int
2645xlog_finish_defer_ops(
fbfa977d 2646 struct xfs_trans *parent_tp)
50995582 2647{
fbfa977d 2648 struct xfs_mount *mp = parent_tp->t_mountp;
50995582
DW
2649 struct xfs_trans *tp;
2650 int64_t freeblks;
2651 uint resblks;
2652 int error;
2653
2654 /*
2655 * We're finishing the defer_ops that accumulated as a result of
2656 * recovering unfinished intent items during log recovery. We
2657 * reserve an itruncate transaction because it is the largest
2658 * permanent transaction type. Since we're the only user of the fs
2659 * right now, take 93% (15/16) of the available free blocks. Use
2660 * weird math to avoid a 64-bit division.
2661 */
2662 freeblks = percpu_counter_sum(&mp->m_fdblocks);
2663 if (freeblks <= 0)
2664 return -ENOSPC;
2665 resblks = min_t(int64_t, UINT_MAX, freeblks);
2666 resblks = (resblks * 15) >> 4;
2667 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
2668 0, XFS_TRANS_RESERVE, &tp);
2669 if (error)
2670 return error;
91ef75b6 2671 /* transfer all collected dfops to this transaction */
ce356d64 2672 xfs_defer_move(tp, parent_tp);
50995582 2673
50995582 2674 return xfs_trans_commit(tp);
50995582
DW
2675}
2676
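/*
 * Worked example of the reservation sizing in xlog_finish_defer_ops()
 * (hypothetical free space): with 1048576 free blocks,
 *
 *	resblks = min_t(int64_t, UINT_MAX, 1048576) = 1048576
 *	resblks = (1048576 * 15) >> 4               = 983040
 *
 * i.e. 15/16 (93.75%) of the free blocks, computed with a multiply and a
 * shift so that no 64-bit division is required.
 */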
1da177e4 2677/*
dc42375d
DW
2678 * When this is called, all of the log intent items which did not have
2679 * corresponding log done items should be in the AIL. What we do now
2680 * is update the data structures associated with each one.
1da177e4 2681 *
dc42375d
DW
2682 * Since we process the log intent items in normal transactions, they
2683 * will be removed at some point after the commit. This prevents us
2684 * from just walking down the list processing each one. We'll use a
2685 * flag in the intent item to skip those that we've already processed
2686 * and use the AIL iteration mechanism's generation count to try to
2687 * speed this up at least a bit.
1da177e4 2688 *
dc42375d
DW
2689 * When we start, we know that the intents are the only things in the
2690 * AIL. As we process them, however, other items are added to the
2691 * AIL.
1da177e4 2692 */
3c1e2bbe 2693STATIC int
dc42375d 2694xlog_recover_process_intents(
f0b2efad 2695 struct xlog *log)
1da177e4 2696{
fbfa977d 2697 struct xfs_trans *parent_tp;
27d8d5fe 2698 struct xfs_ail_cursor cur;
50995582 2699 struct xfs_log_item *lip;
a9c21c1b 2700 struct xfs_ail *ailp;
fbfa977d 2701 int error;
7bf7a193 2702#if defined(DEBUG) || defined(XFS_WARN)
dc42375d 2703 xfs_lsn_t last_lsn;
7bf7a193 2704#endif
1da177e4 2705
fbfa977d
BF
2706 /*
2707 * The intent recovery handlers commit transactions to complete recovery
2708 * for individual intents, but any new deferred operations that are
2709 * queued during that process are held off until the very end. The
2710 * purpose of this transaction is to serve as a container for deferred
2711 * operations. Each intent recovery handler must transfer dfops here
2712 * before its local transaction commits, and we'll finish the entire
2713 * list below.
2714 */
2715 error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
2716 if (error)
2717 return error;
2718
a9c21c1b 2719 ailp = log->l_ailp;
57e80956 2720 spin_lock(&ailp->ail_lock);
a9c21c1b 2721 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
7bf7a193 2722#if defined(DEBUG) || defined(XFS_WARN)
dc42375d 2723 last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
7bf7a193 2724#endif
1da177e4
LT
2725 while (lip != NULL) {
2726 /*
dc42375d
DW
2727 * We're done when we see something other than an intent.
2728 * There should be no intents left in the AIL now.
1da177e4 2729 */
dc42375d 2730 if (!xlog_item_is_intent(lip)) {
27d8d5fe 2731#ifdef DEBUG
a9c21c1b 2732 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
dc42375d 2733 ASSERT(!xlog_item_is_intent(lip));
27d8d5fe 2734#endif
1da177e4
LT
2735 break;
2736 }
2737
2738 /*
dc42375d
DW
2739 * We should never see a redo item with a LSN higher than
2740 * the last transaction we found in the log at the start
2741 * of recovery.
1da177e4 2742 */
dc42375d 2743 ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
1da177e4 2744
50995582
DW
2745 /*
2746 * NOTE: If your intent processing routine can create more
2747 * deferred ops, you /must/ attach them to the dfops in this
2748 * routine or else those subsequent intents will get
2749 * replayed in the wrong order!
2750 */
dc42375d 2751 switch (lip->li_type) {
f997ee21 2752 case XFS_LI_CUI:
fbfa977d 2753 error = xlog_recover_process_cui(parent_tp, ailp, lip);
f997ee21 2754 break;
77d61fe4 2755 case XFS_LI_BUI:
fbfa977d 2756 error = xlog_recover_process_bui(parent_tp, ailp, lip);
77d61fe4 2757 break;
10d0c6e0
DW
2758 default:
2759 error = lip->li_ops->iop_recover(lip, parent_tp);
2760 break;
dc42375d 2761 }
27d8d5fe
DC
2762 if (error)
2763 goto out;
a9c21c1b 2764 lip = xfs_trans_ail_cursor_next(ailp, &cur);
1da177e4 2765 }
27d8d5fe 2766out:
e4a1e29c 2767 xfs_trans_ail_cursor_done(&cur);
57e80956 2768 spin_unlock(&ailp->ail_lock);
fbfa977d
BF
2769 if (!error)
2770 error = xlog_finish_defer_ops(parent_tp);
2771 xfs_trans_cancel(parent_tp);
50995582 2772
3c1e2bbe 2773 return error;
1da177e4
LT
2774}
2775
f0b2efad 2776/*
dc42375d
DW
2777 * A cancel occurs when the mount has failed and we're bailing out.
2778 * Release all pending log intent items so they don't pin the AIL.
f0b2efad 2779 */
a7a9250e 2780STATIC void
dc42375d 2781xlog_recover_cancel_intents(
f0b2efad
BF
2782 struct xlog *log)
2783{
2784 struct xfs_log_item *lip;
f0b2efad
BF
2785 struct xfs_ail_cursor cur;
2786 struct xfs_ail *ailp;
2787
2788 ailp = log->l_ailp;
57e80956 2789 spin_lock(&ailp->ail_lock);
f0b2efad
BF
2790 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2791 while (lip != NULL) {
2792 /*
dc42375d
DW
2793 * We're done when we see something other than an intent.
2794 * There should be no intents left in the AIL now.
f0b2efad 2795 */
dc42375d 2796 if (!xlog_item_is_intent(lip)) {
f0b2efad
BF
2797#ifdef DEBUG
2798 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
dc42375d 2799 ASSERT(!xlog_item_is_intent(lip));
f0b2efad
BF
2800#endif
2801 break;
2802 }
2803
dc42375d 2804 switch (lip->li_type) {
f997ee21
DW
2805 case XFS_LI_CUI:
2806 xlog_recover_cancel_cui(log->l_mp, ailp, lip);
2807 break;
77d61fe4
DW
2808 case XFS_LI_BUI:
2809 xlog_recover_cancel_bui(log->l_mp, ailp, lip);
2810 break;
10d0c6e0
DW
2811 default:
2812 spin_unlock(&ailp->ail_lock);
2813 lip->li_ops->iop_release(lip);
2814 spin_lock(&ailp->ail_lock);
2815 break;
dc42375d 2816 }
f0b2efad
BF
2817
2818 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2819 }
2820
2821 xfs_trans_ail_cursor_done(&cur);
57e80956 2822 spin_unlock(&ailp->ail_lock);
f0b2efad
BF
2823}
2824
1da177e4
LT
2825/*
2826 * This routine performs a transaction to null out a bad inode pointer
2827 * in an agi unlinked inode hash bucket.
2828 */
2829STATIC void
2830xlog_recover_clear_agi_bucket(
2831 xfs_mount_t *mp,
2832 xfs_agnumber_t agno,
2833 int bucket)
2834{
2835 xfs_trans_t *tp;
2836 xfs_agi_t *agi;
2837 xfs_buf_t *agibp;
2838 int offset;
2839 int error;
2840
253f4911 2841 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
e5720eec 2842 if (error)
253f4911 2843 goto out_error;
1da177e4 2844
5e1be0fb
CH
2845 error = xfs_read_agi(mp, tp, agno, &agibp);
2846 if (error)
e5720eec 2847 goto out_abort;
1da177e4 2848
370c782b 2849 agi = agibp->b_addr;
16259e7d 2850 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
1da177e4
LT
2851 offset = offsetof(xfs_agi_t, agi_unlinked) +
2852 (sizeof(xfs_agino_t) * bucket);
2853 xfs_trans_log_buf(tp, agibp, offset,
2854 (offset + sizeof(xfs_agino_t) - 1));
2855
70393313 2856 error = xfs_trans_commit(tp);
e5720eec
DC
2857 if (error)
2858 goto out_error;
2859 return;
2860
2861out_abort:
4906e215 2862 xfs_trans_cancel(tp);
e5720eec 2863out_error:
a0fa2b67 2864 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
e5720eec 2865 return;
1da177e4
LT
2866}
2867
23fac50f
CH
2868STATIC xfs_agino_t
2869xlog_recover_process_one_iunlink(
2870 struct xfs_mount *mp,
2871 xfs_agnumber_t agno,
2872 xfs_agino_t agino,
2873 int bucket)
2874{
2875 struct xfs_buf *ibp;
2876 struct xfs_dinode *dip;
2877 struct xfs_inode *ip;
2878 xfs_ino_t ino;
2879 int error;
2880
2881 ino = XFS_AGINO_TO_INO(mp, agno, agino);
7b6259e7 2882 error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
23fac50f
CH
2883 if (error)
2884 goto fail;
2885
2886 /*
2887 * Get the on disk inode to find the next inode in the bucket.
2888 */
c1995079 2889 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0);
23fac50f 2890 if (error)
0e446673 2891 goto fail_iput;
23fac50f 2892
17c12bcd 2893 xfs_iflags_clear(ip, XFS_IRECOVERY);
54d7b5c1 2894 ASSERT(VFS_I(ip)->i_nlink == 0);
c19b3b05 2895 ASSERT(VFS_I(ip)->i_mode != 0);
23fac50f
CH
2896
2897 /* setup for the next pass */
2898 agino = be32_to_cpu(dip->di_next_unlinked);
2899 xfs_buf_relse(ibp);
2900
2901 /*
2902 * Prevent any DMAPI event from being sent when the reference on
2903 * the inode is dropped.
2904 */
2905 ip->i_d.di_dmevmask = 0;
2906
44a8736b 2907 xfs_irele(ip);
23fac50f
CH
2908 return agino;
2909
0e446673 2910 fail_iput:
44a8736b 2911 xfs_irele(ip);
23fac50f
CH
2912 fail:
2913 /*
2914 * We can't read in the inode this bucket points to, or this inode
2915 * is messed up. Just ditch this bucket of inodes. We will lose
2916 * some inodes and space, but at least we won't hang.
2917 *
2918 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
2919 * clear the inode pointer in the bucket.
2920 */
2921 xlog_recover_clear_agi_bucket(mp, agno, bucket);
2922 return NULLAGINO;
2923}
2924
1da177e4 2925/*
8ab39f11 2926 * Recover AGI unlinked lists
1da177e4 2927 *
8ab39f11
DC
2928 * This is called during recovery to process any inodes which we unlinked but
2929 * did not free when the system crashed. These inodes will be on the lists in the
2930 * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
2931 * any inodes found on the lists. Each inode is removed from the lists when it
2932 * has been fully truncated and is freed. The freeing of the inode and its
2933 * removal from the list must be atomic.
2934 *
2935 * If everything we touch in the agi processing loop is already in memory, this
2936 * loop can hold the cpu for a long time. It runs without lock contention,
2937 * memory allocation contention, the need to wait for IO, etc., and so will run
2938 * until we run out of inodes to process, run low on memory, or run out
2939 * of log space.
2940 *
2941 * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2942 * and can prevent other filesystem work (such as CIL pushes) from running. This
2943 * can lead to deadlocks if the recovery process runs out of log reservation
2944 * space. Hence we need to yield the CPU when there is other kernel work
2945 * scheduled on this CPU to ensure other scheduled work can run without undue
2946 * latency.
1da177e4 2947 */
d96f8f89 2948STATIC void
1da177e4 2949xlog_recover_process_iunlinks(
9a8d2fdb 2950 struct xlog *log)
1da177e4
LT
2951{
2952 xfs_mount_t *mp;
2953 xfs_agnumber_t agno;
2954 xfs_agi_t *agi;
2955 xfs_buf_t *agibp;
1da177e4 2956 xfs_agino_t agino;
1da177e4
LT
2957 int bucket;
2958 int error;
1da177e4
LT
2959
2960 mp = log->l_mp;
2961
1da177e4
LT
2962 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
2963 /*
2964 * Find the agi for this ag.
2965 */
5e1be0fb
CH
2966 error = xfs_read_agi(mp, NULL, agno, &agibp);
2967 if (error) {
2968 /*
2969 * AGI is b0rked. Don't process it.
2970 *
2971 * We should probably mark the filesystem as corrupt
2972 * after we've recovered all the ag's we can....
2973 */
2974 continue;
1da177e4 2975 }
d97d32ed
JK
2976 /*
2977 * Unlock the buffer so that it can be acquired in the normal
2978 * course of the transaction to truncate and free each inode.
2979 * Because we are not racing with anyone else here for the AGI
2980 * buffer, we don't even need to hold it locked to read the
2981 * initial unlinked bucket entries out of the buffer. We keep
2982 * buffer reference though, so that it stays pinned in memory
2983 * while we need the buffer.
2984 */
370c782b 2985 agi = agibp->b_addr;
d97d32ed 2986 xfs_buf_unlock(agibp);
1da177e4
LT
2987
2988 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
16259e7d 2989 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
1da177e4 2990 while (agino != NULLAGINO) {
23fac50f
CH
2991 agino = xlog_recover_process_one_iunlink(mp,
2992 agno, agino, bucket);
8ab39f11 2993 cond_resched();
1da177e4
LT
2994 }
2995 }
d97d32ed 2996 xfs_buf_rele(agibp);
1da177e4 2997 }
1da177e4
LT
2998}
2999
91083269 3000STATIC void
1da177e4 3001xlog_unpack_data(
9a8d2fdb 3002 struct xlog_rec_header *rhead,
b2a922cd 3003 char *dp,
9a8d2fdb 3004 struct xlog *log)
1da177e4
LT
3005{
3006 int i, j, k;
1da177e4 3007
b53e675d 3008 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1da177e4 3009 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
b53e675d 3010 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1da177e4
LT
3011 dp += BBSIZE;
3012 }
3013
62118709 3014 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
b28708d6 3015 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
b53e675d 3016 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1da177e4
LT
3017 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3018 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
b53e675d 3019 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1da177e4
LT
3020 dp += BBSIZE;
3021 }
3022 }
1da177e4
LT
3023}
3024
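/*
 * For illustration: on the write side, xlog_pack_data() replaces the first
 * __be32 of every 512 byte basic block of the payload with the cycle
 * number (so torn writes can be detected) and stashes the overwritten
 * words in rhead->h_cycle_data[], spilling into the extended headers for
 * v2 logs whose records exceed 32k.  For a three block record,
 * xlog_unpack_data() above simply puts h_cycle_data[0..2] back at offsets
 * 0, 512 and 1024 of dp, restoring the payload the caller originally
 * logged.
 */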
9d94901f 3025/*
b94fb2d1 3026 * CRC check, unpack and process a log record.
9d94901f
BF
3027 */
3028STATIC int
3029xlog_recover_process(
3030 struct xlog *log,
3031 struct hlist_head rhash[],
3032 struct xlog_rec_header *rhead,
3033 char *dp,
12818d24
BF
3034 int pass,
3035 struct list_head *buffer_list)
9d94901f 3036{
cae028df 3037 __le32 old_crc = rhead->h_crc;
b94fb2d1
BF
3038 __le32 crc;
3039
6528250b
BF
3040 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
3041
b94fb2d1 3042 /*
6528250b
BF
3043 * Nothing else to do if this is a CRC verification pass. Just return
3044 * if this is a record with a non-zero crc. Unfortunately, mkfs always
cae028df 3045 * sets old_crc to 0 so we must consider this valid even on v5 supers.
6528250b
BF
3046 * Otherwise, return EFSBADCRC on failure so the callers up the stack
3047 * know precisely what failed.
3048 */
3049 if (pass == XLOG_RECOVER_CRCPASS) {
cae028df 3050 if (old_crc && crc != old_crc)
6528250b
BF
3051 return -EFSBADCRC;
3052 return 0;
3053 }
3054
3055 /*
3056 * We're in the normal recovery path. Issue a warning if and only if the
3057 * CRC in the header is non-zero. This is an advisory warning and the
3058 * zero CRC check prevents warnings from being emitted when upgrading
3059 * the kernel from one that does not add CRCs by default.
b94fb2d1 3060 */
cae028df
DC
3061 if (crc != old_crc) {
3062 if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
b94fb2d1
BF
3063 xfs_alert(log->l_mp,
3064 "log record CRC mismatch: found 0x%x, expected 0x%x.",
cae028df 3065 le32_to_cpu(old_crc),
b94fb2d1
BF
3066 le32_to_cpu(crc));
3067 xfs_hex_dump(dp, 32);
3068 }
3069
3070 /*
3071 * If the filesystem is CRC enabled, this mismatch becomes a
3072 * fatal log corruption failure.
3073 */
a5155b87
DW
3074 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
3075 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
b94fb2d1 3076 return -EFSCORRUPTED;
a5155b87 3077 }
b94fb2d1 3078 }
9d94901f 3079
91083269 3080 xlog_unpack_data(rhead, dp, log);
9d94901f 3081
12818d24
BF
3082 return xlog_recover_process_data(log, rhash, rhead, dp, pass,
3083 buffer_list);
9d94901f
BF
3084}
3085
1da177e4
LT
3086STATIC int
3087xlog_valid_rec_header(
9a8d2fdb
MT
3088 struct xlog *log,
3089 struct xlog_rec_header *rhead,
1da177e4
LT
3090 xfs_daddr_t blkno)
3091{
3092 int hlen;
3093
a71895c5
DW
3094 if (XFS_IS_CORRUPT(log->l_mp,
3095 rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
2451337d 3096 return -EFSCORRUPTED;
a71895c5
DW
3097 if (XFS_IS_CORRUPT(log->l_mp,
3098 (!rhead->h_version ||
3099 (be32_to_cpu(rhead->h_version) &
3100 (~XLOG_VERSION_OKBITS))))) {
a0fa2b67 3101 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
34a622b2 3102 __func__, be32_to_cpu(rhead->h_version));
895e196f 3103 return -EFSCORRUPTED;
1da177e4
LT
3104 }
3105
3106 /* LR body must have data or it wouldn't have been written */
b53e675d 3107 hlen = be32_to_cpu(rhead->h_len);
a71895c5 3108 if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX))
2451337d 3109 return -EFSCORRUPTED;
a71895c5
DW
3110 if (XFS_IS_CORRUPT(log->l_mp,
3111 blkno > log->l_logBBsize || blkno > INT_MAX))
2451337d 3112 return -EFSCORRUPTED;
1da177e4
LT
3113 return 0;
3114}
3115
3116/*
3117 * Read the log from tail to head and process the log records found.
3118 * Handle the two cases where the tail and head are in the same cycle
3119 * and where the active portion of the log wraps around the end of
3120 * the physical log separately. The pass parameter is passed through
3121 * to the routines called to process the data and is not looked at
3122 * here.
3123 */
3124STATIC int
3125xlog_do_recovery_pass(
9a8d2fdb 3126 struct xlog *log,
1da177e4
LT
3127 xfs_daddr_t head_blk,
3128 xfs_daddr_t tail_blk,
d7f37692
BF
3129 int pass,
3130 xfs_daddr_t *first_bad) /* out: first bad log rec */
1da177e4
LT
3131{
3132 xlog_rec_header_t *rhead;
284f1c2c 3133 xfs_daddr_t blk_no, rblk_no;
d7f37692 3134 xfs_daddr_t rhead_blk;
b2a922cd 3135 char *offset;
6ad5b325 3136 char *hbp, *dbp;
a70f9fe5 3137 int error = 0, h_size, h_len;
12818d24 3138 int error2 = 0;
1da177e4
LT
3139 int bblks, split_bblks;
3140 int hblks, split_hblks, wrapped_hblks;
39775431 3141 int i;
f0a76953 3142 struct hlist_head rhash[XLOG_RHASH_SIZE];
12818d24 3143 LIST_HEAD (buffer_list);
1da177e4
LT
3144
3145 ASSERT(head_blk != tail_blk);
a4c9b34d 3146 blk_no = rhead_blk = tail_blk;
1da177e4 3147
39775431
BF
3148 for (i = 0; i < XLOG_RHASH_SIZE; i++)
3149 INIT_HLIST_HEAD(&rhash[i]);
3150
1da177e4
LT
3151 /*
3152 * Read the header of the tail block and get the iclog buffer size from
3153 * h_size. Use this to tell how many sectors make up the log header.
3154 */
62118709 3155 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1da177e4
LT
3156 /*
3157 * When using variable length iclogs, read first sector of
3158 * iclog header and extract the header size from it. Get a
3159 * new hbp that is the correct size.
3160 */
6e9b3dd8 3161 hbp = xlog_alloc_buffer(log, 1);
1da177e4 3162 if (!hbp)
2451337d 3163 return -ENOMEM;
076e6acb
CH
3164
3165 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3166 if (error)
1da177e4 3167 goto bread_err1;
076e6acb 3168
1da177e4
LT
3169 rhead = (xlog_rec_header_t *)offset;
3170 error = xlog_valid_rec_header(log, rhead, tail_blk);
3171 if (error)
3172 goto bread_err1;
a70f9fe5
BF
3173
3174 /*
3175 * xfsprogs has a bug where record length is based on lsunit but
3176 * h_size (iclog size) is hardcoded to 32k. Now that we
3177 * unconditionally CRC verify the unmount record, this means the
3178 * log buffer can be too small for the record and cause an
3179 * overrun.
3180 *
3181 * Detect this condition here. Use lsunit for the buffer size as
3182 * long as this looks like the mkfs case. Otherwise, return an
3183 * error to avoid a buffer overrun.
3184 */
b53e675d 3185 h_size = be32_to_cpu(rhead->h_size);
a70f9fe5
BF
3186 h_len = be32_to_cpu(rhead->h_len);
3187 if (h_len > h_size) {
3188 if (h_len <= log->l_mp->m_logbsize &&
3189 be32_to_cpu(rhead->h_num_logops) == 1) {
3190 xfs_warn(log->l_mp,
3191 "invalid iclog size (%d bytes), using lsunit (%d bytes)",
3192 h_size, log->l_mp->m_logbsize);
3193 h_size = log->l_mp->m_logbsize;
a5155b87
DW
3194 } else {
3195 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
3196 log->l_mp);
050552cb
DW
3197 error = -EFSCORRUPTED;
3198 goto bread_err1;
a5155b87 3199 }
a70f9fe5
BF
3200 }
3201
b53e675d 3202 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1da177e4
LT
3203 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3204 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3205 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3206 hblks++;
6ad5b325 3207 kmem_free(hbp);
6e9b3dd8 3208 hbp = xlog_alloc_buffer(log, hblks);
1da177e4
LT
3209 } else {
3210 hblks = 1;
3211 }
3212 } else {
69ce58f0 3213 ASSERT(log->l_sectBBsize == 1);
1da177e4 3214 hblks = 1;
6e9b3dd8 3215 hbp = xlog_alloc_buffer(log, 1);
1da177e4
LT
3216 h_size = XLOG_BIG_RECORD_BSIZE;
3217 }
3218
3219 if (!hbp)
2451337d 3220 return -ENOMEM;
6e9b3dd8 3221 dbp = xlog_alloc_buffer(log, BTOBB(h_size));
1da177e4 3222 if (!dbp) {
6ad5b325 3223 kmem_free(hbp);
2451337d 3224 return -ENOMEM;
1da177e4
LT
3225 }
3226
3227 memset(rhash, 0, sizeof(rhash));
970fd3f0 3228 if (tail_blk > head_blk) {
1da177e4
LT
3229 /*
3230 * Perform recovery around the end of the physical log.
3231 * When the head is not on the same cycle number as the tail,
970fd3f0 3232 * we can't do a sequential recovery.
1da177e4 3233 */
1da177e4
LT
3234 while (blk_no < log->l_logBBsize) {
3235 /*
3236 * Check for header wrapping around physical end-of-log
3237 */
6ad5b325 3238 offset = hbp;
1da177e4
LT
3239 split_hblks = 0;
3240 wrapped_hblks = 0;
3241 if (blk_no + hblks <= log->l_logBBsize) {
3242 /* Read header in one read */
076e6acb
CH
3243 error = xlog_bread(log, blk_no, hblks, hbp,
3244 &offset);
1da177e4
LT
3245 if (error)
3246 goto bread_err2;
1da177e4
LT
3247 } else {
3248 /* This LR is split across physical log end */
3249 if (blk_no != log->l_logBBsize) {
3250 /* some data before physical log end */
3251 ASSERT(blk_no <= INT_MAX);
3252 split_hblks = log->l_logBBsize - (int)blk_no;
3253 ASSERT(split_hblks > 0);
076e6acb
CH
3254 error = xlog_bread(log, blk_no,
3255 split_hblks, hbp,
3256 &offset);
3257 if (error)
1da177e4 3258 goto bread_err2;
1da177e4 3259 }
076e6acb 3260
1da177e4
LT
3261 /*
3262 * Note: this black magic still works with
3263 * large sector sizes (non-512) only because:
3264 * - we increased the buffer size originally
3265 * by 1 sector giving us enough extra space
3266 * for the second read;
3267 * - the log start is guaranteed to be sector
3268 * aligned;
3269 * - we read the log end (LR header start)
3270 * _first_, then the log start (LR header end)
3271 * - order is important.
3272 */
234f56ac 3273 wrapped_hblks = hblks - split_hblks;
6ad5b325
CH
3274 error = xlog_bread_noalign(log, 0,
3275 wrapped_hblks,
44396476 3276 offset + BBTOB(split_hblks));
1da177e4
LT
3277 if (error)
3278 goto bread_err2;
1da177e4
LT
3279 }
3280 rhead = (xlog_rec_header_t *)offset;
3281 error = xlog_valid_rec_header(log, rhead,
3282 split_hblks ? blk_no : 0);
3283 if (error)
3284 goto bread_err2;
3285
b53e675d 3286 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1da177e4
LT
3287 blk_no += hblks;
3288
284f1c2c
BF
3289 /*
3290 * Read the log record data in multiple reads if it
3291 * wraps around the end of the log. Note that if the
3292 * header already wrapped, blk_no could point past the
3293 * end of the log. The record data is contiguous in
3294 * that case.
3295 */
3296 if (blk_no + bblks <= log->l_logBBsize ||
3297 blk_no >= log->l_logBBsize) {
0703a8e1 3298 rblk_no = xlog_wrap_logbno(log, blk_no);
284f1c2c 3299 error = xlog_bread(log, rblk_no, bblks, dbp,
076e6acb 3300 &offset);
1da177e4
LT
3301 if (error)
3302 goto bread_err2;
1da177e4
LT
3303 } else {
3304 /* This log record is split across the
3305 * physical end of log */
6ad5b325 3306 offset = dbp;
1da177e4
LT
3307 split_bblks = 0;
3308 if (blk_no != log->l_logBBsize) {
3309 /* some data is before the physical
3310 * end of log */
3311 ASSERT(!wrapped_hblks);
3312 ASSERT(blk_no <= INT_MAX);
3313 split_bblks =
3314 log->l_logBBsize - (int)blk_no;
3315 ASSERT(split_bblks > 0);
076e6acb
CH
3316 error = xlog_bread(log, blk_no,
3317 split_bblks, dbp,
3318 &offset);
3319 if (error)
1da177e4 3320 goto bread_err2;
1da177e4 3321 }
076e6acb 3322
1da177e4
LT
3323 /*
3324 * Note: this black magic still works with
3325 * large sector sizes (non-512) only because:
3326 * - we increased the buffer size originally
3327 * by 1 sector giving us enough extra space
3328 * for the second read;
3329 * - the log start is guaranteed to be sector
3330 * aligned;
3331 * - we read the log end (LR header start)
3332 * _first_, then the log start (LR header end)
3333 * - order is important.
3334 */
6ad5b325
CH
3335 error = xlog_bread_noalign(log, 0,
3336 bblks - split_bblks,
44396476 3337 offset + BBTOB(split_bblks));
076e6acb
CH
3338 if (error)
3339 goto bread_err2;
1da177e4 3340 }
0e446be4 3341
9d94901f 3342 error = xlog_recover_process(log, rhash, rhead, offset,
12818d24 3343 pass, &buffer_list);
0e446be4 3344 if (error)
1da177e4 3345 goto bread_err2;
d7f37692 3346
1da177e4 3347 blk_no += bblks;
d7f37692 3348 rhead_blk = blk_no;
1da177e4
LT
3349 }
3350
3351 ASSERT(blk_no >= log->l_logBBsize);
3352 blk_no -= log->l_logBBsize;
d7f37692 3353 rhead_blk = blk_no;
970fd3f0 3354 }
1da177e4 3355
970fd3f0
ES
3356 /* read first part of physical log */
3357 while (blk_no < head_blk) {
3358 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3359 if (error)
3360 goto bread_err2;
076e6acb 3361
970fd3f0
ES
3362 rhead = (xlog_rec_header_t *)offset;
3363 error = xlog_valid_rec_header(log, rhead, blk_no);
3364 if (error)
3365 goto bread_err2;
076e6acb 3366
970fd3f0
ES
3367 /* blocks in data section */
3368 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3369 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3370 &offset);
3371 if (error)
3372 goto bread_err2;
076e6acb 3373
12818d24
BF
3374 error = xlog_recover_process(log, rhash, rhead, offset, pass,
3375 &buffer_list);
970fd3f0
ES
3376 if (error)
3377 goto bread_err2;
d7f37692 3378
970fd3f0 3379 blk_no += bblks + hblks;
d7f37692 3380 rhead_blk = blk_no;
1da177e4
LT
3381 }
3382
3383 bread_err2:
6ad5b325 3384 kmem_free(dbp);
1da177e4 3385 bread_err1:
6ad5b325 3386 kmem_free(hbp);
d7f37692 3387
12818d24
BF
3388 /*
3389 * Submit buffers that have been added from the last record processed,
3390 * regardless of error status.
3391 */
3392 if (!list_empty(&buffer_list))
3393 error2 = xfs_buf_delwri_submit(&buffer_list);
3394
d7f37692
BF
3395 if (error && first_bad)
3396 *first_bad = rhead_blk;
3397
39775431
BF
3398 /*
3399 * Transactions are freed at commit time but transactions without commit
3400 * records on disk are never committed. Free any that may be left in the
3401 * hash table.
3402 */
3403 for (i = 0; i < XLOG_RHASH_SIZE; i++) {
3404 struct hlist_node *tmp;
3405 struct xlog_recover *trans;
3406
3407 hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
3408 xlog_recover_free_trans(trans);
3409 }
3410
12818d24 3411 return error ? error : error2;
1da177e4
LT
3412}
3413
3414/*
3415 * Do the recovery of the log. We actually do this in two phases.
3416 * The two passes are necessary in order to implement the function
3417 * of cancelling a record written into the log. The first pass
3418 * determines those things which have been cancelled, and the
3419 * second pass replays log items normally except for those which
3420 * have been cancelled. The handling of the replay and cancellations
3421 * takes place in the log item type specific routines.
3422 *
3423 * The table of items which have cancel records in the log is allocated
3424 * and freed at this level, since only here do we know when all of
3425 * the log recovery has been completed.
3426 */
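/*
 * Concrete (illustrative) example of cancellation: when a logged buffer is
 * later freed, the freeing transaction logs that buffer with the cancel flag
 * set in its buf log format item. Pass 1 records the buffer's (blkno, len)
 * in l_buf_cancel_table; pass 2 then refuses to replay the stale image.
 */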
3427STATIC int
3428xlog_do_log_recovery(
9a8d2fdb 3429 struct xlog *log,
1da177e4
LT
3430 xfs_daddr_t head_blk,
3431 xfs_daddr_t tail_blk)
3432{
d5689eaa 3433 int error, i;
1da177e4
LT
3434
3435 ASSERT(head_blk != tail_blk);
3436
3437 /*
3438 * First do a pass to find all of the cancelled buf log items.
3439 * Store them in the buf_cancel_table for use in the second pass.
3440 */
d5689eaa
CH
3441 log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3442 sizeof(struct list_head),
707e0dda 3443 0);
d5689eaa
CH
3444 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3445 INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3446
1da177e4 3447 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
d7f37692 3448 XLOG_RECOVER_PASS1, NULL);
1da177e4 3449 if (error != 0) {
f0e2d93c 3450 kmem_free(log->l_buf_cancel_table);
1da177e4
LT
3451 log->l_buf_cancel_table = NULL;
3452 return error;
3453 }
3454 /*
3455 * Then do a second pass to actually recover the items in the log.
 3456 	 * When it is complete, free the table of buf cancel items.
3457 */
3458 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
d7f37692 3459 XLOG_RECOVER_PASS2, NULL);
1da177e4 3460#ifdef DEBUG
6d192a9b 3461 if (!error) {
1da177e4
LT
3462 int i;
3463
3464 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
d5689eaa 3465 ASSERT(list_empty(&log->l_buf_cancel_table[i]));
1da177e4
LT
3466 }
3467#endif /* DEBUG */
3468
f0e2d93c 3469 kmem_free(log->l_buf_cancel_table);
1da177e4
LT
3470 log->l_buf_cancel_table = NULL;
3471
3472 return error;
3473}
3474
3475/*
3476 * Do the actual recovery
3477 */
3478STATIC int
3479xlog_do_recover(
9a8d2fdb 3480 struct xlog *log,
1da177e4
LT
3481 xfs_daddr_t head_blk,
3482 xfs_daddr_t tail_blk)
3483{
a798011c 3484 struct xfs_mount *mp = log->l_mp;
1da177e4
LT
3485 int error;
3486 xfs_buf_t *bp;
3487 xfs_sb_t *sbp;
3488
e67d3d42
BF
3489 trace_xfs_log_recover(log, head_blk, tail_blk);
3490
1da177e4
LT
3491 /*
3492 * First replay the images in the log.
3493 */
3494 error = xlog_do_log_recovery(log, head_blk, tail_blk);
43ff2122 3495 if (error)
1da177e4 3496 return error;
1da177e4
LT
3497
3498 /*
3499 * If IO errors happened during recovery, bail out.
3500 */
a798011c 3501 if (XFS_FORCED_SHUTDOWN(mp)) {
2451337d 3502 return -EIO;
1da177e4
LT
3503 }
3504
3505 /*
3506 * We now update the tail_lsn since much of the recovery has completed
 3507 	 * and there may be space available to use. If there were no extent frees
3508 * or iunlinks, we can free up the entire log and set the tail_lsn to
3509 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3510 * lsn of the last known good LR on disk. If there are extent frees
3511 * or iunlinks they will have some entries in the AIL; so we look at
3512 * the AIL to determine how to set the tail_lsn.
3513 */
a798011c 3514 xlog_assign_tail_lsn(mp);
1da177e4
LT
3515
3516 /*
3517 * Now that we've finished replaying all buffer and inode
98021821 3518 * updates, re-read in the superblock and reverify it.
1da177e4 3519 */
8c9ce2f7 3520 bp = xfs_getsb(mp);
1157b32c 3521 bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
b68c0821 3522 ASSERT(!(bp->b_flags & XBF_WRITE));
0cac682f 3523 bp->b_flags |= XBF_READ;
1813dd64 3524 bp->b_ops = &xfs_sb_buf_ops;
83a0adc3 3525
6af88cda 3526 error = xfs_buf_submit(bp);
d64e31a2 3527 if (error) {
a798011c 3528 if (!XFS_FORCED_SHUTDOWN(mp)) {
cdbcf82b 3529 xfs_buf_ioerror_alert(bp, __this_address);
595bff75
DC
3530 ASSERT(0);
3531 }
1da177e4
LT
3532 xfs_buf_relse(bp);
3533 return error;
3534 }
3535
3536 /* Convert superblock from on-disk format */
a798011c 3537 sbp = &mp->m_sb;
3e6e8afd 3538 xfs_sb_from_disk(sbp, bp->b_addr);
1da177e4
LT
3539 xfs_buf_relse(bp);
3540
a798011c
DC
3541 /* re-initialise in-core superblock and geometry structures */
3542 xfs_reinit_percpu_counters(mp);
3543 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
3544 if (error) {
3545 xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
3546 return error;
3547 }
52548852 3548 mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
5478eead 3549
1da177e4
LT
3550 xlog_recover_check_summary(log);
3551
3552 /* Normal transactions can now occur */
3553 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3554 return 0;
3555}
3556
3557/*
3558 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3559 *
3560 * Return error or zero.
3561 */
3562int
3563xlog_recover(
9a8d2fdb 3564 struct xlog *log)
1da177e4
LT
3565{
3566 xfs_daddr_t head_blk, tail_blk;
3567 int error;
3568
3569 /* find the tail of the log */
a45086e2
BF
3570 error = xlog_find_tail(log, &head_blk, &tail_blk);
3571 if (error)
1da177e4
LT
3572 return error;
3573
a45086e2
BF
3574 /*
3575 * The superblock was read before the log was available and thus the LSN
3576 * could not be verified. Check the superblock LSN against the current
3577 * LSN now that it's known.
3578 */
3579 if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
3580 !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3581 return -EINVAL;
3582
1da177e4
LT
3583 if (tail_blk != head_blk) {
3584 /* There used to be a comment here:
3585 *
3586 * disallow recovery on read-only mounts. note -- mount
3587 * checks for ENOSPC and turns it into an intelligent
3588 * error message.
3589 * ...but this is no longer true. Now, unless you specify
3590 * NORECOVERY (in which case this function would never be
3591 * called), we just go ahead and recover. We do this all
3592 * under the vfs layer, so we can get away with it unless
3593 * the device itself is read-only, in which case we fail.
3594 */
3a02ee18 3595 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
1da177e4
LT
3596 return error;
3597 }
3598
e721f504
DC
3599 /*
3600 * Version 5 superblock log feature mask validation. We know the
3601 * log is dirty so check if there are any unknown log features
3602 * in what we need to recover. If there are unknown features
 3603 		 * (e.g. unsupported transactions), then simply reject the
3604 * attempt at recovery before touching anything.
3605 */
3606 if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
3607 xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3608 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3609 xfs_warn(log->l_mp,
f41febd2 3610"Superblock has unknown incompatible log features (0x%x) enabled.",
e721f504
DC
3611 (log->l_mp->m_sb.sb_features_log_incompat &
3612 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
f41febd2
JP
3613 xfs_warn(log->l_mp,
3614"The log can not be fully and/or safely recovered by this kernel.");
3615 xfs_warn(log->l_mp,
3616"Please recover the log on a kernel that supports the unknown features.");
2451337d 3617 return -EINVAL;
e721f504
DC
3618 }
3619
2e227178
BF
3620 /*
3621 * Delay log recovery if the debug hook is set. This is debug
 3622 		 * instrumentation to coordinate simulation of I/O failures with
3623 * log recovery.
3624 */
3625 if (xfs_globals.log_recovery_delay) {
3626 xfs_notice(log->l_mp,
3627 "Delaying log recovery for %d seconds.",
3628 xfs_globals.log_recovery_delay);
3629 msleep(xfs_globals.log_recovery_delay * 1000);
3630 }
3631
a0fa2b67
DC
3632 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3633 log->l_mp->m_logname ? log->l_mp->m_logname
3634 : "internal");
1da177e4
LT
3635
3636 error = xlog_do_recover(log, head_blk, tail_blk);
3637 log->l_flags |= XLOG_RECOVERY_NEEDED;
3638 }
3639 return error;
3640}
3641
3642/*
3643 * In the first part of recovery we replay inodes and buffers and build
 3644 * up the list of intent items (such as extent free intents) which need to
 3645 * be processed. Here we process those intents and clean up the on-disk
 3646 * unlinked inode lists. This is separated from the first part of recovery so
3647 * that the root and real-time bitmap inodes can be read in from disk in
3648 * between the two stages. This is necessary so that we can free space
3649 * in the real-time portion of the file system.
3650 */
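/*
 * (Rough mount-time ordering, for orientation only -- not an exact call
 * chain: xfs_log_mount() runs xlog_recover(); the mount code then reads the
 * root and realtime inodes; xfs_log_mount_finish() finally calls
 * xlog_recover_finish() to process intents and the unlinked lists.)
 */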
3651int
3652xlog_recover_finish(
9a8d2fdb 3653 struct xlog *log)
1da177e4
LT
3654{
3655 /*
3656 * Now we're ready to do the transactions needed for the
 3657 	 * rest of recovery.  Start with completing all the outstanding
 3658 	 * intent records and then process the unlinked inode
3659 * lists. At this point, we essentially run in normal mode
3660 * except that we're still performing recovery actions
3661 * rather than accepting new requests.
3662 */
3663 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3c1e2bbe 3664 int error;
dc42375d 3665 error = xlog_recover_process_intents(log);
3c1e2bbe 3666 if (error) {
dc42375d 3667 xfs_alert(log->l_mp, "Failed to recover intents");
3c1e2bbe
DC
3668 return error;
3669 }
9e88b5d8 3670
1da177e4 3671 /*
dc42375d 3672 * Sync the log to get all the intents out of the AIL.
1da177e4
LT
3673 * This isn't absolutely necessary, but it helps in
3674 * case the unlink transactions would have problems
dc42375d 3675 * pushing the intents out of the way.
1da177e4 3676 */
a14a348b 3677 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
1da177e4 3678
4249023a 3679 xlog_recover_process_iunlinks(log);
1da177e4
LT
3680
3681 xlog_recover_check_summary(log);
3682
a0fa2b67
DC
3683 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3684 log->l_mp->m_logname ? log->l_mp->m_logname
3685 : "internal");
1da177e4
LT
3686 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3687 } else {
a0fa2b67 3688 xfs_info(log->l_mp, "Ending clean mount");
1da177e4
LT
3689 }
3690 return 0;
3691}
3692
a7a9250e 3693void
f0b2efad
BF
3694xlog_recover_cancel(
3695 struct xlog *log)
3696{
f0b2efad 3697 if (log->l_flags & XLOG_RECOVERY_NEEDED)
a7a9250e 3698 xlog_recover_cancel_intents(log);
f0b2efad 3699}
1da177e4
LT
3700
3701#if defined(DEBUG)
3702/*
3703 * Read all of the agf and agi counters and check that they
3704 * are consistent with the superblock counters.
3705 */
e89fbb5e 3706STATIC void
1da177e4 3707xlog_recover_check_summary(
9a8d2fdb 3708 struct xlog *log)
1da177e4
LT
3709{
3710 xfs_mount_t *mp;
1da177e4
LT
3711 xfs_buf_t *agfbp;
3712 xfs_buf_t *agibp;
1da177e4 3713 xfs_agnumber_t agno;
c8ce540d
DW
3714 uint64_t freeblks;
3715 uint64_t itotal;
3716 uint64_t ifree;
5e1be0fb 3717 int error;
1da177e4
LT
3718
3719 mp = log->l_mp;
3720
3721 freeblks = 0LL;
3722 itotal = 0LL;
3723 ifree = 0LL;
3724 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4805621a
FCH
3725 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3726 if (error) {
a0fa2b67
DC
3727 xfs_alert(mp, "%s agf read failed agno %d error %d",
3728 __func__, agno, error);
4805621a 3729 } else {
9798f615
CH
3730 struct xfs_agf *agfp = agfbp->b_addr;
3731
4805621a
FCH
3732 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3733 be32_to_cpu(agfp->agf_flcount);
3734 xfs_buf_relse(agfbp);
1da177e4 3735 }
1da177e4 3736
5e1be0fb 3737 error = xfs_read_agi(mp, NULL, agno, &agibp);
a0fa2b67
DC
3738 if (error) {
3739 xfs_alert(mp, "%s agi read failed agno %d error %d",
3740 __func__, agno, error);
3741 } else {
370c782b 3742 struct xfs_agi *agi = agibp->b_addr;
16259e7d 3743
5e1be0fb
CH
3744 itotal += be32_to_cpu(agi->agi_count);
3745 ifree += be32_to_cpu(agi->agi_freecount);
3746 xfs_buf_relse(agibp);
3747 }
1da177e4 3748 }
1da177e4
LT
3749}
3750#endif /* DEBUG */