xfs: teach the tempfile to set up atomic file content exchanges
[linux-2.6-block.git] / fs / xfs / scrub / tempfile.c
CommitLineData
84c14ee3
DW
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (c) 2021-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_log_format.h"
13#include "xfs_trans.h"
14#include "xfs_inode.h"
15#include "xfs_ialloc.h"
16#include "xfs_quota.h"
e81ce424 17#include "xfs_bmap.h"
84c14ee3
DW
18#include "xfs_bmap_btree.h"
19#include "xfs_trans_space.h"
20#include "xfs_dir2.h"
21#include "xfs_exchrange.h"
56596d8b 22#include "xfs_exchmaps.h"
e81ce424 23#include "xfs_defer.h"
84c14ee3
DW
24#include "scrub/scrub.h"
25#include "scrub/common.h"
e81ce424 26#include "scrub/repair.h"
84c14ee3
DW
27#include "scrub/trace.h"
28#include "scrub/tempfile.h"
56596d8b 29#include "scrub/tempexch.h"
e81ce424 30#include "scrub/xfile.h"
84c14ee3
DW
31
/*
 * Create a temporary file for reconstructing metadata, with the intention of
 * atomically exchanging the temporary file's contents with the file that's
 * being repaired.
 *
 * @mode is the file mode (e.g. S_IFREG or S_IFDIR) for the new file.
 * Returns 0 and sets sc->tempip on success; returns a negative errno and
 * leaves sc->tempip NULL on failure.
 */
int
xrep_tempfile_create(
	struct xfs_scrub	*sc,
	uint16_t		mode)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_trans	*tp = NULL;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	struct xfs_inode	*dp = mp->m_rootip;
	xfs_ino_t		ino;
	unsigned int		resblks;
	bool			is_dir = S_ISDIR(mode);
	int			error;

	/* No point in building a new file on a dead or readonly fs. */
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_is_readonly(mp))
		return -EROFS;

	/* There must not already be a scrub transaction or temp file. */
	ASSERT(sc->tp == NULL);
	ASSERT(sc->tempip == NULL);

	/*
	 * Make sure that we have allocated dquot(s) on disk. The temporary
	 * inode should be completely root owned so that we don't fail due to
	 * quota limits.
	 */
	error = xfs_qm_vop_dqalloc(dp, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
			XFS_QMOPT_QUOTALL, &udqp, &gdqp, &pdqp);
	if (error)
		return error;

	/* Directories need mkdir-sized reservations; files need icreate. */
	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, 0);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_IALLOC_SPACE_RES(mp);
		tres = &M_RES(mp)->tr_create_tmpfile;
	}

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	/* Allocate inode, set up directory. */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (error)
		goto out_trans_cancel;
	error = xfs_init_new_inode(&nop_mnt_idmap, tp, dp, ino, mode, 0, 0,
			0, false, &sc->tempip);
	if (error)
		goto out_trans_cancel;

	/* Change the ownership of the inode to root. */
	VFS_I(sc->tempip)->i_uid = GLOBAL_ROOT_UID;
	VFS_I(sc->tempip)->i_gid = GLOBAL_ROOT_GID;
	/*
	 * Clear the realtime flags, presumably so that rebuilt contents always
	 * live on the data device -- TODO confirm against callers.
	 */
	sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT);
	xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE);

	/*
	 * Mark our temporary file as private so that LSMs and the ACL code
	 * don't try to add their own metadata or reason about these files.
	 * The file should never be exposed to userspace.
	 */
	VFS_I(sc->tempip)->i_flags |= S_PRIVATE;
	VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR;

	if (is_dir) {
		error = xfs_dir_init(tp, sc->tempip, dp);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp);

	/*
	 * Put our temp file on the unlinked list so it's purged automatically.
	 * All file-based metadata being reconstructed using this file must be
	 * atomically exchanged with the original file because the contents
	 * here will be purged when the inode is dropped or log recovery cleans
	 * out the unlinked list.
	 */
	error = xfs_iunlink(tp, sc->tempip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	trace_xrep_tempfile_create(sc);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	/* Finish setting up the incore / vfs context. */
	xfs_setup_iops(sc->tempip);
	xfs_finish_inode_setup(sc->tempip);

	/* The temp file starts out unlocked. */
	sc->temp_ilock_flags = 0;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode. This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (sc->tempip) {
		xfs_finish_inode_setup(sc->tempip);
		xchk_irele(sc, sc->tempip);
	}
out_release_dquots:
	/* xfs_qm_dqrele tolerates NULL dquots, so this is safe on all paths. */
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}
168
169/* Take IOLOCK_EXCL on the temporary file, maybe. */
170bool
171xrep_tempfile_iolock_nowait(
172 struct xfs_scrub *sc)
173{
174 if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) {
175 sc->temp_ilock_flags |= XFS_IOLOCK_EXCL;
176 return true;
177 }
178
179 return false;
180}
181
/*
 * Take the temporary file's IOLOCK while holding a different inode's IOLOCK.
 * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock
 * to avoid deadlocks and lockdep complaints.  Returns 0 once the lock is
 * held, or a negative errno if the scrub is terminated while polling.
 */
int
xrep_tempfile_iolock_polled(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	for (;;) {
		if (xrep_tempfile_iolock_nowait(sc))
			return 0;
		/* Bail out if a fatal signal or shutdown interrupts us. */
		if (xchk_should_terminate(sc, &error))
			return error;
		delay(1);
	}
}
201
202/* Release IOLOCK_EXCL on the temporary file. */
203void
204xrep_tempfile_iounlock(
205 struct xfs_scrub *sc)
206{
207 xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL);
208 sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL;
209}
210
/* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */
void
xrep_tempfile_ilock(
	struct xfs_scrub	*sc)
{
	/*
	 * NOTE(review): the flag is recorded before the (possibly blocking)
	 * lock acquisition, unlike the nowait variants which record it after
	 * -- presumably intentional since temp_ilock_flags is private to this
	 * scrub context; confirm before reordering.
	 */
	sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
	xfs_ilock(sc->tempip, XFS_ILOCK_EXCL);
}
219
220/* Try to grab ILOCK_EXCL on the temporary file. */
221bool
222xrep_tempfile_ilock_nowait(
223 struct xfs_scrub *sc)
224{
225 if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) {
226 sc->temp_ilock_flags |= XFS_ILOCK_EXCL;
227 return true;
228 }
229
230 return false;
231}
232
233/* Unlock ILOCK_EXCL on the temporary file after an update. */
234void
235xrep_tempfile_iunlock(
236 struct xfs_scrub *sc)
237{
238 xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL);
239 sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL;
240}
241
242/* Release the temporary file. */
243void
244xrep_tempfile_rele(
245 struct xfs_scrub *sc)
246{
247 if (!sc->tempip)
248 return;
249
250 if (sc->temp_ilock_flags) {
251 xfs_iunlock(sc->tempip, sc->temp_ilock_flags);
252 sc->temp_ilock_flags = 0;
253 }
254
255 xchk_irele(sc, sc->tempip);
256 sc->tempip = NULL;
257}
e81ce424
DW
258
/*
 * Make sure that the given range of the data fork of the temporary file is
 * mapped to written blocks.  The caller must ensure that both inodes are
 * joined to the transaction.
 *
 * @off and @len are in units of filesystem blocks.  Returns 0 on success,
 * -EFSCORRUPTED if the fork contains holes or delalloc reservations, or
 * another negative errno.
 */
int
xrep_tempfile_prealloc(
	struct xfs_scrub	*sc,
	xfs_fileoff_t		off,
	xfs_filblks_t		len)
{
	struct xfs_bmbt_irec	map;
	xfs_fileoff_t		end = off + len;
	int			error;

	ASSERT(sc->tempip != NULL);
	ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip));

	/* Walk the range one mapping at a time. */
	for (; off < end; off = map.br_startoff + map.br_blockcount) {
		int		nmaps = 1;

		/*
		 * If we have a real extent mapping this block then we're
		 * in ok shape.
		 */
		error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps,
				XFS_DATA_FORK);
		if (error)
			return error;
		if (nmaps == 0) {
			/* Trip debug kernels; corruption error otherwise. */
			ASSERT(nmaps != 0);
			return -EFSCORRUPTED;
		}

		if (xfs_bmap_is_written_extent(&map))
			continue;

		/*
		 * If we find a delalloc reservation then something is very
		 * very wrong.  Bail out.
		 */
		if (map.br_startblock == DELAYSTARTBLOCK)
			return -EFSCORRUPTED;

		/*
		 * Make sure this block has a real zeroed extent allocated to
		 * it.
		 */
		nmaps = 1;
		error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off,
				XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map,
				&nmaps);
		if (error)
			return error;
		if (nmaps != 1)
			return -EFSCORRUPTED;

		trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map);

		/* Commit new extent and all deferred work. */
		error = xfs_defer_finish(&sc->tp);
		if (error)
			return error;
	}

	return 0;
}
326
327/*
328 * Write data to each block of a file. The given range of the tempfile's data
329 * fork must already be populated with written extents.
330 */
331int
332xrep_tempfile_copyin(
333 struct xfs_scrub *sc,
334 xfs_fileoff_t off,
335 xfs_filblks_t len,
336 xrep_tempfile_copyin_fn prep_fn,
337 void *data)
338{
339 LIST_HEAD(buffers_list);
340 struct xfs_mount *mp = sc->mp;
341 struct xfs_buf *bp;
342 xfs_fileoff_t flush_mask;
343 xfs_fileoff_t end = off + len;
344 loff_t pos = XFS_FSB_TO_B(mp, off);
345 int error = 0;
346
347 ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode));
348
349 /* Flush buffers to disk every 512K */
350 flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1;
351
352 for (; off < end; off++, pos += mp->m_sb.sb_blocksize) {
353 struct xfs_bmbt_irec map;
354 int nmaps = 1;
355
356 /* Read block mapping for this file block. */
357 error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0);
358 if (error)
359 goto out_err;
360 if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) {
361 error = -EFSCORRUPTED;
362 goto out_err;
363 }
364
365 /* Get the metadata buffer for this offset in the file. */
366 error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp,
367 XFS_FSB_TO_DADDR(mp, map.br_startblock),
368 mp->m_bsize, 0, &bp);
369 if (error)
370 goto out_err;
371
372 trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map);
373
374 /* Read in a block's worth of data from the xfile. */
375 error = prep_fn(sc, bp, data);
376 if (error) {
377 xfs_trans_brelse(sc->tp, bp);
378 goto out_err;
379 }
380
381 /* Queue buffer, and flush if we have too much dirty data. */
382 xfs_buf_delwri_queue_here(bp, &buffers_list);
383 xfs_trans_brelse(sc->tp, bp);
384
385 if (!(off & flush_mask)) {
386 error = xfs_buf_delwri_submit(&buffers_list);
387 if (error)
388 goto out_err;
389 }
390 }
391
392 /*
393 * Write the new blocks to disk. If the ordered list isn't empty after
394 * that, then something went wrong and we have to fail. This should
395 * never happen, but we'll check anyway.
396 */
397 error = xfs_buf_delwri_submit(&buffers_list);
398 if (error)
399 goto out_err;
400
401 if (!list_empty(&buffers_list)) {
402 ASSERT(list_empty(&buffers_list));
403 error = -EIO;
404 goto out_err;
405 }
406
407 return 0;
408
409out_err:
410 xfs_buf_delwri_cancel(&buffers_list);
411 return error;
412}
413
414/*
415 * Set the temporary file's size. Caller must join the tempfile to the scrub
416 * transaction and is responsible for adjusting block mappings as needed.
417 */
418int
419xrep_tempfile_set_isize(
420 struct xfs_scrub *sc,
421 unsigned long long isize)
422{
423 if (sc->tempip->i_disk_size == isize)
424 return 0;
425
426 sc->tempip->i_disk_size = isize;
427 i_size_write(VFS_I(sc->tempip), isize);
428 return xrep_tempfile_roll_trans(sc);
429}
430
431/*
432 * Roll a repair transaction involving the temporary file. Caller must join
433 * both the temporary file and the file being scrubbed to the transaction.
434 * This function return with both inodes joined to a new scrub transaction,
435 * or the usual negative errno.
436 */
437int
438xrep_tempfile_roll_trans(
439 struct xfs_scrub *sc)
440{
441 int error;
442
443 xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE);
444 error = xrep_roll_trans(sc);
445 if (error)
446 return error;
447
448 xfs_trans_ijoin(sc->tp, sc->tempip, 0);
449 return 0;
450}
56596d8b
DW
451
452/* Enable file content exchanges. */
453int
454xrep_tempexch_enable(
455 struct xfs_scrub *sc)
456{
457 if (sc->flags & XREP_FSGATES_EXCHANGE_RANGE)
458 return 0;
459
460 if (!xfs_has_exchange_range(sc->mp))
461 return -EOPNOTSUPP;
462
463 trace_xchk_fsgates_enable(sc, XREP_FSGATES_EXCHANGE_RANGE);
464
465 sc->flags |= XREP_FSGATES_EXCHANGE_RANGE;
466 return 0;
467}
468
469/*
470 * Fill out the mapping exchange request in preparation for atomically
471 * committing the contents of a metadata file that we've rebuilt in the temp
472 * file.
473 */
474STATIC int
475xrep_tempexch_prep_request(
476 struct xfs_scrub *sc,
477 int whichfork,
478 struct xrep_tempexch *tx)
479{
480 struct xfs_exchmaps_req *req = &tx->req;
481
482 memset(tx, 0, sizeof(struct xrep_tempexch));
483
484 /* COW forks don't exist on disk. */
485 if (whichfork == XFS_COW_FORK) {
486 ASSERT(0);
487 return -EINVAL;
488 }
489
490 /* Both files should have the relevant forks. */
491 if (!xfs_ifork_ptr(sc->ip, whichfork) ||
492 !xfs_ifork_ptr(sc->tempip, whichfork)) {
493 ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL);
494 ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL);
495 return -EINVAL;
496 }
497
498 /* Exchange all mappings in both forks. */
499 req->ip1 = sc->tempip;
500 req->ip2 = sc->ip;
501 req->startoff1 = 0;
502 req->startoff2 = 0;
503 switch (whichfork) {
504 case XFS_ATTR_FORK:
505 req->flags |= XFS_EXCHMAPS_ATTR_FORK;
506 break;
507 case XFS_DATA_FORK:
508 /* Always exchange sizes when exchanging data fork mappings. */
509 req->flags |= XFS_EXCHMAPS_SET_SIZES;
510 break;
511 }
512 req->blockcount = XFS_MAX_FILEOFF;
513
514 return 0;
515}
516
/*
 * Obtain a quota reservation to make sure we don't hit EDQUOT.  We can skip
 * this if quota enforcement is disabled or if both inodes' dquots are the
 * same.
 */
STATIC int
xrep_tempexch_reserve_quota(
	struct xfs_scrub	*sc,
	const struct xrep_tempexch	*tx)
{
	struct xfs_trans	*tp = sc->tp;
	const struct xfs_exchmaps_req	*req = &tx->req;
	int64_t			ddelta, rdelta;
	int			error;

	/*
	 * Don't bother with a quota reservation if we're not enforcing them
	 * or the two inodes have the same dquots.
	 */
	if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 ||
	    (req->ip1->i_udquot == req->ip2->i_udquot &&
	     req->ip1->i_gdquot == req->ip2->i_gdquot &&
	     req->ip1->i_pdquot == req->ip2->i_pdquot))
		return 0;

	/*
	 * Quota reservation for each file comes from two sources.  First, we
	 * need to account for any net gain in mapped blocks during the
	 * exchange.  Second, we need reservation for the gross gain in mapped
	 * blocks so that we don't trip over any quota block reservation
	 * assertions.  We must reserve the gross gain because the quota code
	 * subtracts from bcount the number of blocks that we unmap; it does
	 * not add that quantity back to the quota block reservation.
	 */
	/* ip1 (the tempfile) gains ip2's mappings: reserve net + gross. */
	ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount);
	rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount);
	error = xfs_trans_reserve_quota_nblks(tp, req->ip1,
			ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount,
			true);
	if (error)
		return error;

	/* ...and the mirror-image reservation for ip2. */
	ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount);
	rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount);
	return xfs_trans_reserve_quota_nblks(tp, req->ip2,
			ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount,
			true);
}
566
/*
 * Prepare an existing transaction for an atomic file contents exchange.
 *
 * This function fills out the mapping exchange request and resource estimation
 * structures in preparation for exchanging the contents of a metadata file
 * that has been rebuilt in the temp file.  Next, it reserves space and quota
 * for the transaction.
 *
 * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
 * file.  The caller must join both inodes to the transaction with no unlock
 * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
 * use this when those ILOCKs cannot be dropped.
 */
int
xrep_tempexch_trans_reserve(
	struct xfs_scrub	*sc,
	int			whichfork,
	struct xrep_tempexch	*tx)
{
	int			error;

	ASSERT(sc->tp != NULL);
	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);

	/* Fill out @tx with the exchange request for @whichfork. */
	error = xrep_tempexch_prep_request(sc, whichfork, tx);
	if (error)
		return error;

	/* Estimate the block and log reservations needed for the exchange. */
	error = xfs_exchmaps_estimate(&tx->req);
	if (error)
		return error;

	/* Add the estimated block reservation to the existing transaction. */
	error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
	if (error)
		return error;

	return xrep_tempexch_reserve_quota(sc, tx);
}
606
/*
 * Exchange file mappings (and hence file contents) between the file being
 * repaired and the temporary file.  Returns with both inodes locked and joined
 * to a clean scrub transaction.
 */
int
xrep_tempexch_contents(
	struct xfs_scrub	*sc,
	struct xrep_tempexch	*tx)
{
	int			error;

	ASSERT(sc->flags & XREP_FSGATES_EXCHANGE_RANGE);

	/* Exchange the mappings and finish all resulting deferred work. */
	xfs_exchange_mappings(sc->tp, &tx->req);
	error = xfs_defer_finish(&sc->tp);
	if (error)
		return error;

	/*
	 * If we exchanged the ondisk sizes of two metadata files, we must
	 * exchange the incore sizes as well.
	 */
	if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) {
		loff_t	temp;

		temp = i_size_read(VFS_I(sc->ip));
		i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip)));
		i_size_write(VFS_I(sc->tempip), temp);
	}

	return 0;
}