[linux-2.6-block.git] / fs / xfs / linux-2.6 / xfs_sync.c

/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_dir2.h"
#include "xfs_dmapi.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_btree.h"
#include "xfs_dir2_sf.h"
#include "xfs_attr_sf.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_mru_cache.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_utils.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_rw.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * Sync all the inodes in the given AG according to the
 * direction given by the flags.
 *
 * The per-AG inode cache radix tree is walked one inode at a time using
 * single-entry gang lookups.  Depending on @flags each inode may have its
 * pages flushed and invalidated (SYNC_CLOSE), its dirty pages written
 * (SYNC_DELWRI) and its in-core inode flushed (SYNC_ATTR).
 *
 * Returns 0 if the filesystem has been shut down and SYNC_CLOSE is not set,
 * EFSCORRUPTED (via XFS_ERROR) as soon as any step reports it, and otherwise
 * the last non-zero error seen during the walk (0 if there was none).
 */
STATIC int
xfs_sync_inodes_ag(
	xfs_mount_t	*mp,
	int		ag,
	int		flags)
{
	xfs_perag_t	*pag = &mp->m_perag[ag];
	int		nr_found;
	int		first_index = 0;
	int		done = 0;
	int		error = 0;
	int		last_error = 0;
	int		fflag = XFS_B_ASYNC;
	int		lock_flags = XFS_ILOCK_SHARED;

	if (flags & SYNC_DELWRI)
		fflag = XFS_B_DELWRI;
	if (flags & SYNC_WAIT)
		fflag = 0;		/* synchronous overrides all */

	if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
		/*
		 * We need the I/O lock if we're going to call any of
		 * the flush/inval routines.
		 */
		lock_flags |= XFS_IOLOCK_SHARED;
	}

	do {
		struct inode	*inode;
		boolean_t	inode_refed;
		xfs_inode_t	*ip = NULL;

		/*
		 * use a gang lookup to find the next inode in the tree
		 * as the tree is sparse and a gang lookup walks to find
		 * the number of objects requested.
		 */
		read_lock(&pag->pag_ici_lock);
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void**)&ip, first_index, 1);

		if (!nr_found) {
			read_unlock(&pag->pag_ici_lock);
			break;
		}

		/*
		 * Update the index for the next lookup.  If this inode has
		 * the highest inode number the AG can hold, i_ino + 1 carries
		 * into the next AG and the masked index wraps back to a
		 * smaller value.  Catch that and make this the final
		 * iteration, otherwise the walk would restart from the
		 * beginning of the AG and never terminate.
		 */
		first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
		if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
			done = 1;

		/*
		 * skip inodes in reclaim. Let xfs_syncsub do that for
		 * us so we don't need to worry.
		 */
		if (xfs_iflags_test(ip, (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/* bad inodes are dealt with elsewhere */
		inode = VFS_I(ip);
		if (is_bad_inode(inode)) {
			read_unlock(&pag->pag_ici_lock);
			continue;
		}

		/* nothing to sync during shutdown */
		if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
			read_unlock(&pag->pag_ici_lock);
			return 0;
		}

		/*
		 * If we can't get a reference on the VFS_I, the inode must be
		 * in reclaim. If we can get the inode lock without blocking,
		 * it is safe to flush the inode because we hold the tree lock
		 * and xfs_iextract will block right now. Hence if we lock the
		 * inode while holding the tree lock, xfs_ireclaim() is
		 * guaranteed to block on the inode lock we now hold and hence
		 * it is safe to reference the inode until we drop the inode
		 * locks completely.
		 */
		inode_refed = B_FALSE;
		if (igrab(inode)) {
			read_unlock(&pag->pag_ici_lock);
			xfs_ilock(ip, lock_flags);
			inode_refed = B_TRUE;
		} else {
			if (!xfs_ilock_nowait(ip, lock_flags)) {
				/* leave it to reclaim */
				read_unlock(&pag->pag_ici_lock);
				continue;
			}
			read_unlock(&pag->pag_ici_lock);
		}

		/*
		 * If we have to flush data or wait for I/O completion
		 * we need to drop the ilock that we currently hold.
		 * Only XFS_ILOCK_SHARED is dropped and retaken here; any
		 * other bits in lock_flags stay held until the final
		 * unlock below.
		 */
		if (flags & SYNC_CLOSE) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			if (XFS_FORCED_SHUTDOWN(mp))
				xfs_tosspages(ip, 0, -1, FI_REMAPF);
			else
				error = xfs_flushinval_pages(ip, 0, -1,
							FI_REMAPF);
			/* wait for I/O on freeze */
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);

			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
			xfs_iunlock(ip, XFS_ILOCK_SHARED);
			error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
			if (flags & SYNC_IOWAIT)
				vn_iowait(ip);
			xfs_ilock(ip, XFS_ILOCK_SHARED);
		}

		/*
		 * The clean check is repeated once the flush lock is held,
		 * as the inode may have been cleaned while we were getting
		 * the lock.
		 */
		if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
			if (flags & SYNC_WAIT) {
				xfs_iflock(ip);
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
				else
					xfs_ifunlock(ip);
			} else if (xfs_iflock_nowait(ip)) {
				if (!xfs_inode_clean(ip))
					error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
				else
					xfs_ifunlock(ip);
			}
		}

		if (lock_flags)
			xfs_iunlock(ip, lock_flags);

		/* drop the VFS reference only if igrab() gave us one */
		if (inode_refed) {
			IRELE(ip);
		}

		if (error)
			last_error = error;
		/*
		 * bail out if the filesystem is corrupted.
		 */
		if (error == EFSCORRUPTED)
			return XFS_ERROR(error);

	} while (nr_found && !done);

	return last_error;
}

/*
 * Sync the inodes of every AG in the filesystem by calling
 * xfs_sync_inodes_ag() on each one in turn.
 *
 * Read-only mounts are skipped entirely and return 0.  The walk stops early
 * on EFSCORRUPTED; otherwise the last non-zero per-AG error (or 0) is
 * returned through XFS_ERROR().
 */
int
xfs_sync_inodes(
	xfs_mount_t	*mp,
	int		flags)
{
	int		result = 0;
	int		agno;

	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		int	ag_error;

		/* AGs without pag_ici_init set are not walked */
		if (!mp->m_perag[agno].pag_ici_init)
			continue;

		ag_error = xfs_sync_inodes_ag(mp, agno, flags);
		if (!ag_error)
			continue;

		result = ag_error;
		if (ag_error == EFSCORRUPTED)
			break;
	}

	return XFS_ERROR(result);
}

/*
 * Commit a dummy transaction that logs the core of the root inode, then
 * force the log with the caller's @log_flags.
 *
 * Returns 0 on success, or the error from the log reservation or from
 * the transaction commit.
 */
STATIC int
xfs_commit_dummy_trans(
	struct xfs_mount	*mp,
	uint			log_flags)
{
	struct xfs_inode	*ip = mp->m_rootip;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * Put a dummy transaction in the log to tell recovery
	 * that all others are OK.
	 */
	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	/*
	 * xfs_trans_ihold() is called on the inode after joining it, and
	 * we unlock it ourselves once the commit has returned.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_ihold(tp, ip);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	error = xfs_trans_commit(tp, 0);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	/*
	 * Force the log whether or not the commit succeeded, then hand
	 * any commit failure back to the caller instead of dropping it.
	 */
	xfs_log_force(mp, 0, log_flags);
	return error;
}

/*
 * Write the superblock buffer to disk.
 *
 * With SYNC_BDFLUSH the write is opportunistic: the buffer is only written
 * if it can be locked without sleeping, has a dirty buffer log item attached
 * and is not pinned; otherwise 0 is returned without doing any I/O.  Without
 * SYNC_BDFLUSH the buffer is always written.  SYNC_WAIT selects a synchronous
 * write, otherwise the write is asynchronous.
 *
 * Returns 0 if nothing was written, else the result of xfs_bwrite().
 */
STATIC int
xfs_sync_fsdata(
	struct xfs_mount	*mp,
	int			flags)
{
	struct xfs_buf		*sbp;

	if (!(flags & SYNC_BDFLUSH)) {
		sbp = xfs_getsb(mp, 0);

		/*
		 * If the buffer is pinned then push on the log so we won't
		 * get stuck waiting in the write for someone, maybe
		 * ourselves, to flush the log.
		 *
		 * Even if the log was pushed before we got here, the
		 * superblock buffer was not locked at that point, so it
		 * can have become pinned in the meantime.
		 */
		if (XFS_BUF_ISPINNED(sbp))
			xfs_log_force(mp, 0, XFS_LOG_FORCE);
	} else {
		struct xfs_buf_log_item	*item;

		/*
		 * This is the xfssyncd() case: only sync the superblock if
		 * we can lock it without sleeping and it is not pinned.
		 */
		ASSERT(!(flags & SYNC_WAIT));

		sbp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
		if (!sbp)
			return 0;

		item = XFS_BUF_FSPRIVATE(sbp, struct xfs_buf_log_item *);
		if (!item || !xfs_buf_item_dirty(item) ||
		    XFS_BUF_ISPINNED(sbp)) {
			/* nothing to write right now, just drop the buffer */
			xfs_buf_relse(sbp);
			return 0;
		}
	}

	if (flags & SYNC_WAIT)
		XFS_BUF_UNASYNC(sbp);
	else
		XFS_BUF_ASYNC(sbp);

	return xfs_bwrite(mp, sbp);
}

/*
 * xfs_sync flushes any pending I/O to file system vfsp.
 *
 * This routine is called by vfs_sync() to make sure that things make it
 * out to disk eventually, on sync() system calls to flush out everything,
 * and when the file system is unmounted.  For the vfs_sync() case, all
 * we really need to do is sync out the log to make all of our meta-data
 * updates permanent (except for timestamps).  For calls from pflushd(),
 * dirty pages are kept moving by calling pdflush() on the inodes
 * containing them.  We also flush the inodes that we can lock without
 * sleeping and the superblock if we can lock it without sleeping from
 * vfs_sync() so that items at the tail of the log are always moving out.
 *
 * Flags:
 *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
 *		       to sleep if we can help it.  All we really need
 *		       to do is ensure that the log is synced at least
 *		       periodically.  We also push the inodes and
 *		       superblock if we can lock them without sleeping
 *		       and they are not pinned.
 *      SYNC_ATTR    - We need to flush the inodes. Now handled by direct calls
 *		       to xfs_sync_inodes(); must not be passed to xfs_sync().
 *      SYNC_WAIT    - All the flushes that take place in this call should
 *		       be synchronous.
 *      SYNC_DELWRI  - This tells us to push dirty pages associated with
 *		       inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
 *		       determine if they should be flushed sync, async, or
 *		       delwri.
 *      SYNC_CLOSE   - This flag is passed when the system is being
 *		       unmounted.  We should sync and invalidate everything.
 *      SYNC_FSDATA  - This indicates that the caller would like to make
 *		       sure the superblock is safe on disk.  We can ensure
 *		       this by simply making sure the log gets flushed
 *		       if SYNC_BDFLUSH is set, and by actually writing it
 *		       out otherwise.
 *      SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
 *		       before we return (including direct I/O). Forms the drain
 *		       side of the write barrier needed to safely quiesce the
 *		       filesystem.
 *
 * Returns the quota sync error if that fails while the filesystem is shut
 * down, the error from the dummy transaction if that fails, and otherwise
 * the last error recorded from the inode sync or the superblock write
 * (0 if there was none).
 */
int
xfs_sync(
	xfs_mount_t	*mp,
	int		flags)
{
	int		error;
	int		last_error = 0;
	uint		log_flags = XFS_LOG_FORCE;

	ASSERT(!(flags & SYNC_ATTR));

	/*
	 * Get the Quota Manager to flush the dquots.
	 *
	 * If XFS quota support is not enabled or this filesystem
	 * instance does not use quotas XFS_QM_DQSYNC will always
	 * return zero.
	 */
	error = XFS_QM_DQSYNC(mp, flags);
	if (error) {
		/*
		 * If we got an IO error, we will be shutting down.
		 * So, there's nothing more for us to do here.
		 */
		ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
		if (XFS_FORCED_SHUTDOWN(mp))
			return XFS_ERROR(error);
	}

	if (flags & SYNC_IOWAIT)
		xfs_filestream_flush(mp);

	/*
	 * Sync out the log.  This ensures that the log is periodically
	 * flushed even if there is not enough activity to fill it up.
	 */
	if (flags & SYNC_WAIT)
		log_flags |= XFS_LOG_SYNC;

	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);

	if (flags & SYNC_DELWRI) {
		if (flags & SYNC_BDFLUSH) {
			xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
		} else {
			/*
			 * Remember an inode sync failure; 'error' is reused
			 * below and the failure would otherwise be lost.
			 */
			error = xfs_sync_inodes(mp, flags);
			if (error)
				last_error = error;
		}
		/*
		 * Flushing out dirty data above probably generated more
		 * log activity, so flush the log again.
		 */
		xfs_log_force(mp, 0, log_flags);
	}

	if (flags & SYNC_FSDATA) {
		error = xfs_sync_fsdata(mp, flags);
		if (error)
			last_error = error;
	}

	/*
	 * Now check to see if the log needs a "dummy" transaction.
	 */
	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
		error = xfs_commit_dummy_trans(mp, log_flags);
		if (error)
			return error;
	}

	/*
	 * When shutting down, we need to ensure that the AIL is pushed
	 * to disk or the filesystem can appear corrupt from the PROM.
	 */
	if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
		XFS_bflush(mp->m_ddev_targp);
		if (mp->m_rtdev_targp) {
			XFS_bflush(mp->m_rtdev_targp);
		}
	}

	return XFS_ERROR(last_error);
}

/*
 * Hand a work item over to the xfssyncd thread of this mount.
 *
 * A work descriptor is allocated, filled in with the callback @syncer and
 * its argument @data, appended to the mount's sync list under m_sync_lock,
 * and the sync thread is then woken up.
 *
 * Deferring work this way has two advantages:
 * - It saves on stack space, which is tight in certain situations
 * - It can be used (with care) as a mechanism to avoid deadlocks.
 * Flushing while allocating in a full filesystem requires both.
 */
STATIC void
xfs_syncd_queue_work(
	struct xfs_mount *mp,
	void		*data,
	void		(*syncer)(struct xfs_mount *, void *))
{
	struct bhv_vfs_sync_work *item;

	item = kmem_alloc(sizeof(*item), KM_SLEEP);
	item->w_mount = mp;
	item->w_data = data;
	item->w_syncer = syncer;
	INIT_LIST_HEAD(&item->w_list);

	spin_lock(&mp->m_sync_lock);
	list_add_tail(&item->w_list, &mp->m_sync_list);
	spin_unlock(&mp->m_sync_lock);

	wake_up_process(mp->m_sync_task);
}

/*
 * Sync-thread callback queued by xfs_flush_inode().
 *
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room...
 *
 * @arg is the struct inode to flush; iput() is called on it once the
 * flush has been started.
 */
STATIC void
xfs_flush_inode_work(
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*vfs_inode = arg;

	filemap_flush(vfs_inode->i_mapping);
	iput(vfs_inode);
}

/*
 * Ask xfssyncd to flush this inode's dirty pages, then delay for
 * msecs_to_jiffies(500) before returning (presumably to give the queued
 * flush time to make progress).
 *
 * The queued work item, xfs_flush_inode_work(), always calls iput() on
 * the inode, so the work is only queued when igrab() actually handed us
 * a reference.  If no reference can be taken, nothing is queued and the
 * function returns immediately.
 */
void
xfs_flush_inode(
	xfs_inode_t	*ip)
{
	struct inode	*inode = VFS_I(ip);

	if (!igrab(inode))
		return;

	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
	delay(msecs_to_jiffies(500));
}

/*
 * Sync-thread callback queued by xfs_flush_device().
 *
 * This is the "bigger hammer" version of xfs_flush_inode_work...
 * (IOW, "If at first you don't succeed, use a Bigger Hammer").
 *
 * The block device behind the mount's superblock is synced as a whole;
 * @arg is a struct inode on which only iput() is called afterwards.
 */
STATIC void
xfs_flush_device_work(
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*held_inode = arg;

	sync_blockdev(mp->m_super->s_bdev);
	iput(held_inode);
}

/*
 * Ask xfssyncd to sync the whole block device, delay for
 * msecs_to_jiffies(500), and then force the log synchronously.
 *
 * The queued work item, xfs_flush_device_work(), always calls iput() on
 * the inode, so the work is only queued (and the delay only taken) when
 * igrab() actually handed us a reference.  The synchronous log force is
 * issued either way.
 */
void
xfs_flush_device(
	xfs_inode_t	*ip)
{
	struct inode	*inode = VFS_I(ip);

	if (igrab(inode)) {
		xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
		delay(msecs_to_jiffies(500));
	}
	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
}

/*
 * Periodic work run by xfssyncd.
 *
 * Every sync period we need to unpin all items, reclaim inodes, sync
 * quota and write out the superblock. We might need to cover the log
 * to indicate it is idle.  None of this is done on a read-only mount.
 *
 * In all cases the sync sequence counter is bumped and anyone sleeping
 * on m_wait_single_sync_task is woken.
 */
STATIC void
xfs_sync_worker(
	struct xfs_mount *mp,
	void		*unused)
{
	int		ignored;

	if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) {
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
		xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);

		/* dgc: errors ignored here */
		ignored = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
		ignored = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
		if (xfs_log_need_covered(mp))
			ignored = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
	}

	mp->m_sync_seq++;
	wake_up(&mp->m_wait_single_sync_task);
}

/*
 * Main loop of the per-mount "xfssyncd" kernel thread.
 *
 * The thread sleeps for up to xfs_syncd_centisecs between passes.  On each
 * wakeup it gathers every queued work item from mp->m_sync_list (adding the
 * mount's built-in periodic item, m_sync_work, when the timeout expired or
 * nothing else is queued) and runs the items' callbacks with m_sync_lock
 * dropped.
 *
 * @arg is the struct xfs_mount this thread serves.  Always returns 0.
 */
STATIC int
xfssyncd(
	void			*arg)
{
	struct xfs_mount	*mp = arg;
	long			timeleft;
	bhv_vfs_sync_work_t	*work, *n;
	LIST_HEAD		(tmp);

	set_freezable();
	/* sync period: xfs_syncd_centisecs units of 10ms, in jiffies */
	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	for (;;) {
		/* returns the unexpired remainder, 0 if the full period elapsed */
		timeleft = schedule_timeout_interruptible(timeleft);
		/* swsusp */
		try_to_freeze();
		/* only exit once asked to stop AND no queued work remains */
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
			break;

		spin_lock(&mp->m_sync_lock);
		/*
		 * We can get woken by laptop mode, to do a sync -
		 * that's the (only!) case where the list would be
		 * empty with time remaining.
		 */
		if (!timeleft || list_empty(&mp->m_sync_list)) {
			/* restart the period only if it actually expired */
			if (!timeleft)
				timeleft = xfs_syncd_centisecs *
							msecs_to_jiffies(10);
			/* queue the mount's embedded periodic work item */
			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
			list_add_tail(&mp->m_sync_work.w_list,
					&mp->m_sync_list);
		}
		/* take everything onto a private list so the lock can be dropped */
		list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
			list_move(&work->w_list, &tmp);
		spin_unlock(&mp->m_sync_lock);

		/* run each callback without holding m_sync_lock */
		list_for_each_entry_safe(work, n, &tmp, w_list) {
			(*work->w_syncer)(mp, work->w_data);
			list_del(&work->w_list);
			/* m_sync_work is embedded in the mount, never freed here */
			if (work == &mp->m_sync_work)
				continue;
			kmem_free(work);
		}
	}

	return 0;
}

/*
 * Set up the mount's built-in periodic sync work item and start the
 * "xfssyncd" kernel thread for this mount.
 *
 * Returns 0 on success, or the negation of PTR_ERR() of the value
 * returned by kthread_run() if the thread could not be started.
 */
int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	mp->m_sync_work.w_mount = mp;
	mp->m_sync_work.w_syncer = xfs_sync_worker;

	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
	if (!IS_ERR(mp->m_sync_task))
		return 0;

	return -PTR_ERR(mp->m_sync_task);
}

/*
 * Stop this mount's xfssyncd thread via kthread_stop().
 */
void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	struct task_struct	*syncd = mp->m_sync_task;

	kthread_stop(syncd);
}
Commit	Line	Data
fe4fa4b8 DC	1	/*
	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
	3	* All Rights Reserved.
	4	*
	5	* This program is free software; you can redistribute it and/or
	6	* modify it under the terms of the GNU General Public License as
	7	* published by the Free Software Foundation.
	8	*
	9	* This program is distributed in the hope that it would be useful,
	10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	* GNU General Public License for more details.
	13	*
	14	* You should have received a copy of the GNU General Public License
	15	* along with this program; if not, write the Free Software Foundation,
	16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	17	*/
	18	#include "xfs.h"
	19	#include "xfs_fs.h"
	20	#include "xfs_types.h"
	21	#include "xfs_bit.h"
	22	#include "xfs_log.h"
	23	#include "xfs_inum.h"
	24	#include "xfs_trans.h"
	25	#include "xfs_sb.h"
	26	#include "xfs_ag.h"
	27	#include "xfs_dir2.h"
	28	#include "xfs_dmapi.h"
	29	#include "xfs_mount.h"
	30	#include "xfs_bmap_btree.h"
	31	#include "xfs_alloc_btree.h"
	32	#include "xfs_ialloc_btree.h"
	33	#include "xfs_btree.h"
	34	#include "xfs_dir2_sf.h"
	35	#include "xfs_attr_sf.h"
	36	#include "xfs_inode.h"
	37	#include "xfs_dinode.h"
	38	#include "xfs_error.h"
	39	#include "xfs_mru_cache.h"
	40	#include "xfs_filestream.h"
	41	#include "xfs_vnodeops.h"
	42	#include "xfs_utils.h"
	43	#include "xfs_buf_item.h"
	44	#include "xfs_inode_item.h"
	45	#include "xfs_rw.h"
	46
a167b17e DC	47	#include <linux/kthread.h>
	48	#include <linux/freezer.h>
	49
fe4fa4b8	50	/*
683a8970 DC	51	* Sync all the inodes in the given AG according to the
683a8970 DC	52	* direction given by the flags.
fe4fa4b8	53	*/
683a8970 DC	54	STATIC int
683a8970 DC	55	xfs_sync_inodes_ag(
fe4fa4b8	56	xfs_mount_t *mp,
683a8970	57	int ag,
2030b5ab	58	int flags)
fe4fa4b8	59	{
683a8970	60	xfs_perag_t *pag = &mp->m_perag[ag];
683a8970 DC	61	int nr_found;
	62	int first_index = 0;
	63	int error = 0;
	64	int last_error = 0;
	65	int fflag = XFS_B_ASYNC;
	66	int lock_flags = XFS_ILOCK_SHARED;
fe4fa4b8	67
fe4fa4b8 DC	68	if (flags & SYNC_DELWRI)
	69	fflag = XFS_B_DELWRI;
	70	if (flags & SYNC_WAIT)
	71	fflag = 0; /* synchronous overrides all */
	72
fe4fa4b8 DC	73	if (flags & (SYNC_DELWRI \| SYNC_CLOSE)) {
	74	/*
	75	* We need the I/O lock if we're going to call any of
	76	* the flush/inval routines.
	77	*/
683a8970	78	lock_flags \|= XFS_IOLOCK_SHARED;
fe4fa4b8 DC	79	}
fe4fa4b8 DC	80
fe4fa4b8	81	do {
bc60a993 DC	82	struct inode *inode;
	83	boolean_t inode_refed;
	84	xfs_inode_t *ip = NULL;
	85
fe4fa4b8	86	/*
683a8970 DC	87	* use a gang lookup to find the next inode in the tree
	88	* as the tree is sparse and a gang lookup walks to find
	89	* the number of objects requested.
fe4fa4b8	90	*/
683a8970 DC	91	read_lock(&pag->pag_ici_lock);
	92	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
	93	(void**)&ip, first_index, 1);
fe4fa4b8	94
683a8970 DC	95	if (!nr_found) {
	96	read_unlock(&pag->pag_ici_lock);
	97	break;
fe4fa4b8 DC	98	}
fe4fa4b8 DC	99
683a8970 DC	100	/* update the index for the next lookup */
683a8970 DC	101	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
fe4fa4b8 DC	102
fe4fa4b8 DC	103	/*
683a8970 DC	104	* skip inodes in reclaim. Let xfs_syncsub do that for
683a8970 DC	105	* us so we don't need to worry.
fe4fa4b8	106	*/
bc60a993	107	if (xfs_iflags_test(ip, (XFS_IRECLAIM\|XFS_IRECLAIMABLE))) {
683a8970	108	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	109	continue;
	110	}
	111
683a8970	112	/* bad inodes are dealt with elsewhere */
bc60a993 DC	113	inode = VFS_I(ip);
bc60a993 DC	114	if (is_bad_inode(inode)) {
683a8970	115	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	116	continue;
	117	}
	118
683a8970	119	/* nothing to sync during shutdown */
fe4fa4b8	120	if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
683a8970	121	read_unlock(&pag->pag_ici_lock);
fe4fa4b8 DC	122	return 0;
	123	}
	124
	125	/*
bc60a993 DC	126	* If we can't get a reference on the VFS_I, the inode must be
	127	* in reclaim. If we can get the inode lock without blocking,
	128	* it is safe to flush the inode because we hold the tree lock
	129	* and xfs_iextract will block right now. Hence if we lock the
	130	* inode while holding the tree lock, xfs_ireclaim() is
	131	* guaranteed to block on the inode lock we now hold and hence
	132	* it is safe to reference the inode until we drop the inode
	133	* locks completely.
fe4fa4b8	134	*/
bc60a993 DC	135	inode_refed = B_FALSE;
bc60a993 DC	136	if (igrab(inode)) {
683a8970	137	read_unlock(&pag->pag_ici_lock);
fe4fa4b8	138	xfs_ilock(ip, lock_flags);
bc60a993	139	inode_refed = B_TRUE;
683a8970	140	} else {
bc60a993 DC	141	if (!xfs_ilock_nowait(ip, lock_flags)) {
	142	/* leave it to reclaim */
	143	read_unlock(&pag->pag_ici_lock);
	144	continue;
	145	}
683a8970	146	read_unlock(&pag->pag_ici_lock);
fe4fa4b8	147	}
bc60a993	148
fe4fa4b8 DC	149	/*
	150	* If we have to flush data or wait for I/O completion
	151	* we need to drop the ilock that we currently hold.
	152	* If we need to drop the lock, insert a marker if we
	153	* have not already done so.
	154	*/
683a8970	155	if (flags & SYNC_CLOSE) {
fe4fa4b8	156	xfs_iunlock(ip, XFS_ILOCK_SHARED);
683a8970 DC	157	if (XFS_FORCED_SHUTDOWN(mp))
	158	xfs_tosspages(ip, 0, -1, FI_REMAPF);
	159	else
	160	error = xfs_flushinval_pages(ip, 0, -1,
	161	FI_REMAPF);
	162	/* wait for I/O on freeze */
fe4fa4b8 DC	163	if (flags & SYNC_IOWAIT)
	164	vn_iowait(ip);
	165
	166	xfs_ilock(ip, XFS_ILOCK_SHARED);
	167	}
	168
bc60a993	169	if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
683a8970 DC	170	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	171	error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
	172	if (flags & SYNC_IOWAIT)
	173	vn_iowait(ip);
	174	xfs_ilock(ip, XFS_ILOCK_SHARED);
	175	}
fe4fa4b8	176
683a8970	177	if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
fe4fa4b8 DC	178	if (flags & SYNC_WAIT) {
fe4fa4b8 DC	179	xfs_iflock(ip);
683a8970 DC	180	if (!xfs_inode_clean(ip))
	181	error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
	182	else
	183	xfs_ifunlock(ip);
fe4fa4b8	184	} else if (xfs_iflock_nowait(ip)) {
683a8970 DC	185	if (!xfs_inode_clean(ip))
	186	error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
	187	else
	188	xfs_ifunlock(ip);
fe4fa4b8 DC	189	}
	190	}
	191
683a8970	192	if (lock_flags)
fe4fa4b8	193	xfs_iunlock(ip, lock_flags);
fe4fa4b8	194
bc60a993	195	if (inode_refed) {
fe4fa4b8	196	IRELE(ip);
fe4fa4b8 DC	197	}
fe4fa4b8 DC	198
683a8970	199	if (error)
fe4fa4b8	200	last_error = error;
fe4fa4b8 DC	201	/*
	202	* bail out if the filesystem is corrupted.
	203	*/
683a8970	204	if (error == EFSCORRUPTED)
fe4fa4b8	205	return XFS_ERROR(error);
fe4fa4b8	206
683a8970	207	} while (nr_found);
fe4fa4b8	208
683a8970 DC	209	return last_error;
683a8970 DC	210	}
fe4fa4b8	211
683a8970 DC	212	int
	213	xfs_sync_inodes(
	214	xfs_mount_t *mp,
2030b5ab	215	int flags)
683a8970 DC	216	{
	217	int error;
	218	int last_error;
	219	int i;
fe4fa4b8	220
683a8970 DC	221	if (mp->m_flags & XFS_MOUNT_RDONLY)
	222	return 0;
	223	error = 0;
	224	last_error = 0;
fe4fa4b8	225
683a8970 DC	226	for (i = 0; i < mp->m_sb.sb_agcount; i++) {
	227	if (!mp->m_perag[i].pag_ici_init)
	228	continue;
2030b5ab	229	error = xfs_sync_inodes_ag(mp, i, flags);
683a8970 DC	230	if (error)
	231	last_error = error;
	232	if (error == EFSCORRUPTED)
	233	break;
	234	}
fe4fa4b8 DC	235	return XFS_ERROR(last_error);
	236	}
	237
2af75df7 CH	238	STATIC int
	239	xfs_commit_dummy_trans(
	240	struct xfs_mount *mp,
	241	uint log_flags)
	242	{
	243	struct xfs_inode *ip = mp->m_rootip;
	244	struct xfs_trans *tp;
	245	int error;
	246
	247	/*
	248	* Put a dummy transaction in the log to tell recovery
	249	* that all others are OK.
	250	*/
	251	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
	252	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
	253	if (error) {
	254	xfs_trans_cancel(tp, 0);
	255	return error;
	256	}
	257
	258	xfs_ilock(ip, XFS_ILOCK_EXCL);
	259
	260	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	261	xfs_trans_ihold(tp, ip);
	262	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	263	/* XXX(hch): ignoring the error here.. */
	264	error = xfs_trans_commit(tp, 0);
	265
	266	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	267
	268	xfs_log_force(mp, 0, log_flags);
	269	return 0;
	270	}
	271
	272	STATIC int
	273	xfs_sync_fsdata(
	274	struct xfs_mount *mp,
	275	int flags)
	276	{
	277	struct xfs_buf *bp;
	278	struct xfs_buf_log_item *bip;
	279	int error = 0;
	280
	281	/*
	282	* If this is xfssyncd() then only sync the superblock if we can
	283	* lock it without sleeping and it is not pinned.
	284	*/
	285	if (flags & SYNC_BDFLUSH) {
	286	ASSERT(!(flags & SYNC_WAIT));
	287
	288	bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
	289	if (!bp)
	290	goto out;
	291
	292	bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
	293	if (!bip \|\| !xfs_buf_item_dirty(bip) \|\| XFS_BUF_ISPINNED(bp))
	294	goto out_brelse;
	295	} else {
	296	bp = xfs_getsb(mp, 0);
	297
	298	/*
	299	* If the buffer is pinned then push on the log so we won't
	300	* get stuck waiting in the write for someone, maybe
	301	* ourselves, to flush the log.
302	*
303	* Even though we just pushed the log above, we did not have
304	* the superblock buffer locked at that point so it can
305	* become pinned in between there and here.
306	*/
307	if (XFS_BUF_ISPINNED(bp))
308	xfs_log_force(mp, 0, XFS_LOG_FORCE);
309	}
310
311
312	if (flags & SYNC_WAIT)
313	XFS_BUF_UNASYNC(bp);
314	else
315	XFS_BUF_ASYNC(bp);
316
317	return xfs_bwrite(mp, bp);
318
319	out_brelse:
320	xfs_buf_relse(bp);
321	out:
322	return error;
323	}
324
fe4fa4b8	325	/*
dfd837a9	326	* xfs_sync flushes any pending I/O to file system vfsp.
fe4fa4b8	327	*
dfd837a9 DC	328	* This routine is called by vfs_sync() to make sure that things make it
	329	* out to disk eventually, on sync() system calls to flush out everything,
	330	* and when the file system is unmounted. For the vfs_sync() case, all
	331	* we really need to do is sync out the log to make all of our meta-data
	332	* updates permanent (except for timestamps). For calls from pflushd(),
	333	* dirty pages are kept moving by calling pdflush() on the inodes
	334	* containing them. We also flush the inodes that we can lock without
	335	* sleeping and the superblock if we can lock it without sleeping from
	336	* vfs_sync() so that items at the tail of the log are always moving out.
	337	*
	338	* Flags:
	339	* SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
	340	* to sleep if we can help it. All we really need
	341	* to do is ensure that the log is synced at least
	342	* periodically. We also push the inodes and
	343	* superblock if we can lock them without sleeping
	344	* and they are not pinned.
be97d9d5 DC	345	* SYNC_ATTR - We need to flush the inodes. Now handled by direct calls
be97d9d5 DC	346	* to xfs_sync_inodes().
dfd837a9 DC	347	* SYNC_WAIT - All the flushes that take place in this call should
	348	* be synchronous.
	349	* SYNC_DELWRI - This tells us to push dirty pages associated with
	350	* inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
	351	* determine if they should be flushed sync, async, or
	352	* delwri.
	353	* SYNC_CLOSE - This flag is passed when the system is being
	354	* unmounted. We should sync and invalidate everything.
	355	* SYNC_FSDATA - This indicates that the caller would like to make
	356	* sure the superblock is safe on disk. We can ensure
	357	* this by simply making sure the log gets flushed
	358	* if SYNC_BDFLUSH is set, and by actually writing it
	359	* out otherwise.
	360	* SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
	361	* before we return (including direct I/O). Forms the drain
	362	* side of the write barrier needed to safely quiesce the
	363	* filesystem.
fe4fa4b8 DC	364	*
fe4fa4b8 DC	365	*/
dfd837a9 DC	366	int
dfd837a9 DC	367	xfs_sync(
fe4fa4b8	368	xfs_mount_t *mp,
2030b5ab	369	int flags)
fe4fa4b8	370	{
dfd837a9	371	int error;
fe4fa4b8 DC	372	int last_error = 0;
fe4fa4b8 DC	373	uint log_flags = XFS_LOG_FORCE;
fe4fa4b8	374
be97d9d5 DC	375	ASSERT(!(flags & SYNC_ATTR));
be97d9d5 DC	376
dfd837a9 DC	377	/*
	378	* Get the Quota Manager to flush the dquots.
	379	*
	380	* If XFS quota support is not enabled or this filesystem
	381	* instance does not use quotas XFS_QM_DQSYNC will always
	382	* return zero.
	383	*/
	384	error = XFS_QM_DQSYNC(mp, flags);
	385	if (error) {
	386	/*
	387	* If we got an IO error, we will be shutting down.
	388	* So, there's nothing more for us to do here.
	389	*/
	390	ASSERT(error != EIO \|\| XFS_FORCED_SHUTDOWN(mp));
	391	if (XFS_FORCED_SHUTDOWN(mp))
	392	return XFS_ERROR(error);
	393	}
	394
	395	if (flags & SYNC_IOWAIT)
	396	xfs_filestream_flush(mp);
	397
fe4fa4b8 DC	398	/*
	399	* Sync out the log. This ensures that the log is periodically
	400	* flushed even if there is not enough activity to fill it up.
	401	*/
	402	if (flags & SYNC_WAIT)
	403	log_flags \|= XFS_LOG_SYNC;
	404
	405	xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
	406
be97d9d5	407	if (flags & SYNC_DELWRI) {
fe4fa4b8	408	if (flags & SYNC_BDFLUSH)
75c68f41	409	xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
fe4fa4b8	410	else
2030b5ab	411	error = xfs_sync_inodes(mp, flags);
be97d9d5 DC	412	/*
	413	* Flushing out dirty data above probably generated more
	414	* log activity, so if this isn't vfs_sync() then flush
	415	* the log again.
	416	*/
2af75df7	417	xfs_log_force(mp, 0, log_flags);
be97d9d5	418	}
fe4fa4b8 DC	419
fe4fa4b8 DC	420	if (flags & SYNC_FSDATA) {
2af75df7 CH	421	error = xfs_sync_fsdata(mp, flags);
2af75df7 CH	422	if (error)
fe4fa4b8	423	last_error = error;
fe4fa4b8 DC	424	}
	425
	426	/*
	427	* Now check to see if the log needs a "dummy" transaction.
	428	*/
	429	if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
2af75df7 CH	430	error = xfs_commit_dummy_trans(mp, log_flags);
2af75df7 CH	431	if (error)
fe4fa4b8	432	return error;
fe4fa4b8 DC	433	}
	434
	435	/*
	436	* When shutting down, we need to insure that the AIL is pushed
	437	* to disk or the filesystem can appear corrupt from the PROM.
	438	*/
	439	if ((flags & (SYNC_CLOSE\|SYNC_WAIT)) == (SYNC_CLOSE\|SYNC_WAIT)) {
	440	XFS_bflush(mp->m_ddev_targp);
	441	if (mp->m_rtdev_targp) {
	442	XFS_bflush(mp->m_rtdev_targp);
	443	}
	444	}
	445
	446	return XFS_ERROR(last_error);
	447	}
a167b17e DC	448
	449	/*
	450	* Enqueue a work item to be picked up by the vfs xfssyncd thread.
	451	* Doing this has two advantages:
	452	* - It saves on stack space, which is tight in certain situations
	453	* - It can be used (with care) as a mechanism to avoid deadlocks.
	454	* Flushing while allocating in a full filesystem requires both.
	455	*/
	456	STATIC void
	457	xfs_syncd_queue_work(
	458	struct xfs_mount *mp,
	459	void *data,
	460	void (syncer)(struct xfs_mount , void *))
	461	{
	462	struct bhv_vfs_sync_work *work;
	463
	464	work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
	465	INIT_LIST_HEAD(&work->w_list);
	466	work->w_syncer = syncer;
	467	work->w_data = data;
	468	work->w_mount = mp;
	469	spin_lock(&mp->m_sync_lock);
	470	list_add_tail(&work->w_list, &mp->m_sync_list);
	471	spin_unlock(&mp->m_sync_lock);
	472	wake_up_process(mp->m_sync_task);
	473	}
	474
	475	/*
	476	* Flush delayed allocate data, attempting to free up reserved space
	477	* from existing allocations. At this point a new allocation attempt
	478	* has failed with ENOSPC and we are in the process of scratching our
	479	* heads, looking about for more room...
	480	*/
	481	STATIC void
	482	xfs_flush_inode_work(
	483	struct xfs_mount *mp,
	484	void *arg)
	485	{
	486	struct inode *inode = arg;
	487	filemap_flush(inode->i_mapping);
	488	iput(inode);
	489	}
	490
	491	void
	492	xfs_flush_inode(
	493	xfs_inode_t *ip)
	494	{
	495	struct inode *inode = VFS_I(ip);
	496
	497	igrab(inode);
	498	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
	499	delay(msecs_to_jiffies(500));
	500	}
	501
	502	/*
	503	* This is the "bigger hammer" version of xfs_flush_inode_work...
	504	* (IOW, "If at first you don't succeed, use a Bigger Hammer").
	505	*/
	506	STATIC void
	507	xfs_flush_device_work(
	508	struct xfs_mount *mp,
	509	void *arg)
	510	{
	511	struct inode *inode = arg;
512	sync_blockdev(mp->m_super->s_bdev);
513	iput(inode);
514	}
515
516	void
517	xfs_flush_device(
518	xfs_inode_t *ip)
519	{
520	struct inode *inode = VFS_I(ip);
521
522	igrab(inode);
523	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
524	delay(msecs_to_jiffies(500));
525	xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE\|XFS_LOG_SYNC);
526	}
527
aacaa880 DC	528	/*
	529	* Every sync period we need to unpin all items, reclaim inodes, sync
	530	* quota and write out the superblock. We might need to cover the log
	531	* to indicate it is idle.
	532	*/
a167b17e DC	533	STATIC void
	534	xfs_sync_worker(
	535	struct xfs_mount *mp,
	536	void *unused)
	537	{
	538	int error;
	539
aacaa880 DC	540	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
	541	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
	542	xfs_finish_reclaim_all(mp, 1, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
	543	/* dgc: errors ignored here */
	544	error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
	545	error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
	546	if (xfs_log_need_covered(mp))
	547	error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
	548	}
a167b17e DC	549	mp->m_sync_seq++;
	550	wake_up(&mp->m_wait_single_sync_task);
	551	}
	552
	553	STATIC int
	554	xfssyncd(
	555	void *arg)
	556	{
	557	struct xfs_mount *mp = arg;
	558	long timeleft;
	559	bhv_vfs_sync_work_t work, n;
	560	LIST_HEAD (tmp);
	561
	562	set_freezable();
	563	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	564	for (;;) {
	565	timeleft = schedule_timeout_interruptible(timeleft);
	566	/* swsusp */
	567	try_to_freeze();
	568	if (kthread_should_stop() && list_empty(&mp->m_sync_list))
	569	break;
	570
	571	spin_lock(&mp->m_sync_lock);
	572	/*
	573	* We can get woken by laptop mode, to do a sync -
	574	* that's the (only!) case where the list would be
	575	* empty with time remaining.
	576	*/
	577	if (!timeleft \|\| list_empty(&mp->m_sync_list)) {
	578	if (!timeleft)
	579	timeleft = xfs_syncd_centisecs *
	580	msecs_to_jiffies(10);
	581	INIT_LIST_HEAD(&mp->m_sync_work.w_list);
	582	list_add_tail(&mp->m_sync_work.w_list,
	583	&mp->m_sync_list);
	584	}
	585	list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
	586	list_move(&work->w_list, &tmp);
	587	spin_unlock(&mp->m_sync_lock);
	588
	589	list_for_each_entry_safe(work, n, &tmp, w_list) {
	590	(*work->w_syncer)(mp, work->w_data);
	591	list_del(&work->w_list);
	592	if (work == &mp->m_sync_work)
	593	continue;
	594	kmem_free(work);
	595	}
	596	}
	597
	598	return 0;
	599	}
	600
	601	int
	602	xfs_syncd_init(
	603	struct xfs_mount *mp)
	604	{
	605	mp->m_sync_work.w_syncer = xfs_sync_worker;
	606	mp->m_sync_work.w_mount = mp;
	607	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
	608	if (IS_ERR(mp->m_sync_task))
	609	return -PTR_ERR(mp->m_sync_task);
	610	return 0;
	611	}
	612
613	void
614	xfs_syncd_stop(
615	struct xfs_mount *mp)
616	{
617	kthread_stop(mp->m_sync_task);
618	}
619