Commit | Line | Data |
---|---|---|
3993baeb DW |
1 | /* |
2 | * Copyright (C) 2016 Oracle. All Rights Reserved. | |
3 | * | |
4 | * Author: Darrick J. Wong <darrick.wong@oracle.com> | |
5 | * | |
6 | * This program is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU General Public License | |
8 | * as published by the Free Software Foundation; either version 2 | |
9 | * of the License, or (at your option) any later version. | |
10 | * | |
11 | * This program is distributed in the hope that it would be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | * GNU General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU General Public License | |
17 | * along with this program; if not, write the Free Software Foundation, | |
18 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. | |
19 | */ | |
20 | #include "xfs.h" | |
21 | #include "xfs_fs.h" | |
22 | #include "xfs_shared.h" | |
23 | #include "xfs_format.h" | |
24 | #include "xfs_log_format.h" | |
25 | #include "xfs_trans_resv.h" | |
26 | #include "xfs_mount.h" | |
27 | #include "xfs_defer.h" | |
28 | #include "xfs_da_format.h" | |
29 | #include "xfs_da_btree.h" | |
30 | #include "xfs_inode.h" | |
31 | #include "xfs_trans.h" | |
32 | #include "xfs_inode_item.h" | |
33 | #include "xfs_bmap.h" | |
34 | #include "xfs_bmap_util.h" | |
35 | #include "xfs_error.h" | |
36 | #include "xfs_dir2.h" | |
37 | #include "xfs_dir2_priv.h" | |
38 | #include "xfs_ioctl.h" | |
39 | #include "xfs_trace.h" | |
40 | #include "xfs_log.h" | |
41 | #include "xfs_icache.h" | |
42 | #include "xfs_pnfs.h" | |
43 | #include "xfs_refcount_btree.h" | |
44 | #include "xfs_refcount.h" | |
45 | #include "xfs_bmap_btree.h" | |
46 | #include "xfs_trans_space.h" | |
47 | #include "xfs_bit.h" | |
48 | #include "xfs_alloc.h" | |
49 | #include "xfs_quota_defs.h" | |
50 | #include "xfs_quota.h" | |
51 | #include "xfs_btree.h" | |
52 | #include "xfs_bmap_btree.h" | |
53 | #include "xfs_reflink.h" | |
2a06705c | 54 | #include "xfs_iomap.h" |
3993baeb DW |
55 | |
56 | /* | |
57 | * Copy on Write of Shared Blocks | |
58 | * | |
59 | * XFS must preserve "the usual" file semantics even when two files share | |
60 | * the same physical blocks. This means that a write to one file must not | |
61 | * alter the blocks in a different file; the way that we'll do that is | |
62 | * through the use of a copy-on-write mechanism. At a high level, that | |
63 | * means that when we want to write to a shared block, we allocate a new | |
64 | * block, write the data to the new block, and if that succeeds we map the | |
65 | * new block into the file. | |
66 | * | |
67 | * XFS provides a "delayed allocation" mechanism that defers the allocation | |
68 | * of disk blocks to dirty-but-not-yet-mapped file blocks as long as | |
69 | * possible. This reduces fragmentation by enabling the filesystem to ask | |
70 | * for bigger chunks less often, which is exactly what we want for CoW. | |
71 | * | |
72 | * The delalloc mechanism begins when the kernel wants to make a block | |
73 | * writable (write_begin or page_mkwrite). If the offset is not mapped, we | |
74 | * create a delalloc mapping, which is a regular in-core extent, but without | |
75 | * a real startblock. (For delalloc mappings, the startblock encodes both | |
76 | * a flag that this is a delalloc mapping, and a worst-case estimate of how | |
77 | * many blocks might be required to put the mapping into the BMBT.) delalloc | |
78 | * mappings are a reservation against the free space in the filesystem; | |
79 | * adjacent mappings can also be combined into fewer larger mappings. | |
80 | * | |
81 | * When dirty pages are being written out (typically in writepage), the | |
82 | * delalloc reservations are converted into real mappings by allocating | |
83 | * blocks and replacing the delalloc mapping with real ones. A delalloc | |
84 | * mapping can be replaced by several real ones if the free space is | |
85 | * fragmented. | |
86 | * | |
87 | * We want to adapt the delalloc mechanism for copy-on-write, since the | |
88 | * write paths are similar. The first two steps (creating the reservation | |
89 | * and allocating the blocks) are exactly the same as delalloc except that | |
90 | * the mappings must be stored in a separate CoW fork because we do not want | |
91 | * to disturb the mapping in the data fork until we're sure that the write | |
92 | * succeeded. IO completion in this case is the process of removing the old | |
93 | * mapping from the data fork and moving the new mapping from the CoW fork to | |
94 | * the data fork. This will be discussed shortly. | |
95 | * | |
96 | * For now, unaligned directio writes will be bounced back to the page cache. | |
97 | * Block-aligned directio writes will use the same mechanism as buffered | |
98 | * writes. | |
99 | * | |
100 | * CoW remapping must be done after the data block write completes, | |
101 | * because we don't want to destroy the old data fork map until we're sure | |
102 | * the new block has been written. Since the new mappings are kept in a | |
103 | * separate fork, we can simply iterate these mappings to find the ones | |
104 | * that cover the file blocks that we just CoW'd. For each extent, simply | |
105 | * unmap the corresponding range in the data fork, map the new range into | |
106 | * the data fork, and remove the extent from the CoW fork. | |
107 | * | |
108 | * Since the remapping operation can be applied to an arbitrary file | |
109 | * range, we record the need for the remap step as a flag in the ioend | |
110 | * instead of declaring a new IO type. This is required for direct io | |
111 | * because we only have ioend for the whole dio, and we have to be able to | |
112 | * remember the presence of unwritten blocks and CoW blocks with a single | |
113 | * ioend structure. Better yet, the more ground we can cover with one | |
114 | * ioend, the better. | |
115 | */ | |
2a06705c DW |
116 | |
117 | /* | |
118 | * Given an AG extent, find the lowest-numbered run of shared blocks | |
119 | * within that range and return the range in fbno/flen. If | |
120 | * find_end_of_shared is true, return the longest contiguous extent of | |
121 | * shared blocks. If there are no shared extents, fbno and flen will | |
122 | * be set to NULLAGBLOCK and 0, respectively. | |
123 | */ | |
124 | int | |
125 | xfs_reflink_find_shared( | |
126 | struct xfs_mount *mp, | |
127 | xfs_agnumber_t agno, | |
128 | xfs_agblock_t agbno, | |
129 | xfs_extlen_t aglen, | |
130 | xfs_agblock_t *fbno, | |
131 | xfs_extlen_t *flen, | |
132 | bool find_end_of_shared) | |
133 | { | |
134 | struct xfs_buf *agbp; | |
135 | struct xfs_btree_cur *cur; | |
136 | int error; | |
137 | ||
138 | error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); | |
139 | if (error) | |
140 | return error; | |
141 | ||
142 | cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); | |
143 | ||
144 | error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, | |
145 | find_end_of_shared); | |
146 | ||
147 | xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); | |
148 | ||
149 | xfs_buf_relse(agbp); | |
150 | return error; | |
151 | } | |
152 | ||
153 | /* | |
154 | * Trim the mapping to the next block where there's a change in the | |
155 | * shared/unshared status. More specifically, this means that we | |
156 | * find the lowest-numbered extent of shared blocks that coincides with | |
157 | * the given block mapping. If the shared extent overlaps the start of | |
158 | * the mapping, trim the mapping to the end of the shared extent. If | |
159 | * the shared region intersects the mapping, trim the mapping to the | |
160 | * start of the shared extent. If there are no shared regions that | |
161 | * overlap, just return the original extent. | |
162 | */ | |
163 | int | |
164 | xfs_reflink_trim_around_shared( | |
165 | struct xfs_inode *ip, | |
166 | struct xfs_bmbt_irec *irec, | |
167 | bool *shared, | |
168 | bool *trimmed) | |
169 | { | |
170 | xfs_agnumber_t agno; | |
171 | xfs_agblock_t agbno; | |
172 | xfs_extlen_t aglen; | |
173 | xfs_agblock_t fbno; | |
174 | xfs_extlen_t flen; | |
175 | int error = 0; | |
176 | ||
177 | /* Holes, unwritten, and delalloc extents cannot be shared */ | |
178 | if (!xfs_is_reflink_inode(ip) || | |
179 | ISUNWRITTEN(irec) || | |
180 | irec->br_startblock == HOLESTARTBLOCK || | |
181 | irec->br_startblock == DELAYSTARTBLOCK) { | |
182 | *shared = false; | |
183 | return 0; | |
184 | } | |
185 | ||
186 | trace_xfs_reflink_trim_around_shared(ip, irec); | |
187 | ||
188 | agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); | |
189 | agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); | |
190 | aglen = irec->br_blockcount; | |
191 | ||
192 | error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, | |
193 | aglen, &fbno, &flen, true); | |
194 | if (error) | |
195 | return error; | |
196 | ||
197 | *shared = *trimmed = false; | |
198 | if (fbno == NULLAGBLOCK) { | |
199 | /* No shared blocks at all. */ | |
200 | return 0; | |
201 | } else if (fbno == agbno) { | |
202 | /* | |
203 | * The start of this extent is shared. Truncate the | |
204 | * mapping at the end of the shared region so that a | |
205 | * subsequent iteration starts at the start of the | |
206 | * unshared region. | |
207 | */ | |
208 | irec->br_blockcount = flen; | |
209 | *shared = true; | |
210 | if (flen != aglen) | |
211 | *trimmed = true; | |
212 | return 0; | |
213 | } else { | |
214 | /* | |
215 | * There's a shared extent midway through this extent. | |
216 | * Truncate the mapping at the start of the shared | |
217 | * extent so that a subsequent iteration starts at the | |
218 | * start of the shared region. | |
219 | */ | |
220 | irec->br_blockcount = fbno - agbno; | |
221 | *trimmed = true; | |
222 | return 0; | |
223 | } | |
224 | } | |
225 | ||
/*
 * Create a CoW reservation for a range of blocks within a file.
 *
 * Reserves delalloc blocks in the CoW fork for the shared part of the data
 * fork extent containing *offset_fsb.  On success, *offset_fsb is advanced
 * to the first block past the range handled (the end of an existing CoW
 * reservation, or the end of the data fork extent examined), so callers can
 * loop until the whole byte range is covered.  Caller holds the ILOCK.
 *
 * Returns 0 on success or a negative errno.
 */
static int
__xfs_reflink_reserve_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		*offset_fsb,	/* in/out: advanced past handled range */
	xfs_fileoff_t		end_fsb)
{
	struct xfs_bmbt_irec	got, prev, imap;
	xfs_fileoff_t		orig_end_fsb;
	int			nimaps, eof = 0, error = 0;
	bool			shared = false, trimmed = false;
	xfs_extnum_t		idx;

	/* Already reserved?  Skip the refcount btree access. */
	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
			&got, &prev);
	if (!eof && got.br_startoff <= *offset_fsb) {
		/* Existing CoW reservation covers us; just skip past it. */
		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
		trace_xfs_reflink_cow_found(ip, &got);
		goto done;
	}

	/* Read extent from the source file. */
	nimaps = 1;
	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
			&imap, &nimaps, 0);
	if (error)
		goto out_unlock;
	ASSERT(nimaps == 1);

	/* Trim the mapping to the nearest shared extent boundary. */
	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
	if (error)
		goto out_unlock;

	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;

	/* Not shared?  Just report the (potentially capped) extent. */
	if (!shared)
		goto done;

	/*
	 * Fork all the shared blocks from our write offset until the end of
	 * the extent.
	 */
	error = xfs_qm_dqattach_locked(ip, 0);
	if (error)
		goto out_unlock;

retry:
	/* eof/idx/got/prev carry state from the search above. */
	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
			end_fsb - *offset_fsb, &got,
			&prev, &idx, eof);
	switch (error) {
	case 0:
		break;
	case -ENOSPC:
	case -EDQUOT:
		/* retry without any preallocation */
		trace_xfs_reflink_cow_enospc(ip, &imap);
		/*
		 * NOTE(review): in all paths visible here end_fsb always
		 * equals orig_end_fsb, so this retry looks vestigial unless
		 * speculative preallocation extends end_fsb elsewhere —
		 * confirm against the full file.
		 */
		if (end_fsb != orig_end_fsb) {
			end_fsb = orig_end_fsb;
			goto retry;
		}
		/*FALLTHRU*/
	default:
		goto out_unlock;
	}

	trace_xfs_reflink_cow_alloc(ip, &got);
done:
	*offset_fsb = end_fsb;
out_unlock:
	return error;
}
301 | ||
302 | /* Create a CoW reservation for part of a file. */ | |
303 | int | |
304 | xfs_reflink_reserve_cow_range( | |
305 | struct xfs_inode *ip, | |
306 | xfs_off_t offset, | |
307 | xfs_off_t count) | |
308 | { | |
309 | struct xfs_mount *mp = ip->i_mount; | |
310 | xfs_fileoff_t offset_fsb, end_fsb; | |
311 | int error; | |
312 | ||
313 | trace_xfs_reflink_reserve_cow_range(ip, offset, count); | |
314 | ||
315 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | |
316 | end_fsb = XFS_B_TO_FSB(mp, offset + count); | |
317 | ||
318 | xfs_ilock(ip, XFS_ILOCK_EXCL); | |
319 | while (offset_fsb < end_fsb) { | |
320 | error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb); | |
321 | if (error) { | |
322 | trace_xfs_reflink_reserve_cow_range_error(ip, error, | |
323 | _RET_IP_); | |
324 | break; | |
325 | } | |
326 | } | |
327 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | |
328 | ||
329 | return error; | |
330 | } | |
ef473667 DW |
331 | |
332 | /* | |
333 | * Find the CoW reservation (and whether or not it needs block allocation) | |
334 | * for a given byte offset of a file. | |
335 | */ | |
336 | bool | |
337 | xfs_reflink_find_cow_mapping( | |
338 | struct xfs_inode *ip, | |
339 | xfs_off_t offset, | |
340 | struct xfs_bmbt_irec *imap, | |
341 | bool *need_alloc) | |
342 | { | |
343 | struct xfs_bmbt_irec irec; | |
344 | struct xfs_ifork *ifp; | |
345 | struct xfs_bmbt_rec_host *gotp; | |
346 | xfs_fileoff_t bno; | |
347 | xfs_extnum_t idx; | |
348 | ||
349 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); | |
350 | ASSERT(xfs_is_reflink_inode(ip)); | |
351 | ||
352 | /* Find the extent in the CoW fork. */ | |
353 | ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); | |
354 | bno = XFS_B_TO_FSBT(ip->i_mount, offset); | |
355 | gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); | |
356 | if (!gotp) | |
357 | return false; | |
358 | ||
359 | xfs_bmbt_get_all(gotp, &irec); | |
360 | if (bno >= irec.br_startoff + irec.br_blockcount || | |
361 | bno < irec.br_startoff) | |
362 | return false; | |
363 | ||
364 | trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, | |
365 | &irec); | |
366 | ||
367 | /* If it's still delalloc, we must allocate later. */ | |
368 | *imap = irec; | |
369 | *need_alloc = !!(isnullstartblock(irec.br_startblock)); | |
370 | ||
371 | return true; | |
372 | } | |
373 | ||
/*
 * Trim an extent to end at the next CoW reservation past offset_fsb.
 *
 * If a CoW fork extent starts inside *imap at or after offset_fsb, shorten
 * imap->br_blockcount so the mapping stops where that reservation begins.
 * Otherwise imap is left untouched.  Always returns 0.
 */
int
xfs_reflink_trim_irec_to_next_cow(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_bmbt_irec	irec;
	struct xfs_ifork	*ifp;
	struct xfs_bmbt_rec_host	*gotp;
	xfs_extnum_t		idx;

	/* Non-reflink inodes have no CoW fork to trim against. */
	if (!xfs_is_reflink_inode(ip))
		return 0;

	/* Find the extent in the CoW fork. */
	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
	if (!gotp)
		return 0;
	xfs_bmbt_get_all(gotp, &irec);

	/* This is the extent before; try sliding up one. */
	if (irec.br_startoff < offset_fsb) {
		idx++;
		/* No further extents in the in-core extent list. */
		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
			return 0;
		gotp = xfs_iext_get_ext(ifp, idx);
		xfs_bmbt_get_all(gotp, &irec);
	}

	/* CoW extent starts at or past the end of imap: nothing to trim. */
	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
		return 0;

	/* Stop the mapping where the next CoW reservation begins. */
	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
	trace_xfs_reflink_trim_irec(ip, imap);

	return 0;
}