[linux-2.6-block.git] / fs / iomap.c

/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include "internal.h"

/*
 * Run one iomap operation over the leading part of [pos, pos + length).
 *
 * The filesystem's ->iomap_begin method is asked for a mapping that covers
 * @pos.  The requested range is trimmed to the end of that mapping, @actor is
 * run over the trimmed range, and the optional ->iomap_end method is then
 * told how many bytes the actor reported (never less than zero).
 *
 * Working on a whole mapping at a time avoids mapping pages individually,
 * doing individual allocations for each page and, most importantly, taking
 * filesystem specific locks per page.  It is assumed that the filesystem
 * takes whatever resources it needs in ->iomap_begin and drops them again in
 * ->iomap_end.
 *
 * Returns the actor's result when that is non-zero (a byte count or a
 * negative errno).  Otherwise returns the result of ->iomap_end, or 0 when
 * the filesystem supplies no ->iomap_end.  A failing ->iomap_begin is
 * returned directly, and -EIO is returned if the mapping handed back starts
 * after @pos.
 */
loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
		struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
	struct iomap iomap = { 0 };
	loff_t written, ret;

	/*
	 * Ask the filesystem for the mapping at @pos.  The mapping may span
	 * many pages but describes a single kind of extent (e.g. all hole,
	 * all mapped or all unwritten).  Any space reservation the operation
	 * needs is expected to happen here, so that nothing has to be undone
	 * if this step fails.
	 */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	if (ret != 0)
		return ret;

	/* A mapping that begins beyond @pos does not cover it at all. */
	if (WARN_ON(pos < iomap.offset))
		return -EIO;

	/*
	 * The filesystem may hand back less than was asked for; shrink the
	 * range so the actor never runs past the end of the mapping.
	 */
	if (pos + length > iomap.offset + iomap.length)
		length = iomap.offset + iomap.length - pos;

	/* Do the actual work for this mapping. */
	written = actor(inode, pos, length, data, &iomap);

	/*
	 * Let the filesystem finish the range.  It is told the trimmed length
	 * and the amount actually processed; an actor error is reported to it
	 * as zero bytes.
	 */
	if (ops->iomap_end != NULL)
		ret = ops->iomap_end(inode, pos, length,
				     written > 0 ? written : 0,
				     flags, &iomap);

	if (written)
		return written;
	return ret;
}

/*
 * Clean up the page cache after a write over [pos, pos + len) fell short.
 *
 * Only the part of the range that lies beyond the current inode size is
 * dropped from the page cache, even if the write itself began inside the
 * existing file size.  Nothing is done when the range ends at or before EOF.
 */
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
	loff_t isize = i_size_read(inode);
	loff_t end = pos + len;

	if (end <= isize)
		return;

	truncate_pagecache_range(inode, pos > isize ? pos : isize, end);
}

/*
 * Get the page cache page for @pos locked and prepared for a write of @len
 * bytes that must lie entirely inside @iomap.
 *
 * On success returns 0 and stores the page in *@pagep.  If the page cannot be
 * obtained, -ENOMEM is returned and *@pagep is left untouched.  If preparing
 * the page fails, the page is unlocked and released, the failed range is
 * passed to iomap_write_failed(), *@pagep is set to NULL and the error is
 * returned.
 */
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, struct iomap *iomap)
{
	struct page *page;
	int status;

	/* Callers must never ask for more than the mapping covers. */
	BUG_ON(pos + len > iomap->offset + iomap->length);

	page = grab_cache_page_write_begin(inode->i_mapping,
			pos >> PAGE_SHIFT, flags);
	if (page == NULL)
		return -ENOMEM;

	status = __block_write_begin_int(page, pos, len, NULL, iomap);
	if (unlikely(status != 0)) {
		unlock_page(page);
		put_page(page);

		iomap_write_failed(inode, pos, len);
		*pagep = NULL;
		return status;
	}

	*pagep = page;
	return 0;
}

/*
 * Finish a write to @page that was set up by iomap_write_begin().
 *
 * Hands the range to generic_write_end() and returns whatever it returns.
 * If that is less than the @len that was prepared, the whole prepared range
 * is passed to iomap_write_failed() for cleanup.
 */
static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
		unsigned copied, struct page *page)
{
	int done = generic_write_end(NULL, inode->i_mapping, pos, len,
			copied, page, NULL);

	if (len > done)
		iomap_write_failed(inode, pos, len);

	return done;
}

/*
 * Actor for iomap_file_buffered_write(): copy data from the iov_iter passed
 * in @data into the page cache, one page at a time, for at most @length bytes
 * starting at @pos.  All pages written lie inside @iomap.
 *
 * Returns the number of bytes written if any were written; otherwise the
 * last status (0, or a negative errno from faulting in the source or from
 * iomap_write_begin()/iomap_write_end()).
 */
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct iov_iter *i = data;
	long status = 0;
	ssize_t written = 0;
	unsigned int flags = AOP_FLAG_NOFS;

	/*
	 * Copies from kernel address space cannot fail (NFSD is a big user).
	 */
	if (!iter_is_iovec(i))
		flags |= AOP_FLAG_UNINTERRUPTIBLE;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */

		/* Never cross a page boundary or run past the iterator. */
		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));
again:
		/* Never run past the end of the range we were given either. */
		if (bytes > length)
			bytes = length;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 *
		 * Not only is this an optimisation, but it is also required
		 * to check that the address is actually valid, when atomic
		 * usercopies are used, below.
		 */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			status = -EFAULT;
			break;
		}

		status = iomap_write_begin(inode, pos, bytes, flags, &page,
				iomap);
		if (unlikely(status))
			break;

		if (mapping_writably_mapped(inode->i_mapping))
			flush_dcache_page(page);

		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);

		flush_dcache_page(page);

		/*
		 * iomap_write_end() reports how much of the copy it accepted;
		 * from here on that is the amount treated as copied.
		 */
		status = iomap_write_end(inode, pos, bytes, copied, page);
		if (unlikely(status < 0))
			break;
		copied = status;

		cond_resched();

		iov_iter_advance(i, copied);
		if (unlikely(copied == 0)) {
			/*
			 * If we were unable to copy any data at all, we must
			 * fall back to a single segment length write.
			 *
			 * If we didn't fallback here, we could livelock
			 * because not all segments in the iov can be copied at
			 * once without a pagefault.
			 */
			bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_single_seg_count(i));
			goto again;
		}
		pos += copied;
		written += copied;
		length -= copied;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (iov_iter_count(i) && length);

	/* Partial progress wins over a later error. */
	return written ? written : status;
}

/*
 * Buffered write of everything in @iter to the file behind @iocb, starting
 * at iocb->ki_pos.
 *
 * iomap_apply() is called with iomap_write_actor once per mapping until the
 * iterator is drained or a call makes no progress.  Returns the total number
 * of bytes written if that is non-zero, otherwise the result of the last
 * iomap_apply() call (0 if the iterator was empty to begin with).  The file
 * position in @iocb is not updated here.
 */
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
		struct iomap_ops *ops)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	loff_t start = iocb->ki_pos;
	loff_t done = 0;
	loff_t ret = 0;

	while (iov_iter_count(iter) != 0) {
		ret = iomap_apply(inode, start + done, iov_iter_count(iter),
				IOMAP_WRITE, ops, iter, iomap_write_actor);
		if (ret <= 0)
			break;
		done += ret;
	}

	if (done)
		return done;
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);

/*
 * Read the page cache page that contains byte @offset of @inode.
 *
 * Returns the page with a reference held when it is up to date.  Returns the
 * error pointer from read_mapping_page() when the read fails, or
 * ERR_PTR(-EIO) (after dropping the reference) when the page came back but
 * is not up to date.
 */
static struct page *
__iomap_read_page(struct inode *inode, loff_t offset)
{
	struct page *page;

	page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
	if (IS_ERR(page) || PageUptodate(page))
		return page;

	put_page(page);
	return ERR_PTR(-EIO);
}

/*
 * Actor for iomap_file_dirty(): re-dirty the existing contents of
 * [pos, pos + length) one page at a time.
 *
 * Each page is first read in via __iomap_read_page() and then run through
 * iomap_write_begin()/iomap_write_end() with the full chunk reported as
 * copied, so the data itself is not modified.  @data is unused.
 *
 * Returns the number of bytes processed, or a negative errno.  A zero return
 * from iomap_write_end() is unexpected and reported as -EIO.
 */
static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page, *rpage;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */

		offset = (pos & (PAGE_SIZE - 1));
		/*
		 * Compare as loff_t: @length is 64 bits wide, and comparing
		 * it as unsigned long would silently truncate it wherever
		 * unsigned long is only 32 bits.  The result is at most
		 * PAGE_SIZE, so it always fits in @bytes.
		 */
		bytes = min_t(loff_t, PAGE_SIZE - offset, length);

		/*
		 * Read the page in before taking it for write; the reference
		 * is held across iomap_write_begin() and dropped right after.
		 */
		rpage = __iomap_read_page(inode, pos);
		if (IS_ERR(rpage))
			return PTR_ERR(rpage);

		status = iomap_write_begin(inode, pos, bytes,
				AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE,
				&page, iomap);
		put_page(rpage);
		if (unlikely(status))
			return status;

		WARN_ON_ONCE(!PageUptodate(page));

		status = iomap_write_end(inode, pos, bytes, bytes, page);
		if (unlikely(status <= 0)) {
			if (WARN_ON_ONCE(status == 0))
				return -EIO;
			return status;
		}

		cond_resched();

		pos += status;
		written += status;
		length -= status;

		balance_dirty_pages_ratelimited(inode->i_mapping);
	} while (length);

	return written;
}

/*
 * Re-dirty @len bytes of @inode starting at @pos.
 *
 * iomap_apply() is called with iomap_dirty_actor once per mapping until the
 * whole range has been processed.  Returns 0 on completion, or the first
 * result from iomap_apply() that is zero or negative.
 */
int
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
		struct iomap_ops *ops)
{
	while (len != 0) {
		loff_t done = iomap_apply(inode, pos, len, IOMAP_WRITE, ops,
				NULL, iomap_dirty_actor);

		if (done <= 0)
			return done;

		pos += done;
		len -= done;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_dirty);

/*
 * Zero @bytes bytes at in-page offset @offset of the page cache page that
 * holds file position @pos.
 *
 * The page is taken through iomap_write_begin(), zeroed, marked accessed and
 * then completed with iomap_write_end().  Returns the error from
 * iomap_write_begin() if that fails, otherwise the result of
 * iomap_write_end().
 */
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
		unsigned bytes, struct iomap *iomap)
{
	struct page *page;
	int err;

	err = iomap_write_begin(inode, pos, bytes,
			AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
	if (err != 0)
		return err;

	zero_user(page, offset, bytes);
	mark_page_accessed(page);

	return iomap_write_end(inode, pos, bytes, bytes, page);
}

/*
 * Zero @bytes bytes at in-page offset @offset directly on the block device
 * behind @iomap.
 *
 * The target sector is the mapping's starting block plus the distance, in
 * 512-byte units, from the start of the mapping to the page-aligned @pos.
 * Returns the result of __dax_zero_page_range().
 */
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
		struct iomap *iomap)
{
	sector_t sector;

	sector = iomap->blkno +
		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
}

/*
 * Actor for iomap_zero_range(): zero [pos, pos + count) within @iomap.
 *
 * Holes and unwritten extents are treated as already zero and skipped in one
 * step.  Otherwise the range is zeroed a page at a time, through
 * iomap_dax_zero() for DAX inodes and through iomap_zero() for everything
 * else.  @data, when not NULL, points to a bool that is set to true once any
 * zeroing has actually been performed.
 *
 * Returns the number of bytes covered, or a negative errno.
 */
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
		void *data, struct iomap *iomap)
{
	bool *did_zero = data;
	loff_t written = 0;
	int status;

	/* already zeroed?  we're done. */
	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
		return count;

	do {
		unsigned offset, bytes;

		offset = pos & (PAGE_SIZE - 1); /* Within page */
		/*
		 * Compare as loff_t: truncating @count to 32 bits first would
		 * turn any multiple of 4GiB into 0, and a zero-byte step
		 * would make this loop spin forever.  The result is at most
		 * PAGE_SIZE, so it always fits in @bytes.
		 */
		bytes = min_t(loff_t, PAGE_SIZE - offset, count);

		if (IS_DAX(inode))
			status = iomap_dax_zero(pos, offset, bytes, iomap);
		else
			status = iomap_zero(inode, pos, offset, bytes, iomap);
		if (status < 0)
			return status;

		pos += bytes;
		count -= bytes;
		written += bytes;
		if (did_zero)
			*did_zero = true;
	} while (count > 0);

	return written;
}

/*
 * Zero @len bytes of @inode starting at @pos.
 *
 * iomap_apply() is called with iomap_zero_range_actor once per mapping until
 * the range is exhausted.  @did_zero is handed through to the actor
 * unchanged.  Returns 0 on completion, or the first result from
 * iomap_apply() that is zero or negative.
 */
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
		struct iomap_ops *ops)
{
	loff_t done;

	for (; len > 0; pos += done, len -= done) {
		done = iomap_apply(inode, pos, len, IOMAP_ZERO,
				ops, did_zero, iomap_zero_range_actor);
		if (done <= 0)
			return done;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_zero_range);

/*
 * Zero from @pos to the end of the filesystem block that contains it.
 *
 * The block size is taken from inode->i_blkbits.  When @pos already sits on
 * a block boundary there is nothing to zero and 0 is returned; otherwise the
 * result of iomap_zero_range() for the rest of the block is returned.
 */
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
		struct iomap_ops *ops)
{
	unsigned blocksize = (1 << inode->i_blkbits);
	unsigned in_block = pos & (blocksize - 1);

	if (in_block == 0)
		return 0;

	return iomap_zero_range(inode, pos, blocksize - in_block, did_zero,
			ops);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);

/*
 * Actor for iomap_page_mkwrite(): prepare and commit @length bytes of the
 * page passed in @data against @iomap.
 *
 * @pos is masked with ~PAGE_MASK before being handed to
 * __block_write_begin_int() (presumably leaving only the offset within the
 * page), and the commit always covers [0, @length) of the page.
 *
 * Returns @length on success or the error from __block_write_begin_int().
 */
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct page *page = data;
	int err;

	err = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
			NULL, iomap);
	if (err == 0) {
		block_commit_write(page, 0, length);
		return length;
	}

	return err;
}

/*
 * Handle a write fault on the page cache page in vmf->page.
 *
 * With the page locked, check that it still belongs to the inode's mapping
 * and does not start beyond i_size.  Then run iomap_page_mkwrite_actor over
 * the part of the page that lies inside EOF, one iomap_apply() call per
 * mapping.
 *
 * On success the page is marked dirty, wait_for_stable_page() is called and
 * 0 is returned with the page still locked.  On failure the page is unlocked
 * and either -EFAULT (page no longer valid for this file) or the result of
 * the failing iomap_apply() call is returned.
 *
 * NOTE(review): a zero return from iomap_apply() also takes the unlock path
 * and makes this function return 0 -- confirm callers cope with a 0 return
 * and an unlocked page.
 */
int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
		struct iomap_ops *ops)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	unsigned long length;
	loff_t offset, size;
	ssize_t ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/*
	 * page is wholly or partially inside EOF: if the page extends past
	 * i_size only the bytes up to i_size are processed, otherwise the
	 * whole page is.
	 */
	if (((page->index + 1) << PAGE_SHIFT) > size)
		length = size & ~PAGE_MASK;
	else
		length = PAGE_SIZE;

	/* Walk the page one mapping at a time until it is fully covered. */
	offset = page_offset(page);
	while (length > 0) {
		ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
				ops, page, iomap_page_mkwrite_actor);
		if (unlikely(ret <= 0))
			goto out_unlock;
		offset += ret;
		length -= ret;
	}

	/* Success: the page stays locked for the caller. */
	set_page_dirty(page);
	wait_for_stable_page(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

/*
 * State shared between iomap_fiemap() and iomap_fiemap_actor() for the
 * duration of one fiemap request.
 */
struct fiemap_ctx {
	/* Extent info that mappings are reported into. */
	struct fiemap_extent_info *fi;
	/*
	 * Most recent non-hole mapping seen, not yet reported.  It starts out
	 * as IOMAP_HOLE and is reported when the next non-hole mapping
	 * arrives or when the walk ends.
	 */
	struct iomap prev;
};

/*
 * Report one mapping to fiemap.
 *
 * Holes are skipped and yield 0.  For anything else the mapping's type and
 * flags are translated into FIEMAP_EXTENT_* bits, OR-ed into the caller's
 * @flags, and the extent is passed to fiemap_fill_next_extent(), whose
 * result is returned.  The physical address reported is the block number
 * shifted left by 9, or 0 when the mapping has no block (IOMAP_NULL_BLOCK).
 */
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
		struct iomap *iomap, u32 flags)
{
	/* skip holes */
	if (iomap->type == IOMAP_HOLE)
		return 0;

	/* IOMAP_MAPPED (and any other type) adds no type-specific bit. */
	if (iomap->type == IOMAP_DELALLOC)
		flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
	else if (iomap->type == IOMAP_UNWRITTEN)
		flags |= FIEMAP_EXTENT_UNWRITTEN;

	if (iomap->flags & IOMAP_F_MERGED)
		flags |= FIEMAP_EXTENT_MERGED;
	if (iomap->flags & IOMAP_F_SHARED)
		flags |= FIEMAP_EXTENT_SHARED;

	return fiemap_fill_next_extent(fi, iomap->offset,
			iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
			iomap->length, flags);
}

/*
 * Actor for iomap_fiemap(): reporting runs one mapping behind the walk.
 *
 * A hole is skipped without touching the saved mapping.  For any other
 * mapping, the previously saved one is reported through iomap_to_fiemap()
 * and the current one is saved in its place, whatever the outcome of the
 * report.
 *
 * Returns @length to continue the walk, 0 when iomap_to_fiemap() returned 1
 * (extent array full), or any other value from iomap_to_fiemap() as is.
 */
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct fiemap_ctx *ctx = data;
	int err;

	if (iomap->type == IOMAP_HOLE)
		return length;

	err = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	ctx->prev = *iomap;

	if (err == 0)		/* success */
		return length;
	if (err == 1)		/* extent array full */
		return 0;
	return err;
}

/*
 * fiemap implementation on top of iomap: report the extents of @inode in
 * [start, start + len) into @fi.
 *
 * Only FIEMAP_FLAG_SYNC is accepted; when it is set the inode's dirty pages
 * are written back and waited on first.  The range is then walked with
 * iomap_apply() and iomap_fiemap_actor, which reports each mapping one step
 * late.  The mapping still pending when the walk stops is reported here with
 * FIEMAP_EXTENT_LAST.
 *
 * Returns 0 on success or a negative errno.  -ENOENT from the walk is not an
 * error; it simply ends the walk.
 */
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
		loff_t start, loff_t len, struct iomap_ops *ops)
{
	struct fiemap_ctx ctx;
	loff_t ret;

	/* Start with "nothing pending": a hole is never reported. */
	memset(&ctx, 0, sizeof(ctx));
	ctx.fi = fi;
	ctx.prev.type = IOMAP_HOLE;

	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	if (ret != 0)
		return ret;

	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
		ret = filemap_write_and_wait(inode->i_mapping);
		if (ret != 0)
			return ret;
	}

	while (len > 0) {
		ret = iomap_apply(inode, start, len, 0, ops, &ctx,
				iomap_fiemap_actor);
		/*
		 * inode with no (attribute) mapping will give ENOENT; a zero
		 * return means the walk cannot go any further.
		 */
		if (ret == -ENOENT || ret == 0)
			break;
		if (ret < 0)
			return ret;

		start += ret;
		len -= ret;
	}

	/* Flush the mapping that is still pending, flagged as the last. */
	if (ctx.prev.type == IOMAP_HOLE)
		return 0;

	ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
	if (ret < 0)
		return ret;

	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
Commit	Line	Data
ae259a9c CH	1	/*
	2	* Copyright (C) 2010 Red Hat, Inc.
	3	* Copyright (c) 2016 Christoph Hellwig.
	4	*
	5	* This program is free software; you can redistribute it and/or modify it
	6	* under the terms and conditions of the GNU General Public License,
	7	* version 2, as published by the Free Software Foundation.
	8	*
	9	* This program is distributed in the hope it will be useful, but WITHOUT
	10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
	11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
	12	* more details.
	13	*/
	14	#include <linux/module.h>
	15	#include <linux/compiler.h>
	16	#include <linux/fs.h>
	17	#include <linux/iomap.h>
	18	#include <linux/uaccess.h>
	19	#include <linux/gfp.h>
	20	#include <linux/mm.h>
	21	#include <linux/swap.h>
	22	#include <linux/pagemap.h>
	23	#include <linux/file.h>
	24	#include <linux/uio.h>
	25	#include <linux/backing-dev.h>
	26	#include <linux/buffer_head.h>
9a286f0e	27	#include <linux/dax.h>
ae259a9c CH	28	#include "internal.h"
ae259a9c CH	29
ae259a9c CH	30	/*
	31	* Execute a iomap write on a segment of the mapping that spans a
	32	* contiguous range of pages that have identical block mapping state.
	33	*
	34	* This avoids the need to map pages individually, do individual allocations
	35	* for each page and most importantly avoid the need for filesystem specific
	36	* locking per page. Instead, all the operations are amortised over the entire
	37	* range of pages. It is assumed that the filesystems will lock whatever
	38	* resources they require in the iomap_begin call, and release them in the
	39	* iomap_end call.
	40	*/
befb503c	41	loff_t
ae259a9c CH	42	iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
	43	struct iomap_ops ops, void data, iomap_actor_t actor)
	44	{
	45	struct iomap iomap = { 0 };
	46	loff_t written = 0, ret;
	47
	48	/*
	49	* Need to map a range from start position for length bytes. This can
	50	* span multiple pages - it is only guaranteed to return a range of a
	51	* single type of pages (e.g. all into a hole, all mapped or all
	52	* unwritten). Failure at this point has nothing to undo.
	53	*
	54	* If allocation is required for this range, reserve the space now so
	55	* that the allocation is guaranteed to succeed later on. Once we copy
	56	* the data into the page cache pages, then we cannot fail otherwise we
	57	* expose transient stale data. If the reserve fails, we can safely
	58	* back out at this point as there is nothing to undo.
	59	*/
	60	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	61	if (ret)
	62	return ret;
	63	if (WARN_ON(iomap.offset > pos))
	64	return -EIO;
	65
	66	/*
	67	* Cut down the length to the one actually provided by the filesystem,
	68	* as it might not be able to give us the whole size that we requested.
	69	*/
	70	if (iomap.offset + iomap.length < pos + length)
	71	length = iomap.offset + iomap.length - pos;
	72
	73	/*
	74	* Now that we have guaranteed that the space allocation will succeed.
	75	* we can do the copy-in page by page without having to worry about
	76	* failures exposing transient data.
	77	*/
	78	written = actor(inode, pos, length, data, &iomap);
	79
	80	/*
	81	* Now the data has been copied, commit the range we've copied. This
	82	* should not fail unless the filesystem has had a fatal error.
	83	*/
f20ac7ab CH	84	if (ops->iomap_end) {
	85	ret = ops->iomap_end(inode, pos, length,
	86	written > 0 ? written : 0,
	87	flags, &iomap);
	88	}
ae259a9c CH	89
	90	return written ? written : ret;
	91	}
	92
	93	static void
	94	iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
	95	{
	96	loff_t i_size = i_size_read(inode);
	97
	98	/*
	99	* Only truncate newly allocated pages beyoned EOF, even if the
	100	* write started inside the existing inode size.
	101	*/
	102	if (pos + len > i_size)
	103	truncate_pagecache_range(inode, max(pos, i_size), pos + len);
	104	}
	105
	106	static int
	107	iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
	108	struct page *pagep, struct iomap iomap)
	109	{
	110	pgoff_t index = pos >> PAGE_SHIFT;
	111	struct page *page;
	112	int status = 0;
	113
	114	BUG_ON(pos + len > iomap->offset + iomap->length);
	115
	116	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
	117	if (!page)
	118	return -ENOMEM;
	119
	120	status = __block_write_begin_int(page, pos, len, NULL, iomap);
	121	if (unlikely(status)) {
	122	unlock_page(page);
	123	put_page(page);
	124	page = NULL;
	125
	126	iomap_write_failed(inode, pos, len);
	127	}
	128
	129	*pagep = page;
	130	return status;
	131	}
	132
	133	static int
	134	iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
	135	unsigned copied, struct page *page)
	136	{
	137	int ret;
	138
	139	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
	140	copied, page, NULL);
	141	if (ret < len)
	142	iomap_write_failed(inode, pos, len);
	143	return ret;
	144	}
	145
	146	static loff_t
	147	iomap_write_actor(struct inode inode, loff_t pos, loff_t length, void data,
	148	struct iomap *iomap)
	149	{
	150	struct iov_iter *i = data;
	151	long status = 0;
	152	ssize_t written = 0;
153	unsigned int flags = AOP_FLAG_NOFS;
154
155	/*
156	* Copies from kernel address space cannot fail (NFSD is a big user).
157	*/
158	if (!iter_is_iovec(i))
159	flags \|= AOP_FLAG_UNINTERRUPTIBLE;
160
161	do {
162	struct page *page;
163	unsigned long offset; /* Offset into pagecache page */
164	unsigned long bytes; /* Bytes to write to page */
165	size_t copied; /* Bytes copied from user */
166
167	offset = (pos & (PAGE_SIZE - 1));
168	bytes = min_t(unsigned long, PAGE_SIZE - offset,
169	iov_iter_count(i));
170	again:
171	if (bytes > length)
172	bytes = length;
173
174	/*
175	* Bring in the user page that we will copy from _first_.
176	* Otherwise there's a nasty deadlock on copying from the
177	* same page as we're writing to, without it being marked
178	* up-to-date.
179	*
180	* Not only is this an optimisation, but it is also required
181	* to check that the address is actually valid, when atomic
182	* usercopies are used, below.
183	*/
184	if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
185	status = -EFAULT;
186	break;
187	}
188
189	status = iomap_write_begin(inode, pos, bytes, flags, &page,
190	iomap);
191	if (unlikely(status))
192	break;
193
194	if (mapping_writably_mapped(inode->i_mapping))
195	flush_dcache_page(page);
196
ae259a9c	197	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
ae259a9c CH	198
ae259a9c CH	199	flush_dcache_page(page);
ae259a9c CH	200
	201	status = iomap_write_end(inode, pos, bytes, copied, page);
	202	if (unlikely(status < 0))
	203	break;
	204	copied = status;
	205
	206	cond_resched();
	207
	208	iov_iter_advance(i, copied);
	209	if (unlikely(copied == 0)) {
	210	/*
	211	* If we were unable to copy any data at all, we must
	212	* fall back to a single segment length write.
	213	*
	214	* If we didn't fallback here, we could livelock
	215	* because not all segments in the iov can be copied at
	216	* once without a pagefault.
	217	*/
	218	bytes = min_t(unsigned long, PAGE_SIZE - offset,
	219	iov_iter_single_seg_count(i));
	220	goto again;
	221	}
	222	pos += copied;
	223	written += copied;
	224	length -= copied;
	225
	226	balance_dirty_pages_ratelimited(inode->i_mapping);
	227	} while (iov_iter_count(i) && length);
	228
	229	return written ? written : status;
	230	}
	231
	232	ssize_t
	233	iomap_file_buffered_write(struct kiocb iocb, struct iov_iter iter,
	234	struct iomap_ops *ops)
	235	{
	236	struct inode *inode = iocb->ki_filp->f_mapping->host;
	237	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
	238
	239	while (iov_iter_count(iter)) {
	240	ret = iomap_apply(inode, pos, iov_iter_count(iter),
	241	IOMAP_WRITE, ops, iter, iomap_write_actor);
	242	if (ret <= 0)
	243	break;
	244	pos += ret;
	245	written += ret;
	246	}
	247
	248	return written ? written : ret;
	249	}
	250	EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
	251
5f4e5752 CH	252	static struct page *
	253	__iomap_read_page(struct inode *inode, loff_t offset)
	254	{
	255	struct address_space *mapping = inode->i_mapping;
	256	struct page *page;
	257
	258	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
	259	if (IS_ERR(page))
	260	return page;
	261	if (!PageUptodate(page)) {
	262	put_page(page);
	263	return ERR_PTR(-EIO);
	264	}
	265	return page;
	266	}
	267
	268	static loff_t
	269	iomap_dirty_actor(struct inode inode, loff_t pos, loff_t length, void data,
	270	struct iomap *iomap)
	271	{
	272	long status = 0;
	273	ssize_t written = 0;
	274
	275	do {
	276	struct page page, rpage;
	277	unsigned long offset; /* Offset into pagecache page */
	278	unsigned long bytes; /* Bytes to write to page */
	279
	280	offset = (pos & (PAGE_SIZE - 1));
	281	bytes = min_t(unsigned long, PAGE_SIZE - offset, length);
	282
	283	rpage = __iomap_read_page(inode, pos);
	284	if (IS_ERR(rpage))
	285	return PTR_ERR(rpage);
	286
	287	status = iomap_write_begin(inode, pos, bytes,
	288	AOP_FLAG_NOFS \| AOP_FLAG_UNINTERRUPTIBLE,
	289	&page, iomap);
	290	put_page(rpage);
	291	if (unlikely(status))
	292	return status;
	293
	294	WARN_ON_ONCE(!PageUptodate(page));
	295
	296	status = iomap_write_end(inode, pos, bytes, bytes, page);
	297	if (unlikely(status <= 0)) {
	298	if (WARN_ON_ONCE(status == 0))
	299	return -EIO;
	300	return status;
	301	}
	302
	303	cond_resched();
	304
	305	pos += status;
	306	written += status;
	307	length -= status;
	308
	309	balance_dirty_pages_ratelimited(inode->i_mapping);
	310	} while (length);
	311
	312	return written;
	313	}
	314
	315	int
316	iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
317	struct iomap_ops *ops)
318	{
319	loff_t ret;
320
321	while (len) {
322	ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
323	iomap_dirty_actor);
324	if (ret <= 0)
325	return ret;
326	pos += ret;
327	len -= ret;
328	}
329
330	return 0;
331	}
332	EXPORT_SYMBOL_GPL(iomap_file_dirty);
333
ae259a9c CH	334	static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
	335	unsigned bytes, struct iomap *iomap)
	336	{
	337	struct page *page;
	338	int status;
	339
	340	status = iomap_write_begin(inode, pos, bytes,
	341	AOP_FLAG_UNINTERRUPTIBLE \| AOP_FLAG_NOFS, &page, iomap);
	342	if (status)
	343	return status;
	344
	345	zero_user(page, offset, bytes);
	346	mark_page_accessed(page);
	347
	348	return iomap_write_end(inode, pos, bytes, bytes, page);
	349	}
	350
9a286f0e CH	351	static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
	352	struct iomap *iomap)
	353	{
	354	sector_t sector = iomap->blkno +
	355	(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
	356
	357	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
	358	}
	359
ae259a9c CH	360	static loff_t
	361	iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
	362	void data, struct iomap iomap)
	363	{
	364	bool *did_zero = data;
	365	loff_t written = 0;
	366	int status;
	367
	368	/* already zeroed? we're done. */
	369	if (iomap->type == IOMAP_HOLE \|\| iomap->type == IOMAP_UNWRITTEN)
	370	return count;
	371
	372	do {
	373	unsigned offset, bytes;
	374
	375	offset = pos & (PAGE_SIZE - 1); /* Within page */
	376	bytes = min_t(unsigned, PAGE_SIZE - offset, count);
	377
9a286f0e CH	378	if (IS_DAX(inode))
	379	status = iomap_dax_zero(pos, offset, bytes, iomap);
	380	else
	381	status = iomap_zero(inode, pos, offset, bytes, iomap);
ae259a9c CH	382	if (status < 0)
	383	return status;
	384
	385	pos += bytes;
	386	count -= bytes;
	387	written += bytes;
	388	if (did_zero)
	389	*did_zero = true;
	390	} while (count > 0);
	391
	392	return written;
	393	}
	394
	395	int
	396	iomap_zero_range(struct inode inode, loff_t pos, loff_t len, bool did_zero,
	397	struct iomap_ops *ops)
	398	{
	399	loff_t ret;
	400
	401	while (len > 0) {
	402	ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
	403	ops, did_zero, iomap_zero_range_actor);
	404	if (ret <= 0)
	405	return ret;
	406
	407	pos += ret;
	408	len -= ret;
	409	}
	410
	411	return 0;
	412	}
	413	EXPORT_SYMBOL_GPL(iomap_zero_range);
	414
	415	int
	416	iomap_truncate_page(struct inode inode, loff_t pos, bool did_zero,
	417	struct iomap_ops *ops)
	418	{
	419	unsigned blocksize = (1 << inode->i_blkbits);
	420	unsigned off = pos & (blocksize - 1);
	421
	422	/* Block boundary? Nothing to do */
	423	if (!off)
	424	return 0;
	425	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
	426	}
	427	EXPORT_SYMBOL_GPL(iomap_truncate_page);
	428
	429	static loff_t
	430	iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
	431	void data, struct iomap iomap)
	432	{
	433	struct page *page = data;
	434	int ret;
	435
	436	ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
	437	NULL, iomap);
	438	if (ret)
	439	return ret;
	440
	441	block_commit_write(page, 0, length);
	442	return length;
	443	}
	444
	445	int iomap_page_mkwrite(struct vm_area_struct vma, struct vm_fault vmf,
446	struct iomap_ops *ops)
447	{
448	struct page *page = vmf->page;
449	struct inode *inode = file_inode(vma->vm_file);
450	unsigned long length;
451	loff_t offset, size;
452	ssize_t ret;
453
454	lock_page(page);
455	size = i_size_read(inode);
456	if ((page->mapping != inode->i_mapping) \|\|
457	(page_offset(page) > size)) {
458	/* We overload EFAULT to mean page got truncated */
459	ret = -EFAULT;
460	goto out_unlock;
461	}
462
463	/* page is wholly or partially inside EOF */
464	if (((page->index + 1) << PAGE_SHIFT) > size)
465	length = size & ~PAGE_MASK;
466	else
467	length = PAGE_SIZE;
468
469	offset = page_offset(page);
470	while (length > 0) {
471	ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
472	ops, page, iomap_page_mkwrite_actor);
473	if (unlikely(ret <= 0))
474	goto out_unlock;
475	offset += ret;
476	length -= ret;
477	}
478
479	set_page_dirty(page);
480	wait_for_stable_page(page);
481	return 0;
482	out_unlock:
483	unlock_page(page);
484	return ret;
485	}
486	EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
8be9f564 CH	487
	488	struct fiemap_ctx {
	489	struct fiemap_extent_info *fi;
	490	struct iomap prev;
	491	};
	492
	493	static int iomap_to_fiemap(struct fiemap_extent_info *fi,
	494	struct iomap *iomap, u32 flags)
	495	{
	496	switch (iomap->type) {
	497	case IOMAP_HOLE:
	498	/* skip holes */
	499	return 0;
	500	case IOMAP_DELALLOC:
	501	flags \|= FIEMAP_EXTENT_DELALLOC \| FIEMAP_EXTENT_UNKNOWN;
	502	break;
	503	case IOMAP_UNWRITTEN:
	504	flags \|= FIEMAP_EXTENT_UNWRITTEN;
	505	break;
	506	case IOMAP_MAPPED:
	507	break;
	508	}
	509
17de0a9f CH	510	if (iomap->flags & IOMAP_F_MERGED)
17de0a9f CH	511	flags \|= FIEMAP_EXTENT_MERGED;
e43c460d DW	512	if (iomap->flags & IOMAP_F_SHARED)
e43c460d DW	513	flags \|= FIEMAP_EXTENT_SHARED;
17de0a9f	514
8be9f564 CH	515	return fiemap_fill_next_extent(fi, iomap->offset,
8be9f564 CH	516	iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
17de0a9f	517	iomap->length, flags);
8be9f564 CH	518
	519	}
	520
	521	static loff_t
	522	iomap_fiemap_actor(struct inode inode, loff_t pos, loff_t length, void data,
	523	struct iomap *iomap)
	524	{
	525	struct fiemap_ctx *ctx = data;
	526	loff_t ret = length;
	527
	528	if (iomap->type == IOMAP_HOLE)
	529	return length;
	530
	531	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
	532	ctx->prev = *iomap;
	533	switch (ret) {
	534	case 0: /* success */
	535	return length;
	536	case 1: /* extent array full */
	537	return 0;
	538	default:
	539	return ret;
	540	}
	541	}
	542
	543	int iomap_fiemap(struct inode inode, struct fiemap_extent_info fi,
	544	loff_t start, loff_t len, struct iomap_ops *ops)
	545	{
	546	struct fiemap_ctx ctx;
	547	loff_t ret;
	548
	549	memset(&ctx, 0, sizeof(ctx));
	550	ctx.fi = fi;
	551	ctx.prev.type = IOMAP_HOLE;
	552
	553	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
	554	if (ret)
	555	return ret;
	556
8896b8f6 DC	557	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
	558	ret = filemap_write_and_wait(inode->i_mapping);
	559	if (ret)
	560	return ret;
	561	}
8be9f564 CH	562
	563	while (len > 0) {
	564	ret = iomap_apply(inode, start, len, 0, ops, &ctx,
	565	iomap_fiemap_actor);
ac2dc058 DC	566	/* inode with no (attribute) mapping will give ENOENT */
	567	if (ret == -ENOENT)
	568	break;
8be9f564 CH	569	if (ret < 0)
	570	return ret;
	571	if (ret == 0)
	572	break;
	573
	574	start += ret;
	575	len -= ret;
	576	}
	577
	578	if (ctx.prev.type != IOMAP_HOLE) {
	579	ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
	580	if (ret < 0)
	581	return ret;
	582	}
	583
	584	return 0;
	585	}
	586	EXPORT_SYMBOL_GPL(iomap_fiemap);