1 | /* |
2 | * fs/dax.c - Direct Access filesystem code | |
3 | * Copyright (c) 2013-2014 Intel Corporation | |
4 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> | |
5 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> | |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify it | |
8 | * under the terms and conditions of the GNU General Public License, | |
9 | * version 2, as published by the Free Software Foundation. | |
10 | * | |
11 | * This program is distributed in the hope it will be useful, but WITHOUT | |
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | |
14 | * more details. | |
15 | */ | |
16 | ||
17 | #include <linux/atomic.h> | |
18 | #include <linux/blkdev.h> | |
19 | #include <linux/buffer_head.h> | |
20 | #include <linux/fs.h> | |
21 | #include <linux/genhd.h> | |
22 | #include <linux/mutex.h> | |
#include <linux/sched.h>
#include <linux/uio.h>

289c6aed MW |
26 | int dax_clear_blocks(struct inode *inode, sector_t block, long size) |
27 | { | |
28 | struct block_device *bdev = inode->i_sb->s_bdev; | |
29 | sector_t sector = block << (inode->i_blkbits - 9); | |
30 | ||
31 | might_sleep(); | |
32 | do { | |
33 | void *addr; | |
34 | unsigned long pfn; | |
35 | long count; | |
36 | ||
37 | count = bdev_direct_access(bdev, sector, &addr, &pfn, size); | |
38 | if (count < 0) | |
39 | return count; | |
40 | BUG_ON(size < count); | |
41 | while (count > 0) { | |
42 | unsigned pgsz = PAGE_SIZE - offset_in_page(addr); | |
43 | if (pgsz > count) | |
44 | pgsz = count; | |
45 | if (pgsz < PAGE_SIZE) | |
46 | memset(addr, 0, pgsz); | |
47 | else | |
48 | clear_page(addr); | |
49 | addr += pgsz; | |
50 | size -= pgsz; | |
51 | count -= pgsz; | |
52 | BUG_ON(pgsz & 511); | |
53 | sector += pgsz / 512; | |
54 | cond_resched(); | |
55 | } | |
56 | } while (size); | |
57 | ||
58 | return 0; | |
59 | } | |
60 | EXPORT_SYMBOL_GPL(dax_clear_blocks); | |
61 | ||
d475c634 MW |
62 | static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) |
63 | { | |
64 | unsigned long pfn; | |
65 | sector_t sector = bh->b_blocknr << (blkbits - 9); | |
66 | return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); | |
67 | } | |
68 | ||
69 | static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, | |
70 | loff_t end) | |
71 | { | |
72 | loff_t final = end - pos + first; /* The final byte of the buffer */ | |
73 | ||
74 | if (first > 0) | |
75 | memset(addr, 0, first); | |
76 | if (final < size) | |
77 | memset(addr + final, 0, size - final); | |
78 | } | |
79 | ||
80 | static bool buffer_written(struct buffer_head *bh) | |
81 | { | |
82 | return buffer_mapped(bh) && !buffer_unwritten(bh); | |
83 | } | |
84 | ||
85 | /* | |
86 | * When ext4 encounters a hole, it returns without modifying the buffer_head | |
87 | * which means that we can't trust b_size. To cope with this, we set b_state | |
88 | * to 0 before calling get_block and, if any bit is set, we know we can trust | |
89 | * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is | |
90 | * and would save us time calling get_block repeatedly. | |
91 | */ | |
92 | static bool buffer_size_valid(struct buffer_head *bh) | |
93 | { | |
94 | return bh->b_state != 0; | |
95 | } | |
96 | ||
/*
 * Core DAX I/O loop: walk the file range [start, end), mapping it to
 * device memory one extent at a time via @get_block and copying bytes
 * between @iter and the device.  Reads of holes feed zeroes to the
 * iterator instead of touching the device.
 *
 * @max is the end of the currently usable mapped extent; @bh_max is the
 * end of the (possibly larger) range described by *@bh.  Returns the
 * number of bytes transferred, or a negative errno if none were.
 */
static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
		loff_t start, loff_t end, get_block_t get_block,
		struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;	/* end of the current mapped extent */
	loff_t bh_max = start;	/* end of the range *bh describes */
	void *addr;
	bool hole = false;

	/* Reads must not run past EOF */
	if (rw != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		unsigned len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			/* offset of pos within its block */
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				/* Exhausted *bh: ask the fs for a new mapping */
				bh->b_size = PAGE_ALIGN(end - pos);
				/* zeroed b_state lets buffer_size_valid() work */
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
						   rw == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				/*
				 * Still inside *bh: advance its block number
				 * and shrink its size by the amount consumed.
				 */
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			/* Only reads see holes; writes must have allocated */
			hole = (rw != WRITE) && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				/* Zero the untouched edges of a new block */
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, retval, first, pos,
									end);
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (rw == WRITE)
			len = copy_from_iter(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		/* Iterator exhausted (or faulted): stop */
		if (!len)
			break;

		pos += len;
		addr += len;
	}

	/* Nothing transferred: report the last error (or 0) instead */
	return (pos == start) ? retval : pos - start;
}
169 | ||
/**
 * dax_do_io - Perform I/O to a DAX file
 * @rw: READ to read or WRITE to write
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes.  For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
		struct iov_iter *iter, loff_t pos,
		get_block_t get_block, dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));

	if ((flags & DIO_LOCKING) && (rw == READ)) {
		struct address_space *mapping = inode->i_mapping;
		mutex_lock(&inode->i_mutex);
		/* Flush dirty page cache so the read sees current data */
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			mutex_unlock(&inode->i_mutex);
			goto out;
		}
	}

	/* Protects against truncate */
	atomic_inc(&inode->i_dio_count);

	retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && (rw == READ))
		mutex_unlock(&inode->i_mutex);

	/* Tell the filesystem how much was actually transferred */
	if ((retval > 0) && end_io)
		end_io(iocb, pos, retval, bh.b_private);

	inode_dio_done(inode);
out:
	return retval;
}
EXPORT_SYMBOL_GPL(dax_do_io);