#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
+#include "xfs_reflink.h"
#include <linux/dcache.h>
#include <linux/falloc.h>
static const struct vm_operations_struct xfs_file_vm_ops;
-/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
- struct xfs_inode *ip,
- int type)
-{
- if (type & XFS_IOLOCK_EXCL)
- inode_lock(VFS_I(ip));
- xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
- struct xfs_inode *ip,
- int type)
-{
- xfs_iunlock(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
- struct xfs_inode *ip,
- int type)
-{
- xfs_ilock_demote(ip, type);
- if (type & XFS_IOLOCK_EXCL)
- inode_unlock(VFS_I(ip));
-}
-
/*
* Clear the specified ranges to zero through either the pagecache or DAX.
* Holes and unwritten extents will be left as-is as they already are zeroed.
struct xfs_inode *ip = XFS_I(inode);
loff_t isize = i_size_read(inode);
size_t count = iov_iter_count(to);
+ loff_t end = iocb->ki_pos + count - 1;
struct iov_iter data;
struct xfs_buftarg *target;
ssize_t ret = 0;
file_accessed(iocb->ki_filp);
- /*
- * Locking is a bit tricky here. If we take an exclusive lock for direct
- * IO, we effectively serialise all new concurrent read IO to this file
- * and block it behind IO that is currently in progress because IO in
- * progress holds the IO lock shared. We only need to hold the lock
- * exclusive to blow away the page cache, so only take lock exclusively
- * if the page cache needs invalidation. This allows the normal direct
- * IO case of no page cache pages to proceeed concurrently without
- * serialisation.
- */
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (mapping->nrpages) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
- xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+ if (ret)
+ goto out_unlock;
/*
- * The generic dio code only flushes the range of the particular
- * I/O. Because we take an exclusive lock here, this whole
- * sequence is considerably more expensive for us. This has a
- * noticeable performance impact for any file with cached pages,
- * even when outside of the range of the particular I/O.
- *
- * Hence, amortize the cost of the lock against a full file
- * flush and reduce the chances of repeated iolock cycles going
- * forward.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- if (mapping->nrpages) {
- ret = filemap_write_and_wait(mapping);
- if (ret) {
- xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
- }
-
- /*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
- */
- ret = invalidate_inode_pages2(mapping);
- WARN_ON_ONCE(ret);
- ret = 0;
- }
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+ WARN_ON_ONCE(ret);
+ ret = 0;
}
data = *to;
ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
xfs_get_blocks_direct, NULL, NULL, 0);
- if (ret > 0) {
+ if (ret >= 0) {
iocb->ki_pos += ret;
iov_iter_advance(to, ret);
}
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
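
With the xfs_rw_ilock wrappers gone, the direct read path above holds only the shared iolock and limits the pre-DIO flush/invalidate to the byte range of the I/O. For context, a minimal userspace sketch (not part of this patch) of the kind of request that path services; the 4096-byte alignment is an assumed logical sector size and the file name argument is purely illustrative:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                const size_t len = 4096;        /* assumed logical sector size */
                void *buf;
                ssize_t n;
                int fd;

                if (argc < 2)
                        return 1;
                fd = open(argv[1], O_RDONLY | O_DIRECT);
                if (fd < 0)
                        return 1;
                /* O_DIRECT wants a sector-aligned buffer, length and offset */
                if (posix_memalign(&buf, len, len))
                        return 1;
                n = pread(fd, buf, len, 0);
                if (n < 0)
                        perror("pread");
                free(buf);
                close(fd);
                return n < 0;
        }
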
if (!count)
return 0; /* skip atime */
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
- ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
file_accessed(iocb->ki_filp);
return ret;
trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
- xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
ret = generic_file_read_iter(iocb, to);
- xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+ xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
}
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock, true);
+ error = xfs_break_layouts(inode, iolock);
if (error)
return error;
- /* For changing security info in file_remove_privs() we need i_mutex */
+ /*
+ * For changing security info in file_remove_privs() we need i_rwsem
+ * exclusively.
+ */
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
goto restart;
}
/*
spin_unlock(&ip->i_flags_lock);
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
+ xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
+ xfs_ilock(ip, *iolock);
iov_iter_reexpand(from, count);
}
/*
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- /* "unaligned" here means not aligned to a filesystem block */
- if ((iocb->ki_pos & mp->m_blockmask) ||
- ((iocb->ki_pos + count) & mp->m_blockmask))
- unaligned_io = 1;
-
/*
- * We don't need to take an exclusive lock unless there page cache needs
- * to be invalidated or unaligned IO is being executed. We don't need to
- * consider the EOF extension case here because
- * xfs_file_aio_write_checks() will relock the inode as necessary for
- * EOF zeroing cases and fill out the new inode size as appropriate.
+ * Don't take the exclusive iolock here unless the I/O is unaligned to
+ * the file system block size. We don't need to consider the EOF
+ * extension case here because xfs_file_aio_write_checks() will relock
+ * the inode as necessary for EOF zeroing cases and fill out the new
+ * inode size as appropriate.
*/
- if (unaligned_io || mapping->nrpages)
+ if ((iocb->ki_pos & mp->m_blockmask) ||
+ ((iocb->ki_pos + count) & mp->m_blockmask)) {
+ unaligned_io = 1;
iolock = XFS_IOLOCK_EXCL;
- else
+ } else {
iolock = XFS_IOLOCK_SHARED;
- xfs_rw_ilock(ip, iolock);
-
- /*
- * Recheck if there are cached pages that need invalidate after we got
- * the iolock to protect against other threads adding new pages while
- * we were waiting for the iolock.
- */
- if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, iolock);
- iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
}
+ xfs_ilock(ip, iolock);
+
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
end = iocb->ki_pos + count - 1;
- /*
- * See xfs_file_dio_aio_read() for why we do a full-file flush here.
- */
if (mapping->nrpages) {
- ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
if (ret)
goto out;
+
/*
* Invalidate whole pages. This can return an error if we fail
* to invalidate a page, but this should never happen on XFS.
* Warn if it does fail.
*/
- ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
ret = 0;
}
/*
* If we are doing unaligned IO, wait for all other IO to drain,
- * otherwise demote the lock if we had to flush cached pages
+ * otherwise demote the lock if we had to take the exclusive lock
+ * for other reasons in xfs_file_aio_write_checks.
*/
if (unaligned_io)
inode_dio_wait(inode);
else if (iolock == XFS_IOLOCK_EXCL) {
- xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
+ /* If this is a block-aligned directio CoW, remap immediately. */
+ if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+ ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+ if (ret)
+ goto out;
+ }
+
data = *from;
ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
xfs_get_blocks_direct, xfs_end_io_direct_write,
iov_iter_advance(from, ret);
}
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
/*
* No fallback to buffered IO on errors for XFS, direct IO will either
size_t count;
loff_t pos;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
goto out;
count = iov_iter_count(from);
trace_xfs_file_dax_write(ip, count, pos);
-
- ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+ ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
i_size_write(inode, iocb->ki_pos);
error = xfs_setfilesize(ip, pos, ret);
}
-
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return error ? error : ret;
}
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, iolock);
+ xfs_ilock(ip, iolock);
ret = xfs_file_aio_write_checks(iocb, from, &iolock);
if (ret)
enospc = xfs_inode_free_quota_eofblocks(ip);
if (enospc)
goto write_retry;
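+	/*
+	 * Speculative CoW fork preallocations can pin quota as well; try
+	 * to reclaim them too before giving up on the write.
+	 */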
+ enospc = xfs_inode_free_quota_cowblocks(ip);
+ if (enospc)
+ goto write_retry;
} else if (ret == -ENOSPC && !enospc) {
struct xfs_eofblocks eofb = {0};
current->backing_dev_info = NULL;
out:
- xfs_rw_iunlock(ip, iolock);
+ xfs_iunlock(ip, iolock);
return ret;
}
if (IS_DAX(inode))
ret = xfs_file_dax_write(iocb, from);
- else if (iocb->ki_flags & IOCB_DIRECT)
+ else if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Allow a directio write to fall back to a buffered
+ * write *only* in the case that we're doing a reflink
+ * CoW. In all other directio scenarios we do not
+ * allow an operation to fall back to buffered mode.
+ */
ret = xfs_file_dio_aio_write(iocb, from);
- else
+ if (ret == -EREMCHG)
+ goto buffered;
+ } else {
+buffered:
ret = xfs_file_buffered_aio_write(iocb, from);
+ }
if (ret > 0) {
XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
- FALLOC_FL_INSERT_RANGE)
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
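
The supported fallocate mode mask now advertises FALLOC_FL_UNSHARE_RANGE. As a rough illustration only (not from this patch), a caller could request un-sharing of a reflinked range like this; the offset and length are made-up example values:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <linux/falloc.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                int fd, ret;

                if (argc < 2)
                        return 1;
                fd = open(argv[1], O_RDWR);
                if (fd < 0)
                        return 1;
                /* Pre-CoW (un-share) the first 1 MiB of a reflinked file */
                ret = fallocate(fd, FALLOC_FL_UNSHARE_RANGE, 0, 1024 * 1024);
                if (ret)
                        perror("fallocate");
                close(fd);
                return ret != 0;
        }
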
STATIC long
xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock, false);
+ error = xfs_break_layouts(inode, &iolock);
if (error)
goto out_unlock;
if (mode & FALLOC_FL_ZERO_RANGE)
error = xfs_zero_file_space(ip, offset, len);
- else
+ else {
+ if (mode & FALLOC_FL_UNSHARE_RANGE) {
+ error = xfs_reflink_unshare(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
+ }
if (error)
goto out_unlock;
}
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_setattr_size(ip, &iattr);
+ error = xfs_vn_setattr_size(file_dentry(file), &iattr);
if (error)
goto out_unlock;
}
return error;
}
+STATIC ssize_t
+xfs_file_copy_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ size_t len,
+ unsigned int flags)
+{
+ int error;
+
+ error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+ if (error)
+ return error;
+ return len;
+}
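
->copy_file_range() is the hook behind the copy_file_range(2) syscall; with this patch it is implemented as a reflink remap. A rough userspace sketch of driving it follows (needs a glibc that ships the wrapper, roughly 2.27+); the 64 KiB length and NULL offsets are illustrative choices, not requirements of the patch:

        #define _GNU_SOURCE
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                ssize_t copied;
                int fd_in, fd_out;

                if (argc < 3)
                        return 1;
                fd_in = open(argv[1], O_RDONLY);
                fd_out = open(argv[2], O_WRONLY | O_CREAT, 0644);
                if (fd_in < 0 || fd_out < 0)
                        return 1;
                /* NULL offsets: read from and write at each file's position */
                copied = copy_file_range(fd_in, NULL, fd_out, NULL, 65536, 0);
                if (copied < 0)
                        perror("copy_file_range");
                close(fd_in);
                close(fd_out);
                return copied < 0;
        }
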
+
+STATIC int
+xfs_file_clone_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len)
+{
+ return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+ len, false);
+}
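
->clone_file_range() is reached through the FICLONE/FICLONERANGE ioctls. The sketch below is illustrative only and not part of the patch; it assumes the uapi definitions from <linux/fs.h> and clones the whole source file into the destination:

        #include <fcntl.h>
        #include <linux/fs.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                struct file_clone_range args;
                int src, dst, ret;

                if (argc < 3)
                        return 1;
                src = open(argv[1], O_RDONLY);
                dst = open(argv[2], O_RDWR | O_CREAT, 0644);
                if (src < 0 || dst < 0)
                        return 1;
                memset(&args, 0, sizeof(args));
                args.src_fd = src;
                args.src_offset = 0;
                args.src_length = 0;            /* 0 means clone to EOF */
                args.dest_offset = 0;
                /* The ioctl is issued on the destination file descriptor */
                ret = ioctl(dst, FICLONERANGE, &args);
                if (ret < 0)
                        perror("FICLONERANGE");
                close(src);
                close(dst);
                return ret < 0;
        }
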
+
+#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+ struct file *src_file,
+ u64 loff,
+ u64 len,
+ struct file *dst_file,
+ u64 dst_loff)
+{
+ int error;
+
+ /*
+ * Limit the total length we will dedupe for each operation.
+ * This is intended to bound the total time spent in this
+ * ioctl to something sane.
+ */
+ if (len > XFS_MAX_DEDUPE_LEN)
+ len = XFS_MAX_DEDUPE_LEN;
+
+ error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+ len, true);
+ if (error)
+ return error;
+ return len;
+}
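
->dedupe_file_range() is driven by the FIDEDUPERANGE ioctl, which is issued on the source file and carries an array of destination records. An illustrative sketch, not part of this patch, with a single destination and an example length comfortably below XFS_MAX_DEDUPE_LEN:

        #include <fcntl.h>
        #include <linux/fs.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <sys/ioctl.h>
        #include <unistd.h>

        int main(int argc, char **argv)
        {
                struct file_dedupe_range *req;
                int src, dst, ret;

                if (argc < 3)
                        return 1;
                src = open(argv[1], O_RDONLY);
                dst = open(argv[2], O_RDWR);
                if (src < 0 || dst < 0)
                        return 1;
                /* One destination record follows the fixed-size header */
                req = calloc(1, sizeof(*req) + sizeof(struct file_dedupe_range_info));
                if (!req)
                        return 1;
                req->src_offset = 0;
                req->src_length = 128 * 1024;   /* example length */
                req->dest_count = 1;
                req->info[0].dest_fd = dst;
                req->info[0].dest_offset = 0;
                /* Issued on the source fd; destinations are described in info[] */
                ret = ioctl(src, FIDEDUPERANGE, req);
                if (ret < 0)
                        perror("FIDEDUPERANGE");
                else if (req->info[0].status == FILE_DEDUPE_RANGE_SAME)
                        printf("deduped %llu bytes\n",
                               (unsigned long long)req->info[0].bytes_deduped);
                free(req);
                close(src);
                close(dst);
                return ret < 0;
        }
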
STATIC int
xfs_file_open(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
} else {
ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
ret = block_page_mkwrite_return(ret);
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+ ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
}
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+ ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (flags & FAULT_FLAG_WRITE)
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
+ .get_unmapped_area = thp_get_unmapped_area,
.fallocate = xfs_file_fallocate,
+ .copy_file_range = xfs_file_copy_range,
+ .clone_file_range = xfs_file_clone_range,
+ .dedupe_file_range = xfs_file_dedupe_range,
};
const struct file_operations xfs_dir_file_operations = {