xfs: remove i_iolock and use i_rwsem in the VFS inode instead
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f46b2929c64d5236d87ec05bfabe794523c2c598..d054b73b56fbbaee27721f34627823819c6db086 100644
@@ -38,6 +38,7 @@
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
 #include "xfs_iomap.h"
+#include "xfs_reflink.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
-/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       if (type & XFS_IOLOCK_EXCL)
-               inode_lock(VFS_I(ip));
-       xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       xfs_iunlock(ip, type);
-       if (type & XFS_IOLOCK_EXCL)
-               inode_unlock(VFS_I(ip));
-}
-
-static inline void
-xfs_rw_ilock_demote(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       xfs_ilock_demote(ip, type);
-       if (type & XFS_IOLOCK_EXCL)
-               inode_unlock(VFS_I(ip));
-}
-
 /*
  * Clear the specified ranges to zero through either the pagecache or DAX.
  * Holes and unwritten extents will be left as-is as they already are zeroed.
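
With the ordering wrappers above removed, the IO lock is taken through plain xfs_ilock()/xfs_iunlock(). A minimal sketch of what XFS_IOLOCK_* now maps to, assuming the xfs_inode.c side of the patch (not shown in this hunk); the helper name is illustrative and the real code uses the _nested lockdep variants:

/* Sketch only, not part of the patch: the iolock is now the VFS
 * inode's i_rwsem, so there is no separate i_iolock left to order
 * against the old i_mutex. */
static inline void example_take_iolock(struct xfs_inode *ip, int type)
{
	if (type & XFS_IOLOCK_EXCL)
		down_write(&VFS_I(ip)->i_rwsem);
	else if (type & XFS_IOLOCK_SHARED)
		down_read(&VFS_I(ip)->i_rwsem);
}
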
@@ -248,6 +215,7 @@ xfs_file_dio_aio_read(
        struct xfs_inode        *ip = XFS_I(inode);
        loff_t                  isize = i_size_read(inode);
        size_t                  count = iov_iter_count(to);
+       loff_t                  end = iocb->ki_pos + count - 1;
        struct iov_iter         data;
        struct xfs_buftarg      *target;
        ssize_t                 ret = 0;
@@ -271,60 +239,33 @@ xfs_file_dio_aio_read(
 
        file_accessed(iocb->ki_filp);
 
-       /*
-        * Locking is a bit tricky here. If we take an exclusive lock for direct
-        * IO, we effectively serialise all new concurrent read IO to this file
-        * and block it behind IO that is currently in progress because IO in
-        * progress holds the IO lock shared. We only need to hold the lock
-        * exclusive to blow away the page cache, so only take lock exclusively
-        * if the page cache needs invalidation. This allows the normal direct
-        * IO case of no page cache pages to proceed concurrently without
-        * serialisation.
-        */
-       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (mapping->nrpages) {
-               xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-               xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+               ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
+               if (ret)
+                       goto out_unlock;
 
                /*
-                * The generic dio code only flushes the range of the particular
-                * I/O. Because we take an exclusive lock here, this whole
-                * sequence is considerably more expensive for us. This has a
-                * noticeable performance impact for any file with cached pages,
-                * even when outside of the range of the particular I/O.
-                *
-                * Hence, amortize the cost of the lock against a full file
-                * flush and reduce the chances of repeated iolock cycles going
-                * forward.
+                * Invalidate whole pages. This can return an error if we fail
+                * to invalidate a page, but this should never happen on XFS.
+                * Warn if it does fail.
                 */
-               if (mapping->nrpages) {
-                       ret = filemap_write_and_wait(mapping);
-                       if (ret) {
-                               xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-                               return ret;
-                       }
-
-                       /*
-                        * Invalidate whole pages. This can return an error if
-                        * we fail to invalidate a page, but this should never
-                        * happen on XFS. Warn if it does fail.
-                        */
-                       ret = invalidate_inode_pages2(mapping);
-                       WARN_ON_ONCE(ret);
-                       ret = 0;
-               }
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               ret = invalidate_inode_pages2_range(mapping,
+                               iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+               WARN_ON_ONCE(ret);
+               ret = 0;
        }
 
        data = *to;
        ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                        xfs_get_blocks_direct, NULL, NULL, 0);
-       if (ret > 0) {
+       if (ret >= 0) {
                iocb->ki_pos += ret;
                iov_iter_advance(to, ret);
        }
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 
+out_unlock:
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
 
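The read path above also narrows cache-coherency work from the whole file to just the I/O range. A self-contained sketch of that pattern; the helper is illustrative, while filemap_write_and_wait_range() and invalidate_inode_pages2_range() are the real kernel APIs used in the hunk:

/* Write back and invalidate only the pages covering [pos, pos + count),
 * rather than the whole mapping as the old code did. */
static int example_flush_range_for_dio(struct address_space *mapping,
				       loff_t pos, size_t count)
{
	loff_t end = pos + count - 1;
	int ret;

	ret = filemap_write_and_wait_range(mapping, pos, end);
	if (ret)
		return ret;

	/* Takes page indexes, hence the PAGE_SHIFT conversion; partial
	 * pages at the edges are invalidated whole. */
	return invalidate_inode_pages2_range(mapping,
			pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
}
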
@@ -342,9 +283,9 @@ xfs_file_dax_read(
        if (!count)
                return 0; /* skip atime */
 
-       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-       ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops);
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
        file_accessed(iocb->ki_filp);
        return ret;
@@ -360,9 +301,9 @@ xfs_file_buffered_aio_read(
 
        trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
 
-       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
        ret = generic_file_read_iter(iocb, to);
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
        return ret;
 }
@@ -443,15 +384,18 @@ restart:
        if (error <= 0)
                return error;
 
-       error = xfs_break_layouts(inode, iolock, true);
+       error = xfs_break_layouts(inode, iolock);
        if (error)
                return error;
 
-       /* For changing security info in file_remove_privs() we need i_mutex */
+       /*
+        * For changing security info in file_remove_privs() we need i_rwsem
+        * exclusively.
+        */
        if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
-               xfs_rw_iunlock(ip, *iolock);
+               xfs_iunlock(ip, *iolock);
                *iolock = XFS_IOLOCK_EXCL;
-               xfs_rw_ilock(ip, *iolock);
+               xfs_ilock(ip, *iolock);
                goto restart;
        }
        /*
@@ -476,9 +420,9 @@ restart:
                spin_unlock(&ip->i_flags_lock);
                if (!drained_dio) {
                        if (*iolock == XFS_IOLOCK_SHARED) {
-                               xfs_rw_iunlock(ip, *iolock);
+                               xfs_iunlock(ip, *iolock);
                                *iolock = XFS_IOLOCK_EXCL;
-                               xfs_rw_ilock(ip, *iolock);
+                               xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
                        /*
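
Both relock sites in this function share one shape: an rwsem cannot be upgraded atomically, so the shared iolock is dropped, retaken exclusive, and every check is redone from the restart label. An illustrative distillation (the helper is not in the patch):

/* Cycle XFS_IOLOCK_SHARED up to XFS_IOLOCK_EXCL. Because the lock is
 * briefly released, the caller must jump back to "restart" and
 * revalidate everything it had already checked. */
static void example_upgrade_iolock(struct xfs_inode *ip, int *iolock)
{
	xfs_iunlock(ip, *iolock);
	*iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, *iolock);
}
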
@@ -569,71 +513,66 @@ xfs_file_dio_aio_write(
        if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;
 
-       /* "unaligned" here means not aligned to a filesystem block */
-       if ((iocb->ki_pos & mp->m_blockmask) ||
-           ((iocb->ki_pos + count) & mp->m_blockmask))
-               unaligned_io = 1;
-
        /*
-        * We don't need to take an exclusive lock unless the page cache needs
-        * to be invalidated or unaligned IO is being executed. We don't need to
-        * consider the EOF extension case here because
-        * xfs_file_aio_write_checks() will relock the inode as necessary for
-        * EOF zeroing cases and fill out the new inode size as appropriate.
+        * Don't take the exclusive iolock here unless the I/O is unaligned to
+        * the file system block size.  We don't need to consider the EOF
+        * extension case here because xfs_file_aio_write_checks() will relock
+        * the inode as necessary for EOF zeroing cases and fill out the new
+        * inode size as appropriate.
         */
-       if (unaligned_io || mapping->nrpages)
+       if ((iocb->ki_pos & mp->m_blockmask) ||
+           ((iocb->ki_pos + count) & mp->m_blockmask)) {
+               unaligned_io = 1;
                iolock = XFS_IOLOCK_EXCL;
-       else
+       } else {
                iolock = XFS_IOLOCK_SHARED;
-       xfs_rw_ilock(ip, iolock);
-
-       /*
-        * Recheck if there are cached pages that need invalidate after we got
-        * the iolock to protect against other threads adding new pages while
-        * we were waiting for the iolock.
-        */
-       if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
-               xfs_rw_iunlock(ip, iolock);
-               iolock = XFS_IOLOCK_EXCL;
-               xfs_rw_ilock(ip, iolock);
        }
 
+       xfs_ilock(ip, iolock);
+
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
        end = iocb->ki_pos + count - 1;
 
-       /*
-        * See xfs_file_dio_aio_read() for why we do a full-file flush here.
-        */
        if (mapping->nrpages) {
-               ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+               ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
                if (ret)
                        goto out;
+
                /*
                 * Invalidate whole pages. This can return an error if we fail
                 * to invalidate a page, but this should never happen on XFS.
                 * Warn if it does fail.
                 */
-               ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+               ret = invalidate_inode_pages2_range(mapping,
+                               iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
                WARN_ON_ONCE(ret);
                ret = 0;
        }
 
        /*
         * If we are doing unaligned IO, wait for all other IO to drain,
-        * otherwise demote the lock if we had to flush cached pages
+        * otherwise demote the lock if we had to take the exclusive lock
+        * for other reasons in xfs_file_aio_write_checks.
         */
        if (unaligned_io)
                inode_dio_wait(inode);
        else if (iolock == XFS_IOLOCK_EXCL) {
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
+       /* If this is a block-aligned directio CoW, remap immediately. */
+       if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+               ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+               if (ret)
+                       goto out;
+       }
+
        data = *from;
        ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                        xfs_get_blocks_direct, xfs_end_io_direct_write,
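
The reflink hunk above encodes the direct-write CoW policy in two branches; a sketch of the decision as this diff reads, hedged as an interpretation rather than the full implementation:

/* Block-aligned direct writes into shared extents can CoW in place:
 * destination blocks are allocated up front and the dio then targets
 * them. Sub-block writes would need read-modify-write, so (per the
 * xfs_file_write_iter() hunk below) they fall back to buffered I/O
 * when the direct path returns -EREMCHG. */
static bool example_can_cow_inline(struct xfs_inode *ip, bool unaligned_io)
{
	return xfs_is_reflink_inode(ip) && !unaligned_io;
}
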
@@ -651,7 +590,7 @@ xfs_file_dio_aio_write(
                iov_iter_advance(from, ret);
        }
 out:
-       xfs_rw_iunlock(ip, iolock);
+       xfs_iunlock(ip, iolock);
 
        /*
         * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -673,7 +612,7 @@ xfs_file_dax_write(
        size_t                  count;
        loff_t                  pos;
 
-       xfs_rw_ilock(ip, iolock);
+       xfs_ilock(ip, iolock);
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
@@ -682,15 +621,13 @@ xfs_file_dax_write(
        count = iov_iter_count(from);
 
        trace_xfs_file_dax_write(ip, count, pos);
-
-       ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops);
+       ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
        if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
                i_size_write(inode, iocb->ki_pos);
                error = xfs_setfilesize(ip, pos, ret);
        }
-
 out:
-       xfs_rw_iunlock(ip, iolock);
+       xfs_iunlock(ip, iolock);
        return error ? error : ret;
 }
 
@@ -707,7 +644,7 @@ xfs_file_buffered_aio_write(
        int                     enospc = 0;
        int                     iolock = XFS_IOLOCK_EXCL;
 
-       xfs_rw_ilock(ip, iolock);
+       xfs_ilock(ip, iolock);
 
        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
@@ -735,6 +672,9 @@ write_retry:
                enospc = xfs_inode_free_quota_eofblocks(ip);
                if (enospc)
                        goto write_retry;
+               enospc = xfs_inode_free_quota_cowblocks(ip);
+               if (enospc)
+                       goto write_retry;
        } else if (ret == -ENOSPC && !enospc) {
                struct xfs_eofblocks eofb = {0};
 
@@ -748,7 +688,7 @@ write_retry:
 
        current->backing_dev_info = NULL;
 out:
-       xfs_rw_iunlock(ip, iolock);
+       xfs_iunlock(ip, iolock);
        return ret;
 }
 
@@ -774,10 +714,20 @@ xfs_file_write_iter(
 
        if (IS_DAX(inode))
                ret = xfs_file_dax_write(iocb, from);
-       else if (iocb->ki_flags & IOCB_DIRECT)
+       else if (iocb->ki_flags & IOCB_DIRECT) {
+               /*
+                * Allow a directio write to fall back to a buffered
+                * write *only* in the case that we're doing a reflink
+                * CoW.  In all other directio scenarios we do not
+                * allow an operation to fall back to buffered mode.
+                */
                ret = xfs_file_dio_aio_write(iocb, from);
-       else
+               if (ret == -EREMCHG)
+                       goto buffered;
+       } else {
+buffered:
                ret = xfs_file_buffered_aio_write(iocb, from);
+       }
 
        if (ret > 0) {
                XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
@@ -791,7 +741,7 @@ xfs_file_write_iter(
 #define        XFS_FALLOC_FL_SUPPORTED                                         \
                (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |           \
                 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |      \
-                FALLOC_FL_INSERT_RANGE)
+                FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
 
 STATIC long
 xfs_file_fallocate(
@@ -814,7 +764,7 @@ xfs_file_fallocate(
                return -EOPNOTSUPP;
 
        xfs_ilock(ip, iolock);
-       error = xfs_break_layouts(inode, &iolock, false);
+       error = xfs_break_layouts(inode, &iolock);
        if (error)
                goto out_unlock;
 
@@ -881,9 +831,15 @@ xfs_file_fallocate(
 
                if (mode & FALLOC_FL_ZERO_RANGE)
                        error = xfs_zero_file_space(ip, offset, len);
-               else
+               else {
+                       if (mode & FALLOC_FL_UNSHARE_RANGE) {
+                               error = xfs_reflink_unshare(ip, offset, len);
+                               if (error)
+                                       goto out_unlock;
+                       }
                        error = xfs_alloc_file_space(ip, offset, len,
                                                     XFS_BMAPI_PREALLOC);
+               }
                if (error)
                        goto out_unlock;
        }
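
FALLOC_FL_UNSHARE_RANGE is the new user-visible surface in this hunk. A minimal userspace illustration of the flag (error handling elided; per the kernel code above, unsharing is followed by a normal preallocation):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>	/* FALLOC_FL_UNSHARE_RANGE */

/* Replace any reflink-shared blocks in [offset, offset + len) with
 * private copies, so later overwrites cannot fail with ENOSPC. */
static int unshare_example(int fd, off_t offset, off_t len)
{
	return fallocate(fd, FALLOC_FL_UNSHARE_RANGE, offset, len);
}
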
@@ -901,7 +857,7 @@ xfs_file_fallocate(
 
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
-               error = xfs_setattr_size(ip, &iattr);
+               error = xfs_vn_setattr_size(file_dentry(file), &iattr);
                if (error)
                        goto out_unlock;
        }
@@ -920,6 +876,61 @@ out_unlock:
        return error;
 }
 
+STATIC ssize_t
+xfs_file_copy_range(
+       struct file     *file_in,
+       loff_t          pos_in,
+       struct file     *file_out,
+       loff_t          pos_out,
+       size_t          len,
+       unsigned int    flags)
+{
+       int             error;
+
+       error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+                                    len, false);
+       if (error)
+               return error;
+       return len;
+}
+
+STATIC int
+xfs_file_clone_range(
+       struct file     *file_in,
+       loff_t          pos_in,
+       struct file     *file_out,
+       loff_t          pos_out,
+       u64             len)
+{
+       return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
+                                    len, false);
+}
+
+#define XFS_MAX_DEDUPE_LEN     (16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+       struct file     *src_file,
+       u64             loff,
+       u64             len,
+       struct file     *dst_file,
+       u64             dst_loff)
+{
+       int             error;
+
+       /*
+        * Limit the total length we will dedupe for each operation.
+        * This is intended to bound the total time spent in this
+        * ioctl to something sane.
+        */
+       if (len > XFS_MAX_DEDUPE_LEN)
+               len = XFS_MAX_DEDUPE_LEN;
+
+       error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff,
+                                    len, true);
+       if (error)
+               return error;
+       return len;
+}
 
 STATIC int
 xfs_file_open(
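
The three methods added above are reached from userspace through copy_file_range(2), the FICLONERANGE ioctl, and the FIDEDUPERANGE ioctl respectively. A hedged example of the clone path, using the UAPI definitions from linux/fs.h (error handling elided):

#include <sys/ioctl.h>
#include <linux/fs.h>		/* FICLONERANGE, struct file_clone_range */

/* Reflink len bytes from the start of src_fd into the start of dst_fd;
 * the VFS routes this to xfs_file_clone_range() above. */
static int clone_example(int src_fd, int dst_fd, __u64 len)
{
	struct file_clone_range fcr = {
		.src_fd		= src_fd,
		.src_offset	= 0,
		.src_length	= len,	/* 0 would mean "through EOF" */
		.dest_offset	= 0,
	};

	return ioctl(dst_fd, FICLONERANGE, &fcr);
}
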
@@ -1430,7 +1441,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+               ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
        } else {
                ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
                ret = block_page_mkwrite_return(ret);
@@ -1464,7 +1475,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops);
+               ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1501,7 +1512,7 @@ xfs_filemap_pmd_fault(
        }
 
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
+       ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (flags & FAULT_FLAG_WRITE)
@@ -1579,7 +1590,11 @@ const struct file_operations xfs_file_operations = {
        .open           = xfs_file_open,
        .release        = xfs_file_release,
        .fsync          = xfs_file_fsync,
+       .get_unmapped_area = thp_get_unmapped_area,
        .fallocate      = xfs_file_fallocate,
+       .copy_file_range = xfs_file_copy_range,
+       .clone_file_range = xfs_file_clone_range,
+       .dedupe_file_range = xfs_file_dedupe_range,
 };
 
 const struct file_operations xfs_dir_file_operations = {
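
For completeness, a sketch of driving the new dedupe method from userspace. Note that xfs_file_dedupe_range() above clamps each request to XFS_MAX_DEDUPE_LEN (16 MiB), so callers wanting more should loop on bytes_deduped:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIDEDUPERANGE, struct file_dedupe_range */

/* Dedupe [src_off, src_off + len) of src_fd against dst_off of dst_fd.
 * On success, info[0].status and info[0].bytes_deduped report the
 * per-destination outcome. */
static int dedupe_example(int src_fd, __u64 src_off,
			  int dst_fd, __u64 dst_off, __u64 len)
{
	struct file_dedupe_range *req;
	int ret;

	req = calloc(1, sizeof(*req) +
			sizeof(struct file_dedupe_range_info));
	if (!req)
		return -1;

	req->src_offset = src_off;
	req->src_length = len;		/* XFS clamps this to 16 MiB per call */
	req->dest_count = 1;
	req->info[0].dest_fd = dst_fd;
	req->info[0].dest_offset = dst_off;

	ret = ioctl(src_fd, FIDEDUPERANGE, req);
	free(req);
	return ret;
}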