iomap: report collisions between directio and buffered writes to userspace
authorDarrick J. Wong <darrick.wong@oracle.com>
Mon, 8 Jan 2018 18:41:39 +0000 (10:41 -0800)
committerDarrick J. Wong <darrick.wong@oracle.com>
Mon, 8 Jan 2018 18:41:39 +0000 (10:41 -0800)
If two programs simultaneously try to write to the same part of a file
via direct IO and buffered IO, there's a chance that the post-diowrite
pagecache invalidation will fail on the dirty page.  When this happens,
the dio write succeeded, which means that the page cache is no longer
coherent with the disk!

Programs are not supposed to mix IO types and this is a clear case of
data corruption, so store an EIO which will be reflected to userspace
during the next fsync.  Replace the WARN_ON with a ratelimited pr_crit
so that the developers have /some/ kind of breadcrumb to track down the
offending program(s) and file(s) involved.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
fs/direct-io.c
fs/iomap.c
include/linux/fs.h

index 3aafb3343a65c76fb66211228564d73ce9097485..a0ca9e48e9937da671739e24d6d7dc2a4867d7ac 100644 (file)
@@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio,
        return dio->pages[sdio->head];
 }
 
+/*
+ * Warn about a page cache invalidation failure during a direct io write.
+ */
+void dio_warn_stale_pagecache(struct file *filp)
+{
+       static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
+       char pathname[128];
+       struct inode *inode = file_inode(filp);
+       char *path;
+
+       errseq_set(&inode->i_mapping->wb_err, -EIO);
+       if (__ratelimit(&_rs)) {
+               path = file_path(filp, pathname, sizeof(pathname));
+               if (IS_ERR(path))
+                       path = "(unknown)";
+               pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
+               pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
+                       current->comm);
+       }
+}
+
 /**
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
@@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags)
                err = invalidate_inode_pages2_range(dio->inode->i_mapping,
                                        offset >> PAGE_SHIFT,
                                        (offset + ret - 1) >> PAGE_SHIFT);
-               WARN_ON_ONCE(err);
+               if (err)
+                       dio_warn_stale_pagecache(dio->iocb->ki_filp);
        }
 
        if (!(dio->flags & DIO_SKIP_DIO_COUNT))
index 47d29ccffaef4d0f1aeb48c174dc3898344cd9b1..e5de7725f18a7358b2cff497c9babd0a7b65f8e7 100644 (file)
@@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
                err = invalidate_inode_pages2_range(inode->i_mapping,
                                offset >> PAGE_SHIFT,
                                (offset + dio->size - 1) >> PAGE_SHIFT);
-               WARN_ON_ONCE(err);
+               if (err)
+                       dio_warn_stale_pagecache(iocb->ki_filp);
        }
 
        inode_dio_end(file_inode(iocb->ki_filp));
@@ -1018,9 +1019,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (ret)
                goto out_free_dio;
 
+       /*
+        * Try to invalidate cache pages for the range we're direct
+        * writing.  If this invalidation fails, tough, the write will
+        * still work, but racing two incompatible write paths is a
+        * pretty crazy thing to do, so we don't support it 100%.
+        */
        ret = invalidate_inode_pages2_range(mapping,
                        start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-       WARN_ON_ONCE(ret);
+       if (ret)
+               dio_warn_stale_pagecache(iocb->ki_filp);
        ret = 0;
 
        if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
index 511fbaabf6248b67220c16653e491f74e3f046e7..7f8d96d68f34f893d5216ea257da30226d6370a0 100644 (file)
@@ -2992,6 +2992,7 @@ enum {
 };
 
 void dio_end_io(struct bio *bio);
+void dio_warn_stale_pagecache(struct file *filp);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,