Merge branch 'work.aio' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

diff --git a/fs/iomap.c b/fs/iomap.c
index 65aae194aecaa927595a5737dd021d619b79ae9a..7d1e9f45f098c76647914d6bc4a9033e653ae20f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/file.h>
 #include <linux/uio.h>
 #include <linux/backing-dev.h>
@@ -27,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/dax.h>
 #include <linux/sched/signal.h>
+#include <linux/swap.h>
 
 #include "internal.h"
 
@@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
        return written ? written : ret;
 }
 
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+       return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
 static void
 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
 {
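
The new iomap_sector() helper centralises the "(iomap->addr + pos - iomap->offset) >> 9"
conversions that were previously open-coded below.  A quick illustration of the
arithmetic, with invented values (iomap->addr and pos are both byte quantities here):

    /* Example only: extent at file offset 1 MiB, disk byte address 8 MiB. */
    struct iomap map = {
            .offset = 1024 * 1024,          /* byte offset within the file */
            .addr   = 8 * 1024 * 1024,      /* byte address on the bdev    */
    };
    loff_t pos = map.offset + 4096;         /* one page into the extent    */

    /* (8 MiB + 4 KiB) >> SECTOR_SHIFT (9) == 16392 */
    sector_t sector = iomap_sector(&map, pos);
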
@@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
 static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
                struct iomap *iomap)
 {
-       sector_t sector = (iomap->addr +
-                          (pos & PAGE_MASK) - iomap->offset) >> 9;
-
-       return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
-                       offset, bytes);
+       return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+                       iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
 }
 
 static loff_t
@@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
        case IOMAP_DELALLOC:
                flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
                break;
+       case IOMAP_MAPPED:
+               break;
        case IOMAP_UNWRITTEN:
                flags |= FIEMAP_EXTENT_UNWRITTEN;
                break;
-       case IOMAP_MAPPED:
+       case IOMAP_INLINE:
+               flags |= FIEMAP_EXTENT_DATA_INLINE;
                break;
        }
 
@@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
                flags |= FIEMAP_EXTENT_MERGED;
        if (iomap->flags & IOMAP_F_SHARED)
                flags |= FIEMAP_EXTENT_SHARED;
-       if (iomap->flags & IOMAP_F_DATA_INLINE)
-               flags |= FIEMAP_EXTENT_DATA_INLINE;
 
        return fiemap_fill_next_extent(fi, iomap->offset,
                        iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
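
Net effect of the two FIEMAP hunks above: inline data is now keyed off iomap->type
(IOMAP_INLINE) rather than the IOMAP_F_DATA_INLINE flag, with no user-visible change.
For reference, a userspace check for an inline first extent via the standard
FS_IOC_FIEMAP ioctl might look like this (sketch, not part of this patch):

    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    static int first_extent_is_inline(int fd)
    {
            size_t sz = sizeof(struct fiemap) + sizeof(struct fiemap_extent);
            struct fiemap *fm = calloc(1, sz);
            int ret = 0;

            if (!fm)
                    return 0;
            fm->fm_length = ~0ULL;                  /* map the whole file */
            fm->fm_extent_count = 1;
            if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0 && fm->fm_mapped_extents)
                    ret = !!(fm->fm_extents[0].fe_flags &
                             FIEMAP_EXTENT_DATA_INLINE);
            free(fm);
            return ret;
    }
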
@@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
 }
 EXPORT_SYMBOL_GPL(iomap_fiemap);
 
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ * Returns true if found, and updates @lastoff to the offset in the file.
+ */
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+               int whence)
+{
+       const struct address_space_operations *ops = inode->i_mapping->a_ops;
+       unsigned int bsize = i_blocksize(inode), off;
+       bool seek_data = whence == SEEK_DATA;
+       loff_t poff = page_offset(page);
+
+       if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+               return false;
+
+       if (*lastoff < poff) {
+               /*
+                * Last offset smaller than the start of the page means we found
+                * a hole:
+                */
+               if (whence == SEEK_HOLE)
+                       return true;
+               *lastoff = poff;
+       }
+
+       /*
+        * Just check the page unless we can and should check block ranges:
+        */
+       if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+               return PageUptodate(page) == seek_data;
+
+       lock_page(page);
+       if (unlikely(page->mapping != inode->i_mapping))
+               goto out_unlock_not_found;
+
+       for (off = 0; off < PAGE_SIZE; off += bsize) {
+               if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+                       continue;
+               if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+                       unlock_page(page);
+                       return true;
+               }
+               *lastoff = poff + off + bsize;
+       }
+
+out_unlock_not_found:
+       unlock_page(page);
+       return false;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
+ *
+ * Returns the resulting offset on success, or -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+               int whence)
+{
+       pgoff_t index = offset >> PAGE_SHIFT;
+       pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+       loff_t lastoff = offset;
+       struct pagevec pvec;
+
+       if (length <= 0)
+               return -ENOENT;
+
+       pagevec_init(&pvec);
+
+       do {
+               unsigned nr_pages, i;
+
+               nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+                                               end - 1);
+               if (nr_pages == 0)
+                       break;
+
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       if (page_seek_hole_data(inode, page, &lastoff, whence))
+                               goto check_range;
+                       lastoff = page_offset(page) + PAGE_SIZE;
+               }
+               pagevec_release(&pvec);
+       } while (index < end);
+
+       /* If there is no page at lastoff and we are not done, we found a hole. */
+       if (whence != SEEK_HOLE)
+               goto not_found;
+
+check_range:
+       if (lastoff < offset + length)
+               goto out;
+not_found:
+       lastoff = -ENOENT;
+out:
+       pagevec_release(&pvec);
+       return lastoff;
+}
+
+
 static loff_t
 iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
                      void *data, struct iomap *iomap)
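
The two helpers above feed the iomap_seek_hole()/iomap_seek_data() actors that
follow; the observable behaviour is the normal lseek() contract, with uptodate
page-cache content over unwritten extents treated as data.  A minimal userspace
walk of a file's data ranges, for reference (sketch):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static void dump_data_ranges(int fd)
    {
            off_t end = lseek(fd, 0, SEEK_END);
            off_t pos = 0;

            while (pos < end) {
                    off_t data = lseek(fd, pos, SEEK_DATA);
                    off_t hole;

                    if (data < 0)                   /* ENXIO: no more data */
                            break;
                    hole = lseek(fd, data, SEEK_HOLE);
                    printf("data: [%lld, %lld)\n",
                           (long long)data, (long long)hole);
                    pos = hole;
            }
    }
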
@@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
  * Private flags for iomap_dio, must not overlap with the public ones in
  * iomap.h:
  */
+#define IOMAP_DIO_WRITE_FUA    (1 << 28)
+#define IOMAP_DIO_NEED_SYNC    (1 << 29)
 #define IOMAP_DIO_WRITE                (1 << 30)
 #define IOMAP_DIO_DIRTY                (1 << 31)
 
@@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
                        dio_warn_stale_pagecache(iocb->ki_filp);
        }
 
+       /*
+        * If this is a DSYNC write, make sure we push it to stable storage now
+        * that we've written data.
+        */
+       if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+               ret = generic_write_sync(iocb, ret);
+
        inode_dio_end(file_inode(iocb->ki_filp));
        kfree(dio);
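
For context, the deferred generic_write_sync() call only does work when IOCB_DSYNC
is set, and it only forces a full (non-datasync) flush when IOCB_SYNC is also set.
Roughly, paraphrasing the include/linux/fs.h helper of this era (reference sketch
only, not part of this patch):

    static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
    {
            if (iocb->ki_flags & IOCB_DSYNC) {
                    int ret = vfs_fsync_range(iocb->ki_filp,
                                    iocb->ki_pos - count, iocb->ki_pos - 1,
                                    (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
                    if (ret)
                            return ret;
            }
            return count;
    }
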
 
@@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work)
 {
        struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
        struct kiocb *iocb = dio->iocb;
-       bool is_write = (dio->flags & IOMAP_DIO_WRITE);
-       ssize_t ret;
 
-       ret = iomap_dio_complete(dio);
-       if (is_write && ret > 0)
-               ret = generic_write_sync(iocb, ret);
-       iocb->ki_complete(iocb, ret, 0);
+       iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
 }
 
 /*
@@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 
        bio = bio_alloc(GFP_KERNEL, 1);
        bio_set_dev(bio, iomap->bdev);
-       bio->bi_iter.bi_sector =
-               (iomap->addr + pos - iomap->offset) >> 9;
+       bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
        bio->bi_private = dio;
        bio->bi_end_io = iomap_dio_bio_end_io;
 
        get_page(page);
-       if (bio_add_page(bio, page, len, 0) != len)
-               BUG();
+       __bio_add_page(bio, page, len, 0);
        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
 
        atomic_inc(&dio->ref);
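
__bio_add_page() is the recently added cannot-fail variant of bio_add_page(): the bio
was just allocated with room for exactly one vector and the zeroing range fits in a
single page, so the old bio_add_page()-then-BUG() pattern added nothing.  Its
prototype, for background (not part of this patch):

    void __bio_add_page(struct bio *bio, struct page *page,
                        unsigned int len, unsigned int off);
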
@@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
        struct iov_iter iter;
        struct bio *bio;
        bool need_zeroout = false;
+       bool use_fua = false;
        int nr_pages, ret;
        size_t copied = 0;
 
@@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
        case IOMAP_MAPPED:
                if (iomap->flags & IOMAP_F_SHARED)
                        dio->flags |= IOMAP_DIO_COW;
-               if (iomap->flags & IOMAP_F_NEW)
+               if (iomap->flags & IOMAP_F_NEW) {
                        need_zeroout = true;
+               } else {
+                       /*
+                        * Use a FUA write if we need datasync semantics, this
+                        * is a pure data IO that doesn't require any metadata
+                        * updates and the underlying device supports FUA. This
+                        * allows us to avoid cache flushes on IO completion.
+                        */
+                       if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+                           (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+                           blk_queue_fua(bdev_get_queue(iomap->bdev)))
+                               use_fua = true;
+               }
                break;
        default:
                WARN_ON_ONCE(1);
@@ -916,8 +1044,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
                bio = bio_alloc(GFP_KERNEL, nr_pages);
                bio_set_dev(bio, iomap->bdev);
-               bio->bi_iter.bi_sector =
-                       (iomap->addr + pos - iomap->offset) >> 9;
+               bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
                bio->bi_write_hint = dio->iocb->ki_hint;
                bio->bi_ioprio = dio->iocb->ki_ioprio;
                bio->bi_private = dio;
@@ -931,10 +1058,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 
                n = bio->bi_iter.bi_size;
                if (dio->flags & IOMAP_DIO_WRITE) {
-                       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+                       bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+                       if (use_fua)
+                               bio->bi_opf |= REQ_FUA;
+                       else
+                               dio->flags &= ~IOMAP_DIO_WRITE_FUA;
                        task_io_account_write(n);
                } else {
-                       bio_set_op_attrs(bio, REQ_OP_READ, 0);
+                       bio->bi_opf = REQ_OP_READ;
                        if (dio->flags & IOMAP_DIO_DIRTY)
                                bio_set_pages_dirty(bio);
                }
@@ -962,6 +1093,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
        return copied;
 }
 
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not.  This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
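
The writes that can take the FUA path are pure overwrites of already-allocated blocks
submitted with O_DIRECT|O_DSYNC; O_SYNC still implies a full file sync and keeps the
flush.  A userspace sketch of the kind of write this optimises (file name and sizes
are illustrative):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            void *buf;
            int fd = open("datafile", O_WRONLY | O_DIRECT | O_DSYNC);

            if (fd < 0 || posix_memalign(&buf, 4096, 4096))
                    return 1;
            memset(buf, 0xab, 4096);
            /*
             * Aligned overwrite of allocated blocks: no metadata update is
             * needed, so iomap can issue this as a single REQ_FUA write and
             * skip the post-I/O cache flush.
             */
            pwrite(fd, buf, 4096, 0);
            close(fd);
            return 0;
    }
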
@@ -1006,8 +1146,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                if (iter->type == ITER_IOVEC)
                        dio->flags |= IOMAP_DIO_DIRTY;
        } else {
-               dio->flags |= IOMAP_DIO_WRITE;
                flags |= IOMAP_WRITE;
+               dio->flags |= IOMAP_DIO_WRITE;
+
+               /* for data sync or sync, we need sync completion processing */
+               if (iocb->ki_flags & IOCB_DSYNC)
+                       dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+               /*
+                * For datasync only writes, we optimistically try using FUA for
+                * this IO.  Any non-FUA write that occurs will clear this flag,
+                * hence we know before completion whether a cache flush is
+                * necessary.
+                */
+               if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+                       dio->flags |= IOMAP_DIO_WRITE_FUA;
        }
 
        if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -1063,6 +1216,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
        if (ret < 0)
                iomap_dio_set_error(dio, ret);
 
+       /*
+        * If all the writes we issued were FUA, we don't need to flush the
+        * cache on IO completion. Clear the sync flag for this case.
+        */
+       if (dio->flags & IOMAP_DIO_WRITE_FUA)
+               dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
        if (!atomic_dec_and_test(&dio->ref)) {
                if (!is_sync_kiocb(iocb))
                        return -EIOCBQUEUED;
@@ -1090,3 +1250,203 @@ out_free_dio:
        return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+/* Swapfile activation */
+
+#ifdef CONFIG_SWAP
+struct iomap_swapfile_info {
+       struct iomap iomap;             /* accumulated iomap */
+       struct swap_info_struct *sis;
+       uint64_t lowest_ppage;          /* lowest physical addr seen (pages) */
+       uint64_t highest_ppage;         /* highest physical addr seen (pages) */
+       unsigned long nr_pages;         /* number of pages collected */
+       int nr_extents;                 /* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file.  Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary.  The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+       struct iomap *iomap = &isi->iomap;
+       unsigned long nr_pages;
+       uint64_t first_ppage;
+       uint64_t first_ppage_reported;
+       uint64_t next_ppage;
+       int error;
+
+       /*
+        * Round the start up and the end down so that the physical
+        * extent aligns to a page boundary.
+        */
+       first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+       next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+                       PAGE_SHIFT;
+
+       /* Skip too-short physical extents. */
+       if (first_ppage >= next_ppage)
+               return 0;
+       nr_pages = next_ppage - first_ppage;
+
+       /*
+        * Calculate how much swap space we're adding; the first page contains
+        * the swap header and doesn't count.  The mm still wants that first
+        * page fed to add_swap_extent, however.
+        */
+       first_ppage_reported = first_ppage;
+       if (iomap->offset == 0)
+               first_ppage_reported++;
+       if (isi->lowest_ppage > first_ppage_reported)
+               isi->lowest_ppage = first_ppage_reported;
+       if (isi->highest_ppage < (next_ppage - 1))
+               isi->highest_ppage = next_ppage - 1;
+
+       /* Add extent, set up for the next call. */
+       error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+       if (error < 0)
+               return error;
+       isi->nr_extents += error;
+       isi->nr_pages += nr_pages;
+       return 0;
+}
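
A worked example of the trimming (numbers invented, PAGE_SIZE == 4096): an iomap with
addr 0x11200 and length 0x3000 covers physical bytes [0x11200, 0x14200); ALIGN() rounds
the start up to page 0x12 and ALIGN_DOWN() rounds the end down to page 0x14, so only
the two fully covered pages 0x12 and 0x13 are handed to add_swap_extent() and the
unaligned head and tail are dropped.
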
+
+/*
+ * Accumulate iomaps for this swap file.  We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+               loff_t count, void *data, struct iomap *iomap)
+{
+       struct iomap_swapfile_info *isi = data;
+       int error;
+
+       switch (iomap->type) {
+       case IOMAP_MAPPED:
+       case IOMAP_UNWRITTEN:
+               /* Only real or unwritten extents. */
+               break;
+       case IOMAP_INLINE:
+               /* No inline data. */
+               pr_err("swapon: file is inline\n");
+               return -EINVAL;
+       default:
+               pr_err("swapon: file has unallocated extents\n");
+               return -EINVAL;
+       }
+
+       /* No uncommitted metadata or shared blocks. */
+       if (iomap->flags & IOMAP_F_DIRTY) {
+               pr_err("swapon: file is not committed\n");
+               return -EINVAL;
+       }
+       if (iomap->flags & IOMAP_F_SHARED) {
+               pr_err("swapon: file has shared extents\n");
+               return -EINVAL;
+       }
+
+       /* Only one bdev per swap file. */
+       if (iomap->bdev != isi->sis->bdev) {
+               pr_err("swapon: file is on multiple devices\n");
+               return -EINVAL;
+       }
+
+       if (isi->iomap.length == 0) {
+               /* No accumulated extent, so just store it. */
+               memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+       } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+               /* Append this to the accumulated extent. */
+               isi->iomap.length += iomap->length;
+       } else {
+               /* Otherwise, add the retained iomap and store this one. */
+               error = iomap_swapfile_add_extent(isi);
+               if (error)
+                       return error;
+               memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+       }
+       return count;
+}
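
Concretely (invented numbers): a MAPPED extent at addr 0x10000 with length 0x4000
followed by an UNWRITTEN extent at addr 0x14000 with length 0x2000 is merged into one
0x6000-byte physical extent, while a following extent starting at addr 0x30000 would
first flush the accumulated extent via iomap_swapfile_add_extent() and then start a
new accumulation.
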
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+               struct file *swap_file, sector_t *pagespan,
+               const struct iomap_ops *ops)
+{
+       struct iomap_swapfile_info isi = {
+               .sis = sis,
+               .lowest_ppage = (sector_t)-1ULL,
+       };
+       struct address_space *mapping = swap_file->f_mapping;
+       struct inode *inode = mapping->host;
+       loff_t pos = 0;
+       loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+       loff_t ret;
+
+       ret = filemap_write_and_wait(inode->i_mapping);
+       if (ret)
+               return ret;
+
+       while (len > 0) {
+               ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+                               ops, &isi, iomap_swapfile_activate_actor);
+               if (ret <= 0)
+                       return ret;
+
+               pos += ret;
+               len -= ret;
+       }
+
+       if (isi.iomap.length) {
+               ret = iomap_swapfile_add_extent(&isi);
+               if (ret)
+                       return ret;
+       }
+
+       *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+       sis->max = isi.nr_pages;
+       sis->pages = isi.nr_pages - 1;
+       sis->highest_bit = isi.nr_pages - 1;
+       return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
+#endif /* CONFIG_SWAP */
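
For a sense of how a filesystem consumes this, the ->swap_activate side ends up being
a thin wrapper.  The sketch below is modelled on the XFS conversion from the same
series; the XFS names are shown as an example and are not part of this patch:

    static int
    xfs_iomap_swapfile_activate(
            struct swap_info_struct         *sis,
            struct file                     *swap_file,
            sector_t                        *span)
    {
            sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
            return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
    }
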
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+               void *data, struct iomap *iomap)
+{
+       sector_t *bno = data, addr;
+
+       if (iomap->type == IOMAP_MAPPED) {
+               addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+               if (addr > INT_MAX)
+                       WARN(1, "would truncate bmap result\n");
+               else
+                       *bno = addr;
+       }
+       return 0;
+}
+
+/* legacy ->bmap interface.  0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+               const struct iomap_ops *ops)
+{
+       struct inode *inode = mapping->host;
+       loff_t pos = bno << inode->i_blkbits;
+       unsigned blocksize = i_blocksize(inode);
+
+       if (filemap_write_and_wait(mapping))
+               return 0;
+
+       bno = 0;
+       iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+       return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
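
The matching legacy ->bmap hookup on the filesystem side is equally small.  Again this
is modelled on the XFS conversion and is only illustrative of the intended caller:

    static sector_t
    xfs_vm_bmap(
            struct address_space    *mapping,
            sector_t                block)
    {
            struct xfs_inode        *ip = XFS_I(mapping->host);

            /* bmap can't express COW or realtime mappings, so refuse those. */
            if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                    return 0;
            return iomap_bmap(mapping, block, &xfs_iomap_ops);
    }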