shmem: Implement splice-read
author:    David Howells <dhowells@redhat.com>
           Mon, 22 May 2023 13:49:56 +0000 (14:49 +0100)
committer: Jens Axboe <axboe@kernel.dk>
           Wed, 24 May 2023 14:42:16 +0000 (08:42 -0600)
The new filemap_splice_read() has an implicit expectation via
filemap_get_pages() that ->read_folio() exists if ->readahead() doesn't
fully populate the pagecache of the file it is reading from[1], potentially
leading to a jump to NULL if this doesn't exist.  shmem, however, (and by
extension, tmpfs, ramfs and rootfs), doesn't have ->read_folio().

Work around this by equipping shmem with its own splice-read
implementation, based on filemap_splice_read(), but able to paste in
zero_page when there's a page missing.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Daniel Golle <daniel@makrotopia.org>
cc: Guenter Roeck <groeck7@gmail.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Jens Axboe <axboe@kernel.dk>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: John Hubbard <jhubbard@nvidia.com>
cc: David Hildenbrand <david@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Hugh Dickins <hughd@google.com>
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
Link: https://lore.kernel.org/r/Y+pdHFFTk1TTEBsO@makrotopia.org/
Link: https://lore.kernel.org/r/20230522135018.2742245-10-dhowells@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
mm/shmem.c

index e40a08c5c6d78ac629b52e4956b4cb5c13948b6c..1f504ed982cf0e4327c6f02f333227f115c1b574 100644 (file)
@@ -2731,6 +2731,138 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
        return retval ? retval : error;
 }
 
+static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
+                             struct pipe_buffer *buf)
+{
+       return true;
+}
+
/*
 * Release a zero-page pipe buffer.  Nothing to do: no reference was
 * taken on the shared zero page when the buffer was inserted.
 */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
}
+
+static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+                                   struct pipe_buffer *buf)
+{
+       return false;
+}
+
+static const struct pipe_buf_operations zero_pipe_buf_ops = {
+       .release        = zero_pipe_buf_release,
+       .try_steal      = zero_pipe_buf_try_steal,
+       .get            = zero_pipe_buf_get,
+};
+
+static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
+                                       loff_t fpos, size_t size)
+{
+       size_t offset = fpos & ~PAGE_MASK;
+
+       size = min_t(size_t, size, PAGE_SIZE - offset);
+
+       if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+               struct pipe_buffer *buf = pipe_head_buf(pipe);
+
+               *buf = (struct pipe_buffer) {
+                       .ops    = &zero_pipe_buf_ops,
+                       .page   = ZERO_PAGE(0),
+                       .offset = offset,
+                       .len    = size,
+               };
+               pipe->head++;
+       }
+
+       return size;
+}
+
+static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
+                                     struct pipe_inode_info *pipe,
+                                     size_t len, unsigned int flags)
+{
+       struct inode *inode = file_inode(in);
+       struct address_space *mapping = inode->i_mapping;
+       struct folio *folio = NULL;
+       size_t total_spliced = 0, used, npages, n, part;
+       loff_t isize;
+       int error = 0;
+
+       /* Work out how much data we can actually add into the pipe */
+       used = pipe_occupancy(pipe->head, pipe->tail);
+       npages = max_t(ssize_t, pipe->max_usage - used, 0);
+       len = min_t(size_t, len, npages * PAGE_SIZE);
+
+       do {
+               if (*ppos >= i_size_read(inode))
+                       break;
+
+               error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio, SGP_READ);
+               if (error) {
+                       if (error == -EINVAL)
+                               error = 0;
+                       break;
+               }
+               if (folio) {
+                       folio_unlock(folio);
+
+                       if (folio_test_hwpoison(folio)) {
+                               error = -EIO;
+                               break;
+                       }
+               }
+
+               /*
+                * i_size must be checked after we know the pages are Uptodate.
+                *
+                * Checking i_size after the check allows us to calculate
+                * the correct value for "nr", which means the zero-filled
+                * part of the page is not copied back to userspace (unless
+                * another truncate extends the file - this is desired though).
+                */
+               isize = i_size_read(inode);
+               if (unlikely(*ppos >= isize))
+                       break;
+               part = min_t(loff_t, isize - *ppos, len);
+
+               if (folio) {
+                       /*
+                        * If users can be writing to this page using arbitrary
+                        * virtual addresses, take care about potential aliasing
+                        * before reading the page on the kernel side.
+                        */
+                       if (mapping_writably_mapped(mapping))
+                               flush_dcache_folio(folio);
+                       folio_mark_accessed(folio);
+                       /*
+                        * Ok, we have the page, and it's up-to-date, so we can
+                        * now splice it into the pipe.
+                        */
+                       n = splice_folio_into_pipe(pipe, folio, *ppos, part);
+                       folio_put(folio);
+                       folio = NULL;
+               } else {
+                       n = splice_zeropage_into_pipe(pipe, *ppos, len);
+               }
+
+               if (!n)
+                       break;
+               len -= n;
+               total_spliced += n;
+               *ppos += n;
+               in->f_ra.prev_pos = *ppos;
+               if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+                       break;
+
+               cond_resched();
+       } while (len);
+
+       if (folio)
+               folio_put(folio);
+
+       file_accessed(in);
+       return total_spliced ? total_spliced : error;
+}
+
 static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 {
        struct address_space *mapping = file->f_mapping;
@@ -3971,7 +4103,7 @@ static const struct file_operations shmem_file_operations = {
        .read_iter      = shmem_file_read_iter,
        .write_iter     = generic_file_write_iter,
        .fsync          = noop_fsync,
-       .splice_read    = generic_file_splice_read,
+       .splice_read    = shmem_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = shmem_fallocate,
 #endif