2 * "splice": joining two ropes together by interweaving their strands.
4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other.
8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer.
11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc., and
13 * to fix lots of bugs.
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
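/*
 * A minimal userspace sketch (not part of this file) of how the splice()
 * syscall implemented below is typically used, assuming the glibc
 * splice(2) wrapper is available. It copies an already-open file to
 * stdout through a pipe, so the data never has to pass through a
 * userspace buffer:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int copy_to_stdout(int fd)
 *	{
 *		int pfd[2];
 *		ssize_t n;
 *
 *		if (pipe(pfd) < 0)
 *			return -1;
 *		while ((n = splice(fd, NULL, pfd[1], NULL, 65536, 0)) > 0) {
 *			while (n > 0) {
 *				ssize_t out = splice(pfd[0], NULL,
 *						     STDOUT_FILENO, NULL,
 *						     n, 0);
 *				if (out <= 0)
 *					return -1;
 *				n -= out;
 *			}
 *		}
 *		return n < 0 ? -1 : 0;
 *	}
 */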
21 #include <linux/file.h>
22 #include <linux/pagemap.h>
23 #include <linux/splice.h>
24 #include <linux/mm_inline.h>
25 #include <linux/swap.h>
26 #include <linux/writeback.h>
27 #include <linux/buffer_head.h>
28 #include <linux/module.h>
29 #include <linux/syscalls.h>
30 #include <linux/uio.h>
33 * Attempt to steal a page from a pipe buffer. This should perhaps go into
34 * a vm helper function; it's already simplified quite a bit by the
35 * addition of remove_mapping(). If success is returned, the caller may
36 * attempt to reuse this page for another destination.
38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
39 struct pipe_buffer *buf)
41 struct page *page = buf->page;
42 struct address_space *mapping;
46 mapping = page_mapping(page);
48 WARN_ON(!PageUptodate(page));
51 * At least for ext2 with the nobh option, we need to wait on
52 * writeback completing on this page, since we'll remove it
53 * from the pagecache. Otherwise truncate won't wait on the
54 * page, allowing the disk blocks to be reused by someone else
55 * before we actually wrote our data to them. fs corruption ensues.
58 wait_on_page_writeback(page);
60 if (PagePrivate(page))
61 try_to_release_page(page, GFP_KERNEL);
64 * If we succeeded in removing the mapping, set LRU flag
67 if (remove_mapping(mapping, page)) {
68 buf->flags |= PIPE_BUF_FLAG_LRU;
74 * Raced with truncate or failed to remove page from current
75 * address space, unlock and return failure.
81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
82 struct pipe_buffer *buf)
84 page_cache_release(buf->page);
85 buf->flags &= ~PIPE_BUF_FLAG_LRU;
88 static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
89 struct pipe_buffer *buf)
91 struct page *page = buf->page;
94 if (!PageUptodate(page)) {
98 * Page got truncated/unhashed. This will cause a 0-byte
99 * splice, if this is the first page.
101 if (!page->mapping) {
107 * Uh oh, read-error from disk.
109 if (!PageUptodate(page)) {
115 * Page is OK after all, we are done.
126 static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
128 .map = generic_pipe_buf_map,
129 .unmap = generic_pipe_buf_unmap,
130 .pin = page_cache_pipe_buf_pin,
131 .release = page_cache_pipe_buf_release,
132 .steal = page_cache_pipe_buf_steal,
133 .get = generic_pipe_buf_get,
136 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
137 struct pipe_buffer *buf)
139 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
142 buf->flags |= PIPE_BUF_FLAG_LRU;
143 return generic_pipe_buf_steal(pipe, buf);
146 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
148 .map = generic_pipe_buf_map,
149 .unmap = generic_pipe_buf_unmap,
150 .pin = generic_pipe_buf_pin,
151 .release = page_cache_pipe_buf_release,
152 .steal = user_page_pipe_buf_steal,
153 .get = generic_pipe_buf_get,
157 * Pipe output worker. This fills a pipe with the information contained
158 * in the splice_pipe_desc().
160 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
161 struct splice_pipe_desc *spd)
163 unsigned int spd_pages = spd->nr_pages;
164 int ret, do_wakeup, page_nr;
171 mutex_lock(&pipe->inode->i_mutex);
174 if (!pipe->readers) {
175 send_sig(SIGPIPE, current, 0);
181 if (pipe->nrbufs < PIPE_BUFFERS) {
182 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
183 struct pipe_buffer *buf = pipe->bufs + newbuf;
185 buf->page = spd->pages[page_nr];
186 buf->offset = spd->partial[page_nr].offset;
187 buf->len = spd->partial[page_nr].len;
189 if (spd->flags & SPLICE_F_GIFT)
190 buf->flags |= PIPE_BUF_FLAG_GIFT;
199 if (!--spd->nr_pages)
201 if (pipe->nrbufs < PIPE_BUFFERS)
207 if (spd->flags & SPLICE_F_NONBLOCK) {
213 if (signal_pending(current)) {
221 if (waitqueue_active(&pipe->wait))
222 wake_up_interruptible_sync(&pipe->wait);
223 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
227 pipe->waiting_writers++;
229 pipe->waiting_writers--;
233 mutex_unlock(&pipe->inode->i_mutex);
237 if (waitqueue_active(&pipe->wait))
238 wake_up_interruptible(&pipe->wait);
239 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
243 while (page_nr < spd_pages)
244 page_cache_release(spd->pages[page_nr++]);
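/*
 * A condensed sketch of how the callers in this file drive
 * splice_to_pipe(). The caller fills the pages[]/partial[] arrays and
 * holds a reference on every page; splice_to_pipe() consumes the
 * references for the pages it queues and drops the rest itself (the
 * loop just above). The page shown here is hypothetical:
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.flags		= 0,
 *		.ops		= &page_cache_pipe_buf_ops,
 *	};
 *	ssize_t ret;
 *
 *	pages[0] = some_page_with_a_reference;
 *	partial[0].offset = 0;
 *	partial[0].len = PAGE_CACHE_SIZE;
 *	spd.nr_pages = 1;
 *	ret = splice_to_pipe(pipe, &spd);
 */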
250 __generic_file_splice_read(struct file *in, loff_t *ppos,
251 struct pipe_inode_info *pipe, size_t len,
254 struct address_space *mapping = in->f_mapping;
255 unsigned int loff, nr_pages;
256 struct page *pages[PIPE_BUFFERS];
257 struct partial_page partial[PIPE_BUFFERS];
259 pgoff_t index, end_index;
262 struct splice_pipe_desc spd = {
266 .ops = &page_cache_pipe_buf_ops,
269 index = *ppos >> PAGE_CACHE_SHIFT;
270 loff = *ppos & ~PAGE_CACHE_MASK;
271 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
273 if (nr_pages > PIPE_BUFFERS)
274 nr_pages = PIPE_BUFFERS;
277 * Don't try to second-guess the read-ahead logic; call into
278 * page_cache_readahead() like regular page cache reads would.
280 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
283 * Now fill in the holes:
288 * Look up the (hopefully) full range of pages we need.
290 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
293 * If find_get_pages_contig() returned fewer pages than we needed, allocate the rest and fill in the holes.
296 index += spd.nr_pages;
297 while (spd.nr_pages < nr_pages) {
299 * Page could be there, find_get_pages_contig() breaks on the first hole.
302 page = find_get_page(mapping, index);
305 * Make sure the read-ahead engine is notified
306 * about this failure.
308 handle_ra_miss(mapping, &in->f_ra, index);
311 * page didn't exist, allocate one.
313 page = page_cache_alloc_cold(mapping);
317 error = add_to_page_cache_lru(page, mapping, index,
319 if (unlikely(error)) {
320 page_cache_release(page);
321 if (error == -EEXIST)
326 * add_to_page_cache() locks the page, unlock it
327 * to avoid convoluting the logic below even more.
332 pages[spd.nr_pages++] = page;
337 * Now loop over the map and see if we need to start IO on any
338 * pages, fill in the partial map, etc.
340 index = *ppos >> PAGE_CACHE_SHIFT;
341 nr_pages = spd.nr_pages;
343 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
344 unsigned int this_len;
350 * this_len is the max we'll use from this page
352 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
353 page = pages[page_nr];
356 * If the page isn't uptodate, we may need to start io on it
358 if (!PageUptodate(page)) {
360 * If in nonblock mode then don't block waiting
361 * for an in-flight I/O page
363 if (flags & SPLICE_F_NONBLOCK) {
364 if (TestSetPageLocked(page))
370 * page was truncated, stop here. if this isn't the
371 * first page, we'll just complete what we already managed.
374 if (!page->mapping) {
379 * page was already under io and is now done, great
381 if (PageUptodate(page)) {
387 * need to read in the page
389 error = mapping->a_ops->readpage(in, page);
390 if (unlikely(error)) {
392 * We really should re-lookup the page here,
393 * but it complicates things a lot. Instead
394 * let's just do what we already stored, and
395 * we'll get it the next time we are called.
397 if (error == AOP_TRUNCATED_PAGE)
405 * i_size must be checked after PageUptodate.
407 isize = i_size_read(mapping->host);
408 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
409 if (unlikely(!isize || index > end_index))
413 * if this is the last page, see if we need to shrink
414 * the length and stop
416 if (end_index == index) {
420 * max good bytes in this page
422 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
427 * force quit after adding this page
429 this_len = min(this_len, plen - loff);
433 partial[page_nr].offset = loff;
434 partial[page_nr].len = this_len;
442 * Release any pages at the end, if we quit early. 'page_nr' is how far
443 * we got, 'nr_pages' is how many pages are in the map.
445 while (page_nr < nr_pages)
446 page_cache_release(pages[page_nr++]);
449 return splice_to_pipe(pipe, &spd);
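/*
 * A worked example of the last-page clamp above, assuming a 4096-byte
 * PAGE_CACHE_SIZE: for isize = 10000 the file spans pages 0..2, so
 * end_index = (10000 - 1) >> PAGE_CACHE_SHIFT = 2. On that last page
 * plen = ((10000 - 1) & ~PAGE_CACHE_MASK) + 1 = 1808, which matches
 * 10000 - 2 * 4096, and this_len is clamped to plen - loff so the
 * splice never hands out bytes beyond EOF.
 */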
455 * generic_file_splice_read - splice data from file to a pipe
456 * @in: file to splice from
457 * @pipe: pipe to splice to
458 * @len: number of bytes to splice
459 * @flags: splice modifier flags
461 * Will read pages from the given file and fill them into a pipe.
463 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
464 struct pipe_inode_info *pipe, size_t len,
471 isize = i_size_read(in->f_mapping->host);
472 if (unlikely(*ppos >= isize))
475 left = isize - *ppos;
476 if (unlikely(left < len))
482 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
489 if (flags & SPLICE_F_NONBLOCK) {
506 EXPORT_SYMBOL(generic_file_splice_read);
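/*
 * A minimal sketch of how a filesystem is expected to hook these
 * helpers up; the struct name here is hypothetical, but e.g. ext2
 * wires its regular-file file_operations this way:
 *
 *	const struct file_operations example_file_operations = {
 *		.read		= do_sync_read,
 *		.aio_read	= generic_file_aio_read,
 *		.mmap		= generic_file_mmap,
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */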
509 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
510 * using sendpage(). Return the number of bytes sent.
512 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
513 struct pipe_buffer *buf, struct splice_desc *sd)
515 struct file *file = sd->u.file;
516 loff_t pos = sd->pos;
519 ret = buf->ops->pin(pipe, buf);
521 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
523 ret = file->f_op->sendpage(file, buf->page, buf->offset,
524 sd->len, &pos, more);
531 * This is a little more tricky than the file -> pipe splicing. There are
532 * basically three cases:
534 * - Destination page already exists in the address space and there
535 * are users of it. For that case we have no other option than
536 * copying the data. Tough luck.
537 * - Destination page already exists in the address space, but there
538 * are no users of it. Make sure it's uptodate, then drop it. Fall
539 * through to last case.
540 * - Destination page does not exist, we can add the pipe page to
541 * the page cache and avoid the copy.
543 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
544 * sd->flags), we attempt to migrate pages from the pipe to the output
545 * file address space page cache. This is possible if no one else has
546 * the pipe page referenced outside of the pipe and page cache. If
547 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
548 * a new page in the output file page cache and fill/dirty that.
550 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
551 struct splice_desc *sd)
553 struct file *file = sd->u.file;
554 struct address_space *mapping = file->f_mapping;
555 unsigned int offset, this_len;
561 * make sure the data in this buffer is uptodate
563 ret = buf->ops->pin(pipe, buf);
567 index = sd->pos >> PAGE_CACHE_SHIFT;
568 offset = sd->pos & ~PAGE_CACHE_MASK;
571 if (this_len + offset > PAGE_CACHE_SIZE)
572 this_len = PAGE_CACHE_SIZE - offset;
575 page = find_lock_page(mapping, index);
578 page = page_cache_alloc_cold(mapping);
583 * This will also lock the page
585 ret = add_to_page_cache_lru(page, mapping, index,
591 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
593 loff_t isize = i_size_read(mapping->host);
595 if (ret != AOP_TRUNCATED_PAGE)
597 page_cache_release(page);
598 if (ret == AOP_TRUNCATED_PAGE)
602 * prepare_write() may have instantiated a few blocks
603 * outside i_size. Trim these off again.
605 if (sd->pos + this_len > isize)
606 vmtruncate(mapping->host, isize);
611 if (buf->page != page) {
613 * Careful, ->map() uses KM_USER0!
615 char *src = buf->ops->map(pipe, buf, 1);
616 char *dst = kmap_atomic(page, KM_USER1);
618 memcpy(dst + offset, src + buf->offset, this_len);
619 flush_dcache_page(page);
620 kunmap_atomic(dst, KM_USER1);
621 buf->ops->unmap(pipe, buf, src);
624 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
626 if (ret == AOP_TRUNCATED_PAGE) {
627 page_cache_release(page);
633 * A partial write has happened, so 'ret' is already initialized to the
634 * number of bytes written; there is nothing more we have to do here.
639 * Return the number of bytes written and mark page as
640 * accessed, we are now done!
642 mark_page_accessed(page);
644 page_cache_release(page);
651 * Pipe input worker. Most of this logic works like a regular pipe; the
652 * key here is the 'actor' worker passed in, which actually moves the data
653 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
655 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
658 int ret, do_wakeup, err;
665 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
666 const struct pipe_buf_operations *ops = buf->ops;
669 if (sd->len > sd->total_len)
670 sd->len = sd->total_len;
672 err = actor(pipe, buf, sd);
674 if (!ret && err != -ENODATA)
686 sd->total_len -= err;
692 ops->release(pipe, buf);
693 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
707 if (!pipe->waiting_writers) {
712 if (sd->flags & SPLICE_F_NONBLOCK) {
718 if (signal_pending(current)) {
726 if (waitqueue_active(&pipe->wait))
727 wake_up_interruptible_sync(&pipe->wait);
728 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
737 if (waitqueue_active(&pipe->wait))
738 wake_up_interruptible(&pipe->wait);
739 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
744 EXPORT_SYMBOL(__splice_from_pipe);
746 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
747 loff_t *ppos, size_t len, unsigned int flags,
751 struct inode *inode = out->f_mapping->host;
752 struct splice_desc sd = {
760 * The actor worker might be calling ->prepare_write and
761 * ->commit_write. Most of the time, these expect i_mutex to
762 * be held. Since this may result in an ABBA deadlock with
763 * pipe->inode, we have to order lock acquiry here.
765 inode_double_lock(inode, pipe->inode);
766 ret = __splice_from_pipe(pipe, &sd, actor);
767 inode_double_unlock(inode, pipe->inode);
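/*
 * A hedged sketch of how a consumer with its own destination could
 * reuse splice_from_pipe() with a custom actor; the example_* names
 * are hypothetical, but pipe_to_file/pipe_to_sendpage above follow
 * exactly this pattern:
 *
 *	static int example_actor(struct pipe_inode_info *pipe,
 *				 struct pipe_buffer *buf,
 *				 struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->pin(pipe, buf);
 *
 *		if (unlikely(ret))
 *			return ret;
 *		... consume sd->len bytes at buf->offset in buf->page ...
 *		return sd->len;
 *	}
 *
 *	static ssize_t example_splice_write(struct pipe_inode_info *pipe,
 *					    struct file *out, loff_t *ppos,
 *					    size_t len, unsigned int flags)
 *	{
 *		return splice_from_pipe(pipe, out, ppos, len, flags,
 *					example_actor);
 *	}
 */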
773 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
775 * @out: file to write to
776 * @len: number of bytes to splice
777 * @flags: splice modifier flags
779 * Will either move or copy pages (determined by @flags options) from
780 * the given pipe inode to the given file. The caller is responsible
781 * for acquiring i_mutex on both inodes.
785 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
786 loff_t *ppos, size_t len, unsigned int flags)
788 struct address_space *mapping = out->f_mapping;
789 struct inode *inode = mapping->host;
790 struct splice_desc sd = {
799 err = remove_suid(out->f_path.dentry);
803 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
805 unsigned long nr_pages;
808 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
811 * If file or inode is SYNC and we actually wrote some data,
814 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
815 err = generic_osync_inode(inode, mapping,
816 OSYNC_METADATA|OSYNC_DATA);
821 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
827 EXPORT_SYMBOL(generic_file_splice_write_nolock);
830 * generic_file_splice_write - splice data from a pipe to a file
832 * @out: file to write to
833 * @len: number of bytes to splice
834 * @flags: splice modifier flags
836 * Will either move or copy pages (determined by @flags options) from
837 * the given pipe inode to the given file.
841 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
842 loff_t *ppos, size_t len, unsigned int flags)
844 struct address_space *mapping = out->f_mapping;
845 struct inode *inode = mapping->host;
849 err = should_remove_suid(out->f_path.dentry);
851 mutex_lock(&inode->i_mutex);
852 err = __remove_suid(out->f_path.dentry, err);
853 mutex_unlock(&inode->i_mutex);
858 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
860 unsigned long nr_pages;
863 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
866 * If file or inode is SYNC and we actually wrote some data,
869 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
870 mutex_lock(&inode->i_mutex);
871 err = generic_osync_inode(inode, mapping,
872 OSYNC_METADATA|OSYNC_DATA);
873 mutex_unlock(&inode->i_mutex);
878 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
884 EXPORT_SYMBOL(generic_file_splice_write);
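/*
 * A userspace sketch (assuming the glibc splice(2) wrapper) of the
 * pipe -> file direction handled above. Passing a non-NULL output
 * offset makes do_splice() below copy the updated position back to
 * userspace instead of moving the file's f_pos:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	ssize_t write_pipe_to_file_at(int pipefd, int filefd, loff_t off,
 *				      size_t len)
 *	{
 *		return splice(pipefd, NULL, filefd, &off, len, 0);
 *	}
 */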
887 * generic_splice_sendpage - splice data from a pipe to a socket
889 * @out: socket to write to
890 * @len: number of bytes to splice
891 * @flags: splice modifier flags
893 * Will send @len bytes from the pipe to a network socket. No data copying is involved.
897 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
898 loff_t *ppos, size_t len, unsigned int flags)
900 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
903 EXPORT_SYMBOL(generic_splice_sendpage);
906 * Attempt to initiate a splice from pipe to file.
908 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
909 loff_t *ppos, size_t len, unsigned int flags)
913 if (unlikely(!out->f_op || !out->f_op->splice_write))
916 if (unlikely(!(out->f_mode & FMODE_WRITE)))
919 ret = rw_verify_area(WRITE, out, ppos, len);
920 if (unlikely(ret < 0))
923 return out->f_op->splice_write(pipe, out, ppos, len, flags);
927 * Attempt to initiate a splice from a file to a pipe.
929 static long do_splice_to(struct file *in, loff_t *ppos,
930 struct pipe_inode_info *pipe, size_t len,
935 if (unlikely(!in->f_op || !in->f_op->splice_read))
938 if (unlikely(!(in->f_mode & FMODE_READ)))
941 ret = rw_verify_area(READ, in, ppos, len);
942 if (unlikely(ret < 0))
945 return in->f_op->splice_read(in, ppos, pipe, len, flags);
949 * Splices from an input file to an actor, using a 'direct' pipe.
951 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
952 splice_direct_actor *actor)
954 struct pipe_inode_info *pipe;
961 * We require the input to be a regular file, as we don't want to
962 * randomly drop data for e.g. socket -> socket splicing. Use the
963 * piped splicing for that!
965 i_mode = in->f_path.dentry->d_inode->i_mode;
966 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
970 * neither in nor out is a pipe, set up an internal pipe attached to
971 * 'out' and transfer the wanted data from 'in' to 'out' through that
973 pipe = current->splice_pipe;
974 if (unlikely(!pipe)) {
975 pipe = alloc_pipe_info(NULL);
980 * We don't have an immediate reader, but we'll read the stuff
981 * out of the pipe right after the splice_to_pipe(). So set
982 * PIPE_READERS appropriately.
986 current->splice_pipe = pipe;
998 * Don't block on output, we have to drain the direct pipe.
1000 sd->flags &= ~SPLICE_F_NONBLOCK;
1003 size_t read_len, max_read_len;
1006 * Do at most PIPE_BUFFERS pages worth of transfer:
1008 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
1010 ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
1011 if (unlikely(ret < 0))
1015 sd->total_len = read_len;
1018 * NOTE: nonblocking mode only applies to the input. We
1019 * must not do the output in nonblocking mode as then we
1020 * could get stuck data in the internal pipe:
1022 ret = actor(pipe, sd);
1023 if (unlikely(ret < 0))
1030 * In nonblocking mode, if we got back a short read then
1031 * that was due to either an IO error or due to the
1032 * pagecache entry not being there. In the IO error case
1033 * the _next_ splice attempt will produce a clean IO error
1034 * return value (not a short read), so in both cases it's
1035 * correct to break out of the loop here:
1037 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1041 pipe->nrbufs = pipe->curbuf = 0;
1047 * If we did an incomplete transfer we must release
1048 * the pipe buffers in question:
1050 for (i = 0; i < PIPE_BUFFERS; i++) {
1051 struct pipe_buffer *buf = pipe->bufs + i;
1054 buf->ops->release(pipe, buf);
1058 pipe->nrbufs = pipe->curbuf = 0;
1061 * If we transferred some data, return the number of bytes:
1069 EXPORT_SYMBOL(splice_direct_to_actor);
1071 static int direct_splice_actor(struct pipe_inode_info *pipe,
1072 struct splice_desc *sd)
1074 struct file *file = sd->u.file;
1076 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1079 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1080 size_t len, unsigned int flags)
1082 struct splice_desc sd = {
1091 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
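/*
 * A hedged sketch of the kind of caller this is meant for; in kernels
 * of this era the in-kernel sendfile(2) path is built on top of
 * do_splice_direct(), roughly like this (names hypothetical):
 *
 *	static ssize_t example_sendfile(struct file *in, loff_t *ppos,
 *					struct file *out, size_t count)
 *	{
 *		return do_splice_direct(in, ppos, out, count, 0);
 *	}
 */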
1097 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1098 * location, so checking ->i_pipe is not enough to verify that this is a pipe.
1101 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1103 if (S_ISFIFO(inode->i_mode))
1104 return inode->i_pipe;
1110 * Determine where to splice to/from.
1112 static long do_splice(struct file *in, loff_t __user *off_in,
1113 struct file *out, loff_t __user *off_out,
1114 size_t len, unsigned int flags)
1116 struct pipe_inode_info *pipe;
1117 loff_t offset, *off;
1120 pipe = pipe_info(in->f_path.dentry->d_inode);
1125 if (out->f_op->llseek == no_llseek)
1127 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1133 ret = do_splice_from(pipe, out, off, len, flags);
1135 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1141 pipe = pipe_info(out->f_path.dentry->d_inode);
1146 if (in->f_op->llseek == no_llseek)
1148 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1154 ret = do_splice_to(in, off, pipe, len, flags);
1156 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1166 * Map an iov into an array of pages and offset/length tuples. With the
1167 * partial_page structure, we can map several non-contiguous ranges into
1168 * our one pages[] map instead of splitting that operation into pieces.
1169 * Could easily be exported as a generic helper for other users, in which
1170 * case one would probably want to add a 'max_nr_pages' parameter as well.
1172 static int get_iovec_page_array(const struct iovec __user *iov,
1173 unsigned int nr_vecs, struct page **pages,
1174 struct partial_page *partial, int aligned)
1176 int buffers = 0, error = 0;
1179 * It's ok to take the mmap_sem for reading, even
1180 * across a "get_user()".
1182 down_read(&current->mm->mmap_sem);
1185 unsigned long off, npages;
1191 * Get user address base and length for this iovec.
1193 error = get_user(base, &iov->iov_base);
1194 if (unlikely(error))
1196 error = get_user(len, &iov->iov_len);
1197 if (unlikely(error))
1201 * Sanity check this iovec. 0 read succeeds.
1206 if (unlikely(!base))
1210 * Get this base offset and number of pages, then map
1211 * in the user pages.
1213 off = (unsigned long) base & ~PAGE_MASK;
1216 * If asked for alignment, the offset must be zero and the
1217 * length a multiple of PAGE_SIZE.
1220 if (aligned && (off || len & ~PAGE_MASK))
1223 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1224 if (npages > PIPE_BUFFERS - buffers)
1225 npages = PIPE_BUFFERS - buffers;
1227 error = get_user_pages(current, current->mm,
1228 (unsigned long) base, npages, 0, 0,
1229 &pages[buffers], NULL);
1231 if (unlikely(error <= 0))
1235 * Fill this contiguous range into the partial page map.
1237 for (i = 0; i < error; i++) {
1238 const int plen = min_t(size_t, len, PAGE_SIZE - off);
1240 partial[buffers].offset = off;
1241 partial[buffers].len = plen;
1249 * We didn't complete this iov, stop here since it probably
1250 * means we have to move some of this into a pipe to
1251 * be able to continue.
1257 * Don't continue if we mapped fewer pages than we asked for,
1258 * or if we mapped the max number of pages that we have room for.
1261 if (error < npages || buffers == PIPE_BUFFERS)
1268 up_read(&current->mm->mmap_sem);
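/*
 * A worked example of the mapping arithmetic above, assuming a
 * 4096-byte PAGE_SIZE: for an iovec with base = 0x100ff0 and
 * len = 8192, off = 0xff0 (4080), so
 * npages = (4080 + 8192 + 4095) >> PAGE_SHIFT = 3, and the partial[]
 * entries come out as {.offset = 4080, .len = 16}, {0, 4096} and
 * {0, 4080}, which together cover the requested 8192 bytes.
 */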
1276 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1277 struct splice_desc *sd)
1282 ret = buf->ops->pin(pipe, buf);
1287 * See if we can use the atomic maps, by prefaulting in the
1288 * pages and doing an atomic copy
1290 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1291 src = buf->ops->map(pipe, buf, 1);
1292 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1294 buf->ops->unmap(pipe, buf, src);
1302 * No dice, use slow non-atomic map and copy
1304 src = buf->ops->map(pipe, buf, 0);
1307 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1312 sd->u.userptr += ret;
1313 buf->ops->unmap(pipe, buf, src);
1318 * For lack of a better implementation, implement vmsplice() to userspace
1319 * as a simple copy of the pipe's pages to the user iov.
1321 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1322 unsigned long nr_segs, unsigned int flags)
1324 struct pipe_inode_info *pipe;
1325 struct splice_desc sd;
1330 pipe = pipe_info(file->f_path.dentry->d_inode);
1335 mutex_lock(&pipe->inode->i_mutex);
1343 * Get user address base and length for this iovec.
1345 error = get_user(base, &iov->iov_base);
1346 if (unlikely(error))
1348 error = get_user(len, &iov->iov_len);
1349 if (unlikely(error))
1353 * Sanity check this iovec. 0 read succeeds.
1357 if (unlikely(!base)) {
1365 sd.u.userptr = base;
1368 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1386 mutex_unlock(&pipe->inode->i_mutex);
1395 * vmsplice splices a user address range into a pipe. It can be thought of
1396 * as splice-from-memory, where the regular splice is splice-from-file (or
1397 * to file). In both cases the output is a pipe, naturally.
1399 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1400 unsigned long nr_segs, unsigned int flags)
1402 struct pipe_inode_info *pipe;
1403 struct page *pages[PIPE_BUFFERS];
1404 struct partial_page partial[PIPE_BUFFERS];
1405 struct splice_pipe_desc spd = {
1409 .ops = &user_page_pipe_buf_ops,
1412 pipe = pipe_info(file->f_path.dentry->d_inode);
1416 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1417 flags & SPLICE_F_GIFT);
1418 if (spd.nr_pages <= 0)
1419 return spd.nr_pages;
1421 return splice_to_pipe(pipe, &spd);
1425 * Note that vmsplice only really supports true splicing _from_ user memory
1426 * to a pipe, not the other way around. Splicing from user memory is a simple
1427 * operation that can be supported without any funky alignment restrictions
1428 * or nasty vm tricks. We simply map in the user memory and fill it into
1429 * a pipe. The reverse isn't quite as easy, though. There are two possible
1430 * solutions for that:
1432 * - memcpy() the data internally, at which point we might as well just
1433 * do a regular read() on the buffer anyway.
1434 * - Lots of nasty vm tricks that are neither fast nor flexible (it
1435 * has restrictions on both ends of the pipe).
1437 * Currently we punt and implement it as a normal copy; see pipe_to_user().
1440 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1441 unsigned long nr_segs, unsigned int flags)
1447 if (unlikely(nr_segs > UIO_MAXIOV))
1449 else if (unlikely(!nr_segs))
1453 file = fget_light(fd, &fput);
1455 if (file->f_mode & FMODE_WRITE)
1456 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1457 else if (file->f_mode & FMODE_READ)
1458 error = vmsplice_to_user(file, iov, nr_segs, flags);
1460 fput_light(file, fput);
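/*
 * A userspace sketch of the memory -> pipe direction that vmsplice()
 * supports well, assuming the glibc vmsplice(2) wrapper. Note that
 * SPLICE_F_GIFT requires the base and length to be page aligned, per
 * the 'aligned' check in get_iovec_page_array():
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sys/uio.h>
 *
 *	ssize_t gift_buffer_to_pipe(int pipefd, void *buf, size_t len)
 *	{
 *		struct iovec iov = { .iov_base = buf, .iov_len = len };
 *
 *		return vmsplice(pipefd, &iov, 1, SPLICE_F_GIFT);
 *	}
 */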
1466 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1467 int fd_out, loff_t __user *off_out,
1468 size_t len, unsigned int flags)
1471 struct file *in, *out;
1472 int fput_in, fput_out;
1478 in = fget_light(fd_in, &fput_in);
1480 if (in->f_mode & FMODE_READ) {
1481 out = fget_light(fd_out, &fput_out);
1483 if (out->f_mode & FMODE_WRITE)
1484 error = do_splice(in, off_in,
1487 fput_light(out, fput_out);
1491 fput_light(in, fput_in);
1498 * Make sure there's data to read. Wait for input if we can, otherwise
1499 * return an appropriate error.
1501 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1506 * Check ->nrbufs without the inode lock first. This function
1507 * is speculative anyway, so missing one is OK.
1513 mutex_lock(&pipe->inode->i_mutex);
1515 while (!pipe->nrbufs) {
1516 if (signal_pending(current)) {
1522 if (!pipe->waiting_writers) {
1523 if (flags & SPLICE_F_NONBLOCK) {
1531 mutex_unlock(&pipe->inode->i_mutex);
1536 * Make sure there's writable room. Wait for room if we can, otherwise
1537 * return an appropriate error.
1539 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1544 * Check ->nrbufs without the inode lock first. This function
1545 * is speculative anyway, so missing one is OK.
1547 if (pipe->nrbufs < PIPE_BUFFERS)
1551 mutex_lock(&pipe->inode->i_mutex);
1553 while (pipe->nrbufs >= PIPE_BUFFERS) {
1554 if (!pipe->readers) {
1555 send_sig(SIGPIPE, current, 0);
1559 if (flags & SPLICE_F_NONBLOCK) {
1563 if (signal_pending(current)) {
1567 pipe->waiting_writers++;
1569 pipe->waiting_writers--;
1572 mutex_unlock(&pipe->inode->i_mutex);
1577 * Link contents of ipipe to opipe.
1579 static int link_pipe(struct pipe_inode_info *ipipe,
1580 struct pipe_inode_info *opipe,
1581 size_t len, unsigned int flags)
1583 struct pipe_buffer *ibuf, *obuf;
1584 int ret = 0, i = 0, nbuf;
1587 * Potential ABBA deadlock, work around it by ordering lock
1588 * grabbing by inode address. Otherwise two different processes
1589 * could deadlock (one doing tee from A -> B, the other from B -> A).
1591 inode_double_lock(ipipe->inode, opipe->inode);
1594 if (!opipe->readers) {
1595 send_sig(SIGPIPE, current, 0);
1602 * If we have iterated all input buffers or run out of
1603 * output room, break.
1605 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1608 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1609 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1612 * Get a reference to this pipe buffer,
1613 * so we can copy the contents over.
1615 ibuf->ops->get(ipipe, ibuf);
1617 obuf = opipe->bufs + nbuf;
1621 * Don't inherit the gift flag, we need to
1622 * prevent multiple steals of this page.
1624 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1626 if (obuf->len > len)
1635 inode_double_unlock(ipipe->inode, opipe->inode);
1638 * If we put data in the output pipe, wake up any potential readers.
1642 if (waitqueue_active(&opipe->wait))
1643 wake_up_interruptible(&opipe->wait);
1644 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
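/*
 * A worked example of the ring indexing used above, assuming
 * PIPE_BUFFERS is 16: with opipe->curbuf = 14 and opipe->nrbufs = 3,
 * the next free slot is nbuf = (14 + 3) & 15 = 1, i.e. the index
 * wraps around the fixed-size bufs[] array instead of running past
 * its end.
 */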
1651 * This is a tee(1) implementation that works on pipes. It doesn't copy
1652 * any data; it simply references the 'in' pages on the 'out' pipe.
1653 * The 'flags' used are the SPLICE_F_* variants; currently the only
1654 * applicable one is SPLICE_F_NONBLOCK.
1656 static long do_tee(struct file *in, struct file *out, size_t len,
1659 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1660 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1664 * Duplicate the contents of ipipe to opipe without actually copying the data.
1667 if (ipipe && opipe && ipipe != opipe) {
1669 * Keep going, unless we encounter an error. The ipipe/opipe
1670 * ordering doesn't really matter.
1672 ret = link_ipipe_prep(ipipe, flags);
1674 ret = link_opipe_prep(opipe, flags);
1676 ret = link_pipe(ipipe, opipe, len, flags);
1677 if (!ret && (flags & SPLICE_F_NONBLOCK))
1686 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1695 in = fget_light(fdin, &fput_in);
1697 if (in->f_mode & FMODE_READ) {
1699 struct file *out = fget_light(fdout, &fput_out);
1702 if (out->f_mode & FMODE_WRITE)
1703 error = do_tee(in, out, len, flags);
1704 fput_light(out, fput_out);
1707 fput_light(in, fput_in);
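/*
 * A userspace sketch of the tee(2) call implemented above, assuming
 * the glibc wrapper. It duplicates up to len bytes from one pipe to
 * another without consuming them from the input pipe:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	ssize_t duplicate_pipe_data(int pipe_in, int pipe_out, size_t len)
 *	{
 *		return tee(pipe_in, pipe_out, len, SPLICE_F_NONBLOCK);
 *	}
 */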