0af8d150394f6a968ba37a7c2f88afad1b2914fa
[linux-block.git] / fs / splice.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/socket.h>
37 #include <linux/sched/signal.h>
38
39 #include "internal.h"
40
41 /*
42  * Attempt to steal a page from a pipe buffer. This should perhaps go into
43  * a vm helper function, it's already simplified quite a bit by the
44  * addition of remove_mapping(). If success is returned, the caller may
45  * attempt to reuse this page for another destination.
46  */
47 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
48                 struct pipe_buffer *buf)
49 {
50         struct folio *folio = page_folio(buf->page);
51         struct address_space *mapping;
52
53         folio_lock(folio);
54
55         mapping = folio_mapping(folio);
56         if (mapping) {
57                 WARN_ON(!folio_test_uptodate(folio));
58
59                 /*
60                  * At least for ext2 with nobh option, we need to wait on
61                  * writeback completing on this folio, since we'll remove it
62                  * from the pagecache.  Otherwise truncate wont wait on the
63                  * folio, allowing the disk blocks to be reused by someone else
64                  * before we actually wrote our data to them. fs corruption
65                  * ensues.
66                  */
67                 folio_wait_writeback(folio);
68
69                 if (folio_has_private(folio) &&
70                     !filemap_release_folio(folio, GFP_KERNEL))
71                         goto out_unlock;
72
73                 /*
74                  * If we succeeded in removing the mapping, set LRU flag
75                  * and return good.
76                  */
77                 if (remove_mapping(mapping, folio)) {
78                         buf->flags |= PIPE_BUF_FLAG_LRU;
79                         return true;
80                 }
81         }
82
83         /*
84          * Raced with truncate or failed to remove folio from current
85          * address space, unlock and return failure.
86          */
87 out_unlock:
88         folio_unlock(folio);
89         return false;
90 }
91
92 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
93                                         struct pipe_buffer *buf)
94 {
95         put_page(buf->page);
96         buf->flags &= ~PIPE_BUF_FLAG_LRU;
97 }
98
99 /*
100  * Check whether the contents of buf is OK to access. Since the content
101  * is a page cache page, IO may be in flight.
102  */
103 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
104                                        struct pipe_buffer *buf)
105 {
106         struct page *page = buf->page;
107         int err;
108
109         if (!PageUptodate(page)) {
110                 lock_page(page);
111
112                 /*
113                  * Page got truncated/unhashed. This will cause a 0-byte
114                  * splice, if this is the first page.
115                  */
116                 if (!page->mapping) {
117                         err = -ENODATA;
118                         goto error;
119                 }
120
121                 /*
122                  * Uh oh, read-error from disk.
123                  */
124                 if (!PageUptodate(page)) {
125                         err = -EIO;
126                         goto error;
127                 }
128
129                 /*
130                  * Page is ok afterall, we are done.
131                  */
132                 unlock_page(page);
133         }
134
135         return 0;
136 error:
137         unlock_page(page);
138         return err;
139 }
140
141 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
142         .confirm        = page_cache_pipe_buf_confirm,
143         .release        = page_cache_pipe_buf_release,
144         .try_steal      = page_cache_pipe_buf_try_steal,
145         .get            = generic_pipe_buf_get,
146 };
147
148 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
149                 struct pipe_buffer *buf)
150 {
151         if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
152                 return false;
153
154         buf->flags |= PIPE_BUF_FLAG_LRU;
155         return generic_pipe_buf_try_steal(pipe, buf);
156 }
157
158 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
159         .release        = page_cache_pipe_buf_release,
160         .try_steal      = user_page_pipe_buf_try_steal,
161         .get            = generic_pipe_buf_get,
162 };
163
164 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
165 {
166         smp_mb();
167         if (waitqueue_active(&pipe->rd_wait))
168                 wake_up_interruptible(&pipe->rd_wait);
169         kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
170 }
171
172 /**
173  * splice_to_pipe - fill passed data into a pipe
174  * @pipe:       pipe to fill
175  * @spd:        data to fill
176  *
177  * Description:
178  *    @spd contains a map of pages and len/offset tuples, along with
179  *    the struct pipe_buf_operations associated with these pages. This
180  *    function will link that data to the pipe.
181  *
182  */
183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
184                        struct splice_pipe_desc *spd)
185 {
186         unsigned int spd_pages = spd->nr_pages;
187         unsigned int tail = pipe->tail;
188         unsigned int head = pipe->head;
189         unsigned int mask = pipe->ring_size - 1;
190         int ret = 0, page_nr = 0;
191
192         if (!spd_pages)
193                 return 0;
194
195         if (unlikely(!pipe->readers)) {
196                 send_sig(SIGPIPE, current, 0);
197                 ret = -EPIPE;
198                 goto out;
199         }
200
201         while (!pipe_full(head, tail, pipe->max_usage)) {
202                 struct pipe_buffer *buf = &pipe->bufs[head & mask];
203
204                 buf->page = spd->pages[page_nr];
205                 buf->offset = spd->partial[page_nr].offset;
206                 buf->len = spd->partial[page_nr].len;
207                 buf->private = spd->partial[page_nr].private;
208                 buf->ops = spd->ops;
209                 buf->flags = 0;
210
211                 head++;
212                 pipe->head = head;
213                 page_nr++;
214                 ret += buf->len;
215
216                 if (!--spd->nr_pages)
217                         break;
218         }
219
220         if (!ret)
221                 ret = -EAGAIN;
222
223 out:
224         while (page_nr < spd_pages)
225                 spd->spd_release(spd, page_nr++);
226
227         return ret;
228 }
229 EXPORT_SYMBOL_GPL(splice_to_pipe);
230
231 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
232 {
233         unsigned int head = pipe->head;
234         unsigned int tail = pipe->tail;
235         unsigned int mask = pipe->ring_size - 1;
236         int ret;
237
238         if (unlikely(!pipe->readers)) {
239                 send_sig(SIGPIPE, current, 0);
240                 ret = -EPIPE;
241         } else if (pipe_full(head, tail, pipe->max_usage)) {
242                 ret = -EAGAIN;
243         } else {
244                 pipe->bufs[head & mask] = *buf;
245                 pipe->head = head + 1;
246                 return buf->len;
247         }
248         pipe_buf_release(pipe, buf);
249         return ret;
250 }
251 EXPORT_SYMBOL(add_to_pipe);
252
253 /*
254  * Check if we need to grow the arrays holding pages and partial page
255  * descriptions.
256  */
257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
258 {
259         unsigned int max_usage = READ_ONCE(pipe->max_usage);
260
261         spd->nr_pages_max = max_usage;
262         if (max_usage <= PIPE_DEF_BUFFERS)
263                 return 0;
264
265         spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
266         spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
267                                      GFP_KERNEL);
268
269         if (spd->pages && spd->partial)
270                 return 0;
271
272         kfree(spd->pages);
273         kfree(spd->partial);
274         return -ENOMEM;
275 }
276
277 void splice_shrink_spd(struct splice_pipe_desc *spd)
278 {
279         if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
280                 return;
281
282         kfree(spd->pages);
283         kfree(spd->partial);
284 }
285
286 /*
287  * Splice data from an O_DIRECT file into pages and then add them to the output
288  * pipe.
289  */
290 ssize_t direct_splice_read(struct file *in, loff_t *ppos,
291                            struct pipe_inode_info *pipe,
292                            size_t len, unsigned int flags)
293 {
294         struct iov_iter to;
295         struct bio_vec *bv;
296         struct kiocb kiocb;
297         struct page **pages;
298         ssize_t ret;
299         size_t used, npages, chunk, remain, reclaim;
300         int i;
301
302         /* Work out how much data we can actually add into the pipe */
303         used = pipe_occupancy(pipe->head, pipe->tail);
304         npages = max_t(ssize_t, pipe->max_usage - used, 0);
305         len = min_t(size_t, len, npages * PAGE_SIZE);
306         npages = DIV_ROUND_UP(len, PAGE_SIZE);
307
308         bv = kzalloc(array_size(npages, sizeof(bv[0])) +
309                      array_size(npages, sizeof(struct page *)), GFP_KERNEL);
310         if (!bv)
311                 return -ENOMEM;
312
313         pages = (void *)(bv + npages);
314         npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
315         if (!npages) {
316                 kfree(bv);
317                 return -ENOMEM;
318         }
319
320         remain = len = min_t(size_t, len, npages * PAGE_SIZE);
321
322         for (i = 0; i < npages; i++) {
323                 chunk = min_t(size_t, PAGE_SIZE, remain);
324                 bv[i].bv_page = pages[i];
325                 bv[i].bv_offset = 0;
326                 bv[i].bv_len = chunk;
327                 remain -= chunk;
328         }
329
330         /* Do the I/O */
331         iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
332         init_sync_kiocb(&kiocb, in);
333         kiocb.ki_pos = *ppos;
334         ret = call_read_iter(in, &kiocb, &to);
335
336         reclaim = npages * PAGE_SIZE;
337         remain = 0;
338         if (ret > 0) {
339                 reclaim -= ret;
340                 remain = ret;
341                 *ppos = kiocb.ki_pos;
342                 file_accessed(in);
343         } else if (ret < 0) {
344                 /*
345                  * callers of ->splice_read() expect -EAGAIN on
346                  * "can't put anything in there", rather than -EFAULT.
347                  */
348                 if (ret == -EFAULT)
349                         ret = -EAGAIN;
350         }
351
352         /* Free any pages that didn't get touched at all. */
353         reclaim /= PAGE_SIZE;
354         if (reclaim) {
355                 npages -= reclaim;
356                 release_pages(pages + npages, reclaim);
357         }
358
359         /* Push the remaining pages into the pipe. */
360         for (i = 0; i < npages; i++) {
361                 struct pipe_buffer *buf = pipe_head_buf(pipe);
362
363                 chunk = min_t(size_t, remain, PAGE_SIZE);
364                 *buf = (struct pipe_buffer) {
365                         .ops    = &default_pipe_buf_ops,
366                         .page   = bv[i].bv_page,
367                         .offset = 0,
368                         .len    = chunk,
369                 };
370                 pipe->head++;
371                 remain -= chunk;
372         }
373
374         kfree(bv);
375         return ret;
376 }
377 EXPORT_SYMBOL(direct_splice_read);
378
379 /**
380  * generic_file_splice_read - splice data from file to a pipe
381  * @in:         file to splice from
382  * @ppos:       position in @in
383  * @pipe:       pipe to splice to
384  * @len:        number of bytes to splice
385  * @flags:      splice modifier flags
386  *
387  * Description:
388  *    Will read pages from given file and fill them into a pipe. Can be
389  *    used as long as it has more or less sane ->read_iter().
390  *
391  */
392 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
393                                  struct pipe_inode_info *pipe, size_t len,
394                                  unsigned int flags)
395 {
396         struct iov_iter to;
397         struct kiocb kiocb;
398         int ret;
399
400         iov_iter_pipe(&to, ITER_DEST, pipe, len);
401         init_sync_kiocb(&kiocb, in);
402         kiocb.ki_pos = *ppos;
403         ret = call_read_iter(in, &kiocb, &to);
404         if (ret > 0) {
405                 *ppos = kiocb.ki_pos;
406                 file_accessed(in);
407         } else if (ret < 0) {
408                 /* free what was emitted */
409                 pipe_discard_from(pipe, to.start_head);
410                 /*
411                  * callers of ->splice_read() expect -EAGAIN on
412                  * "can't put anything in there", rather than -EFAULT.
413                  */
414                 if (ret == -EFAULT)
415                         ret = -EAGAIN;
416         }
417
418         return ret;
419 }
420 EXPORT_SYMBOL(generic_file_splice_read);
421
422 const struct pipe_buf_operations default_pipe_buf_ops = {
423         .release        = generic_pipe_buf_release,
424         .try_steal      = generic_pipe_buf_try_steal,
425         .get            = generic_pipe_buf_get,
426 };
427
428 /* Pipe buffer operations for a socket and similar. */
429 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
430         .release        = generic_pipe_buf_release,
431         .get            = generic_pipe_buf_get,
432 };
433 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
434
435 /*
436  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
437  * using sendpage(). Return the number of bytes sent.
438  */
439 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
440                             struct pipe_buffer *buf, struct splice_desc *sd)
441 {
442         struct file *file = sd->u.file;
443         loff_t pos = sd->pos;
444         int more;
445
446         if (!likely(file->f_op->sendpage))
447                 return -EINVAL;
448
449         more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
450
451         if (sd->len < sd->total_len &&
452             pipe_occupancy(pipe->head, pipe->tail) > 1)
453                 more |= MSG_SENDPAGE_NOTLAST;
454
455         return file->f_op->sendpage(file, buf->page, buf->offset,
456                                     sd->len, &pos, more);
457 }
458
459 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
460 {
461         smp_mb();
462         if (waitqueue_active(&pipe->wr_wait))
463                 wake_up_interruptible(&pipe->wr_wait);
464         kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
465 }
466
467 /**
468  * splice_from_pipe_feed - feed available data from a pipe to a file
469  * @pipe:       pipe to splice from
470  * @sd:         information to @actor
471  * @actor:      handler that splices the data
472  *
473  * Description:
474  *    This function loops over the pipe and calls @actor to do the
475  *    actual moving of a single struct pipe_buffer to the desired
476  *    destination.  It returns when there's no more buffers left in
477  *    the pipe or if the requested number of bytes (@sd->total_len)
478  *    have been copied.  It returns a positive number (one) if the
479  *    pipe needs to be filled with more data, zero if the required
480  *    number of bytes have been copied and -errno on error.
481  *
482  *    This, together with splice_from_pipe_{begin,end,next}, may be
483  *    used to implement the functionality of __splice_from_pipe() when
484  *    locking is required around copying the pipe buffers to the
485  *    destination.
486  */
487 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
488                           splice_actor *actor)
489 {
490         unsigned int head = pipe->head;
491         unsigned int tail = pipe->tail;
492         unsigned int mask = pipe->ring_size - 1;
493         int ret;
494
495         while (!pipe_empty(head, tail)) {
496                 struct pipe_buffer *buf = &pipe->bufs[tail & mask];
497
498                 sd->len = buf->len;
499                 if (sd->len > sd->total_len)
500                         sd->len = sd->total_len;
501
502                 ret = pipe_buf_confirm(pipe, buf);
503                 if (unlikely(ret)) {
504                         if (ret == -ENODATA)
505                                 ret = 0;
506                         return ret;
507                 }
508
509                 ret = actor(pipe, buf, sd);
510                 if (ret <= 0)
511                         return ret;
512
513                 buf->offset += ret;
514                 buf->len -= ret;
515
516                 sd->num_spliced += ret;
517                 sd->len -= ret;
518                 sd->pos += ret;
519                 sd->total_len -= ret;
520
521                 if (!buf->len) {
522                         pipe_buf_release(pipe, buf);
523                         tail++;
524                         pipe->tail = tail;
525                         if (pipe->files)
526                                 sd->need_wakeup = true;
527                 }
528
529                 if (!sd->total_len)
530                         return 0;
531         }
532
533         return 1;
534 }
535
536 /* We know we have a pipe buffer, but maybe it's empty? */
537 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
538 {
539         unsigned int tail = pipe->tail;
540         unsigned int mask = pipe->ring_size - 1;
541         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
542
543         if (unlikely(!buf->len)) {
544                 pipe_buf_release(pipe, buf);
545                 pipe->tail = tail+1;
546                 return true;
547         }
548
549         return false;
550 }
551
552 /**
553  * splice_from_pipe_next - wait for some data to splice from
554  * @pipe:       pipe to splice from
555  * @sd:         information about the splice operation
556  *
557  * Description:
558  *    This function will wait for some data and return a positive
559  *    value (one) if pipe buffers are available.  It will return zero
560  *    or -errno if no more data needs to be spliced.
561  */
562 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
563 {
564         /*
565          * Check for signal early to make process killable when there are
566          * always buffers available
567          */
568         if (signal_pending(current))
569                 return -ERESTARTSYS;
570
571 repeat:
572         while (pipe_empty(pipe->head, pipe->tail)) {
573                 if (!pipe->writers)
574                         return 0;
575
576                 if (sd->num_spliced)
577                         return 0;
578
579                 if (sd->flags & SPLICE_F_NONBLOCK)
580                         return -EAGAIN;
581
582                 if (signal_pending(current))
583                         return -ERESTARTSYS;
584
585                 if (sd->need_wakeup) {
586                         wakeup_pipe_writers(pipe);
587                         sd->need_wakeup = false;
588                 }
589
590                 pipe_wait_readable(pipe);
591         }
592
593         if (eat_empty_buffer(pipe))
594                 goto repeat;
595
596         return 1;
597 }
598
599 /**
600  * splice_from_pipe_begin - start splicing from pipe
601  * @sd:         information about the splice operation
602  *
603  * Description:
604  *    This function should be called before a loop containing
605  *    splice_from_pipe_next() and splice_from_pipe_feed() to
606  *    initialize the necessary fields of @sd.
607  */
608 static void splice_from_pipe_begin(struct splice_desc *sd)
609 {
610         sd->num_spliced = 0;
611         sd->need_wakeup = false;
612 }
613
614 /**
615  * splice_from_pipe_end - finish splicing from pipe
616  * @pipe:       pipe to splice from
617  * @sd:         information about the splice operation
618  *
619  * Description:
620  *    This function will wake up pipe writers if necessary.  It should
621  *    be called after a loop containing splice_from_pipe_next() and
622  *    splice_from_pipe_feed().
623  */
624 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
625 {
626         if (sd->need_wakeup)
627                 wakeup_pipe_writers(pipe);
628 }
629
630 /**
631  * __splice_from_pipe - splice data from a pipe to given actor
632  * @pipe:       pipe to splice from
633  * @sd:         information to @actor
634  * @actor:      handler that splices the data
635  *
636  * Description:
637  *    This function does little more than loop over the pipe and call
638  *    @actor to do the actual moving of a single struct pipe_buffer to
639  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
640  *    pipe_to_user.
641  *
642  */
643 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
644                            splice_actor *actor)
645 {
646         int ret;
647
648         splice_from_pipe_begin(sd);
649         do {
650                 cond_resched();
651                 ret = splice_from_pipe_next(pipe, sd);
652                 if (ret > 0)
653                         ret = splice_from_pipe_feed(pipe, sd, actor);
654         } while (ret > 0);
655         splice_from_pipe_end(pipe, sd);
656
657         return sd->num_spliced ? sd->num_spliced : ret;
658 }
659 EXPORT_SYMBOL(__splice_from_pipe);
660
661 /**
662  * splice_from_pipe - splice data from a pipe to a file
663  * @pipe:       pipe to splice from
664  * @out:        file to splice to
665  * @ppos:       position in @out
666  * @len:        how many bytes to splice
667  * @flags:      splice modifier flags
668  * @actor:      handler that splices the data
669  *
670  * Description:
671  *    See __splice_from_pipe. This function locks the pipe inode,
672  *    otherwise it's identical to __splice_from_pipe().
673  *
674  */
675 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
676                          loff_t *ppos, size_t len, unsigned int flags,
677                          splice_actor *actor)
678 {
679         ssize_t ret;
680         struct splice_desc sd = {
681                 .total_len = len,
682                 .flags = flags,
683                 .pos = *ppos,
684                 .u.file = out,
685         };
686
687         pipe_lock(pipe);
688         ret = __splice_from_pipe(pipe, &sd, actor);
689         pipe_unlock(pipe);
690
691         return ret;
692 }
693
694 /**
695  * iter_file_splice_write - splice data from a pipe to a file
696  * @pipe:       pipe info
697  * @out:        file to write to
698  * @ppos:       position in @out
699  * @len:        number of bytes to splice
700  * @flags:      splice modifier flags
701  *
702  * Description:
703  *    Will either move or copy pages (determined by @flags options) from
704  *    the given pipe inode to the given file.
705  *    This one is ->write_iter-based.
706  *
707  */
708 ssize_t
709 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
710                           loff_t *ppos, size_t len, unsigned int flags)
711 {
712         struct splice_desc sd = {
713                 .total_len = len,
714                 .flags = flags,
715                 .pos = *ppos,
716                 .u.file = out,
717         };
718         int nbufs = pipe->max_usage;
719         struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
720                                         GFP_KERNEL);
721         ssize_t ret;
722
723         if (unlikely(!array))
724                 return -ENOMEM;
725
726         pipe_lock(pipe);
727
728         splice_from_pipe_begin(&sd);
729         while (sd.total_len) {
730                 struct iov_iter from;
731                 unsigned int head, tail, mask;
732                 size_t left;
733                 int n;
734
735                 ret = splice_from_pipe_next(pipe, &sd);
736                 if (ret <= 0)
737                         break;
738
739                 if (unlikely(nbufs < pipe->max_usage)) {
740                         kfree(array);
741                         nbufs = pipe->max_usage;
742                         array = kcalloc(nbufs, sizeof(struct bio_vec),
743                                         GFP_KERNEL);
744                         if (!array) {
745                                 ret = -ENOMEM;
746                                 break;
747                         }
748                 }
749
750                 head = pipe->head;
751                 tail = pipe->tail;
752                 mask = pipe->ring_size - 1;
753
754                 /* build the vector */
755                 left = sd.total_len;
756                 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
757                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
758                         size_t this_len = buf->len;
759
760                         /* zero-length bvecs are not supported, skip them */
761                         if (!this_len)
762                                 continue;
763                         this_len = min(this_len, left);
764
765                         ret = pipe_buf_confirm(pipe, buf);
766                         if (unlikely(ret)) {
767                                 if (ret == -ENODATA)
768                                         ret = 0;
769                                 goto done;
770                         }
771
772                         bvec_set_page(&array[n], buf->page, this_len,
773                                       buf->offset);
774                         left -= this_len;
775                         n++;
776                 }
777
778                 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
779                 ret = vfs_iter_write(out, &from, &sd.pos, 0);
780                 if (ret <= 0)
781                         break;
782
783                 sd.num_spliced += ret;
784                 sd.total_len -= ret;
785                 *ppos = sd.pos;
786
787                 /* dismiss the fully eaten buffers, adjust the partial one */
788                 tail = pipe->tail;
789                 while (ret) {
790                         struct pipe_buffer *buf = &pipe->bufs[tail & mask];
791                         if (ret >= buf->len) {
792                                 ret -= buf->len;
793                                 buf->len = 0;
794                                 pipe_buf_release(pipe, buf);
795                                 tail++;
796                                 pipe->tail = tail;
797                                 if (pipe->files)
798                                         sd.need_wakeup = true;
799                         } else {
800                                 buf->offset += ret;
801                                 buf->len -= ret;
802                                 ret = 0;
803                         }
804                 }
805         }
806 done:
807         kfree(array);
808         splice_from_pipe_end(pipe, &sd);
809
810         pipe_unlock(pipe);
811
812         if (sd.num_spliced)
813                 ret = sd.num_spliced;
814
815         return ret;
816 }
817
818 EXPORT_SYMBOL(iter_file_splice_write);
819
820 /**
821  * generic_splice_sendpage - splice data from a pipe to a socket
822  * @pipe:       pipe to splice from
823  * @out:        socket to write to
824  * @ppos:       position in @out
825  * @len:        number of bytes to splice
826  * @flags:      splice modifier flags
827  *
828  * Description:
829  *    Will send @len bytes from the pipe to a network socket. No data copying
830  *    is involved.
831  *
832  */
833 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
834                                 loff_t *ppos, size_t len, unsigned int flags)
835 {
836         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
837 }
838
839 EXPORT_SYMBOL(generic_splice_sendpage);
840
841 static int warn_unsupported(struct file *file, const char *op)
842 {
843         pr_debug_ratelimited(
844                 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
845                 op, file, current->pid, current->comm);
846         return -EINVAL;
847 }
848
849 /*
850  * Attempt to initiate a splice from pipe to file.
851  */
852 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
853                            loff_t *ppos, size_t len, unsigned int flags)
854 {
855         if (unlikely(!out->f_op->splice_write))
856                 return warn_unsupported(out, "write");
857         return out->f_op->splice_write(pipe, out, ppos, len, flags);
858 }
859
860 /*
861  * Attempt to initiate a splice from a file to a pipe.
862  */
863 static long do_splice_to(struct file *in, loff_t *ppos,
864                          struct pipe_inode_info *pipe, size_t len,
865                          unsigned int flags)
866 {
867         unsigned int p_space;
868         int ret;
869
870         if (unlikely(!(in->f_mode & FMODE_READ)))
871                 return -EBADF;
872
873         /* Don't try to read more the pipe has space for. */
874         p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
875         len = min_t(size_t, len, p_space << PAGE_SHIFT);
876
877         ret = rw_verify_area(READ, in, ppos, len);
878         if (unlikely(ret < 0))
879                 return ret;
880
881         if (unlikely(len > MAX_RW_COUNT))
882                 len = MAX_RW_COUNT;
883
884         if (unlikely(!in->f_op->splice_read))
885                 return warn_unsupported(in, "read");
886         return in->f_op->splice_read(in, ppos, pipe, len, flags);
887 }
888
889 /**
890  * splice_direct_to_actor - splices data directly between two non-pipes
891  * @in:         file to splice from
892  * @sd:         actor information on where to splice to
893  * @actor:      handles the data splicing
894  *
895  * Description:
896  *    This is a special case helper to splice directly between two
897  *    points, without requiring an explicit pipe. Internally an allocated
898  *    pipe is cached in the process, and reused during the lifetime of
899  *    that process.
900  *
901  */
902 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
903                                splice_direct_actor *actor)
904 {
905         struct pipe_inode_info *pipe;
906         long ret, bytes;
907         size_t len;
908         int i, flags, more;
909
910         /*
911          * We require the input to be seekable, as we don't want to randomly
912          * drop data for eg socket -> socket splicing. Use the piped splicing
913          * for that!
914          */
915         if (unlikely(!(in->f_mode & FMODE_LSEEK)))
916                 return -EINVAL;
917
918         /*
919          * neither in nor out is a pipe, setup an internal pipe attached to
920          * 'out' and transfer the wanted data from 'in' to 'out' through that
921          */
922         pipe = current->splice_pipe;
923         if (unlikely(!pipe)) {
924                 pipe = alloc_pipe_info();
925                 if (!pipe)
926                         return -ENOMEM;
927
928                 /*
929                  * We don't have an immediate reader, but we'll read the stuff
930                  * out of the pipe right after the splice_to_pipe(). So set
931                  * PIPE_READERS appropriately.
932                  */
933                 pipe->readers = 1;
934
935                 current->splice_pipe = pipe;
936         }
937
938         /*
939          * Do the splice.
940          */
941         bytes = 0;
942         len = sd->total_len;
943         flags = sd->flags;
944
945         /*
946          * Don't block on output, we have to drain the direct pipe.
947          */
948         sd->flags &= ~SPLICE_F_NONBLOCK;
949         more = sd->flags & SPLICE_F_MORE;
950
951         WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
952
953         while (len) {
954                 size_t read_len;
955                 loff_t pos = sd->pos, prev_pos = pos;
956
957                 ret = do_splice_to(in, &pos, pipe, len, flags);
958                 if (unlikely(ret <= 0))
959                         goto out_release;
960
961                 read_len = ret;
962                 sd->total_len = read_len;
963
964                 /*
965                  * If more data is pending, set SPLICE_F_MORE
966                  * If this is the last data and SPLICE_F_MORE was not set
967                  * initially, clears it.
968                  */
969                 if (read_len < len)
970                         sd->flags |= SPLICE_F_MORE;
971                 else if (!more)
972                         sd->flags &= ~SPLICE_F_MORE;
973                 /*
974                  * NOTE: nonblocking mode only applies to the input. We
975                  * must not do the output in nonblocking mode as then we
976                  * could get stuck data in the internal pipe:
977                  */
978                 ret = actor(pipe, sd);
979                 if (unlikely(ret <= 0)) {
980                         sd->pos = prev_pos;
981                         goto out_release;
982                 }
983
984                 bytes += ret;
985                 len -= ret;
986                 sd->pos = pos;
987
988                 if (ret < read_len) {
989                         sd->pos = prev_pos + ret;
990                         goto out_release;
991                 }
992         }
993
994 done:
995         pipe->tail = pipe->head = 0;
996         file_accessed(in);
997         return bytes;
998
999 out_release:
1000         /*
1001          * If we did an incomplete transfer we must release
1002          * the pipe buffers in question:
1003          */
1004         for (i = 0; i < pipe->ring_size; i++) {
1005                 struct pipe_buffer *buf = &pipe->bufs[i];
1006
1007                 if (buf->ops)
1008                         pipe_buf_release(pipe, buf);
1009         }
1010
1011         if (!bytes)
1012                 bytes = ret;
1013
1014         goto done;
1015 }
1016 EXPORT_SYMBOL(splice_direct_to_actor);
1017
1018 static int direct_splice_actor(struct pipe_inode_info *pipe,
1019                                struct splice_desc *sd)
1020 {
1021         struct file *file = sd->u.file;
1022
1023         return do_splice_from(pipe, file, sd->opos, sd->total_len,
1024                               sd->flags);
1025 }
1026
1027 /**
1028  * do_splice_direct - splices data directly between two files
1029  * @in:         file to splice from
1030  * @ppos:       input file offset
1031  * @out:        file to splice to
1032  * @opos:       output file offset
1033  * @len:        number of bytes to splice
1034  * @flags:      splice modifier flags
1035  *
1036  * Description:
1037  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1038  *    doing it in the application would incur an extra system call
1039  *    (splice in + splice out, as compared to just sendfile()). So this helper
1040  *    can splice directly through a process-private pipe.
1041  *
1042  */
1043 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1044                       loff_t *opos, size_t len, unsigned int flags)
1045 {
1046         struct splice_desc sd = {
1047                 .len            = len,
1048                 .total_len      = len,
1049                 .flags          = flags,
1050                 .pos            = *ppos,
1051                 .u.file         = out,
1052                 .opos           = opos,
1053         };
1054         long ret;
1055
1056         if (unlikely(!(out->f_mode & FMODE_WRITE)))
1057                 return -EBADF;
1058
1059         if (unlikely(out->f_flags & O_APPEND))
1060                 return -EINVAL;
1061
1062         ret = rw_verify_area(WRITE, out, opos, len);
1063         if (unlikely(ret < 0))
1064                 return ret;
1065
1066         ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1067         if (ret > 0)
1068                 *ppos = sd.pos;
1069
1070         return ret;
1071 }
1072 EXPORT_SYMBOL(do_splice_direct);
1073
1074 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1075 {
1076         for (;;) {
1077                 if (unlikely(!pipe->readers)) {
1078                         send_sig(SIGPIPE, current, 0);
1079                         return -EPIPE;
1080                 }
1081                 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1082                         return 0;
1083                 if (flags & SPLICE_F_NONBLOCK)
1084                         return -EAGAIN;
1085                 if (signal_pending(current))
1086                         return -ERESTARTSYS;
1087                 pipe_wait_writable(pipe);
1088         }
1089 }
1090
1091 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1092                                struct pipe_inode_info *opipe,
1093                                size_t len, unsigned int flags);
1094
1095 long splice_file_to_pipe(struct file *in,
1096                          struct pipe_inode_info *opipe,
1097                          loff_t *offset,
1098                          size_t len, unsigned int flags)
1099 {
1100         long ret;
1101
1102         pipe_lock(opipe);
1103         ret = wait_for_space(opipe, flags);
1104         if (!ret)
1105                 ret = do_splice_to(in, offset, opipe, len, flags);
1106         pipe_unlock(opipe);
1107         if (ret > 0)
1108                 wakeup_pipe_readers(opipe);
1109         return ret;
1110 }
1111
1112 /*
1113  * Determine where to splice to/from.
1114  */
1115 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1116                loff_t *off_out, size_t len, unsigned int flags)
1117 {
1118         struct pipe_inode_info *ipipe;
1119         struct pipe_inode_info *opipe;
1120         loff_t offset;
1121         long ret;
1122
1123         if (unlikely(!(in->f_mode & FMODE_READ) ||
1124                      !(out->f_mode & FMODE_WRITE)))
1125                 return -EBADF;
1126
1127         ipipe = get_pipe_info(in, true);
1128         opipe = get_pipe_info(out, true);
1129
1130         if (ipipe && opipe) {
1131                 if (off_in || off_out)
1132                         return -ESPIPE;
1133
1134                 /* Splicing to self would be fun, but... */
1135                 if (ipipe == opipe)
1136                         return -EINVAL;
1137
1138                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1139                         flags |= SPLICE_F_NONBLOCK;
1140
1141                 return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1142         }
1143
1144         if (ipipe) {
1145                 if (off_in)
1146                         return -ESPIPE;
1147                 if (off_out) {
1148                         if (!(out->f_mode & FMODE_PWRITE))
1149                                 return -EINVAL;
1150                         offset = *off_out;
1151                 } else {
1152                         offset = out->f_pos;
1153                 }
1154
1155                 if (unlikely(out->f_flags & O_APPEND))
1156                         return -EINVAL;
1157
1158                 ret = rw_verify_area(WRITE, out, &offset, len);
1159                 if (unlikely(ret < 0))
1160                         return ret;
1161
1162                 if (in->f_flags & O_NONBLOCK)
1163                         flags |= SPLICE_F_NONBLOCK;
1164
1165                 file_start_write(out);
1166                 ret = do_splice_from(ipipe, out, &offset, len, flags);
1167                 file_end_write(out);
1168
1169                 if (ret > 0)
1170                         fsnotify_modify(out);
1171
1172                 if (!off_out)
1173                         out->f_pos = offset;
1174                 else
1175                         *off_out = offset;
1176
1177                 return ret;
1178         }
1179
1180         if (opipe) {
1181                 if (off_out)
1182                         return -ESPIPE;
1183                 if (off_in) {
1184                         if (!(in->f_mode & FMODE_PREAD))
1185                                 return -EINVAL;
1186                         offset = *off_in;
1187                 } else {
1188                         offset = in->f_pos;
1189                 }
1190
1191                 if (out->f_flags & O_NONBLOCK)
1192                         flags |= SPLICE_F_NONBLOCK;
1193
1194                 ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1195
1196                 if (ret > 0)
1197                         fsnotify_access(in);
1198
1199                 if (!off_in)
1200                         in->f_pos = offset;
1201                 else
1202                         *off_in = offset;
1203
1204                 return ret;
1205         }
1206
1207         return -EINVAL;
1208 }
1209
1210 static long __do_splice(struct file *in, loff_t __user *off_in,
1211                         struct file *out, loff_t __user *off_out,
1212                         size_t len, unsigned int flags)
1213 {
1214         struct pipe_inode_info *ipipe;
1215         struct pipe_inode_info *opipe;
1216         loff_t offset, *__off_in = NULL, *__off_out = NULL;
1217         long ret;
1218
1219         ipipe = get_pipe_info(in, true);
1220         opipe = get_pipe_info(out, true);
1221
1222         if (ipipe && off_in)
1223                 return -ESPIPE;
1224         if (opipe && off_out)
1225                 return -ESPIPE;
1226
1227         if (off_out) {
1228                 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1229                         return -EFAULT;
1230                 __off_out = &offset;
1231         }
1232         if (off_in) {
1233                 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1234                         return -EFAULT;
1235                 __off_in = &offset;
1236         }
1237
1238         ret = do_splice(in, __off_in, out, __off_out, len, flags);
1239         if (ret < 0)
1240                 return ret;
1241
1242         if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1243                 return -EFAULT;
1244         if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1245                 return -EFAULT;
1246
1247         return ret;
1248 }
1249
1250 static int iter_to_pipe(struct iov_iter *from,
1251                         struct pipe_inode_info *pipe,
1252                         unsigned flags)
1253 {
1254         struct pipe_buffer buf = {
1255                 .ops = &user_page_pipe_buf_ops,
1256                 .flags = flags
1257         };
1258         size_t total = 0;
1259         int ret = 0;
1260
1261         while (iov_iter_count(from)) {
1262                 struct page *pages[16];
1263                 ssize_t left;
1264                 size_t start;
1265                 int i, n;
1266
1267                 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1268                 if (left <= 0) {
1269                         ret = left;
1270                         break;
1271                 }
1272
1273                 n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1274                 for (i = 0; i < n; i++) {
1275                         int size = min_t(int, left, PAGE_SIZE - start);
1276
1277                         buf.page = pages[i];
1278                         buf.offset = start;
1279                         buf.len = size;
1280                         ret = add_to_pipe(pipe, &buf);
1281                         if (unlikely(ret < 0)) {
1282                                 iov_iter_revert(from, left);
1283                                 // this one got dropped by add_to_pipe()
1284                                 while (++i < n)
1285                                         put_page(pages[i]);
1286                                 goto out;
1287                         }
1288                         total += ret;
1289                         left -= size;
1290                         start = 0;
1291                 }
1292         }
1293 out:
1294         return total ? total : ret;
1295 }
1296
1297 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1298                         struct splice_desc *sd)
1299 {
1300         int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1301         return n == sd->len ? n : -EFAULT;
1302 }
1303
1304 /*
1305  * For lack of a better implementation, implement vmsplice() to userspace
1306  * as a simple copy of the pipes pages to the user iov.
1307  */
1308 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1309                              unsigned int flags)
1310 {
1311         struct pipe_inode_info *pipe = get_pipe_info(file, true);
1312         struct splice_desc sd = {
1313                 .total_len = iov_iter_count(iter),
1314                 .flags = flags,
1315                 .u.data = iter
1316         };
1317         long ret = 0;
1318
1319         if (!pipe)
1320                 return -EBADF;
1321
1322         if (sd.total_len) {
1323                 pipe_lock(pipe);
1324                 ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1325                 pipe_unlock(pipe);
1326         }
1327
1328         return ret;
1329 }
1330
1331 /*
1332  * vmsplice splices a user address range into a pipe. It can be thought of
1333  * as splice-from-memory, where the regular splice is splice-from-file (or
1334  * to file). In both cases the output is a pipe, naturally.
1335  */
1336 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1337                              unsigned int flags)
1338 {
1339         struct pipe_inode_info *pipe;
1340         long ret = 0;
1341         unsigned buf_flag = 0;
1342
1343         if (flags & SPLICE_F_GIFT)
1344                 buf_flag = PIPE_BUF_FLAG_GIFT;
1345
1346         pipe = get_pipe_info(file, true);
1347         if (!pipe)
1348                 return -EBADF;
1349
1350         pipe_lock(pipe);
1351         ret = wait_for_space(pipe, flags);
1352         if (!ret)
1353                 ret = iter_to_pipe(iter, pipe, buf_flag);
1354         pipe_unlock(pipe);
1355         if (ret > 0)
1356                 wakeup_pipe_readers(pipe);
1357         return ret;
1358 }
1359
1360 static int vmsplice_type(struct fd f, int *type)
1361 {
1362         if (!f.file)
1363                 return -EBADF;
1364         if (f.file->f_mode & FMODE_WRITE) {
1365                 *type = ITER_SOURCE;
1366         } else if (f.file->f_mode & FMODE_READ) {
1367                 *type = ITER_DEST;
1368         } else {
1369                 fdput(f);
1370                 return -EBADF;
1371         }
1372         return 0;
1373 }
1374
1375 /*
1376  * Note that vmsplice only really supports true splicing _from_ user memory
1377  * to a pipe, not the other way around. Splicing from user memory is a simple
1378  * operation that can be supported without any funky alignment restrictions
1379  * or nasty vm tricks. We simply map in the user memory and fill them into
1380  * a pipe. The reverse isn't quite as easy, though. There are two possible
1381  * solutions for that:
1382  *
1383  *      - memcpy() the data internally, at which point we might as well just
1384  *        do a regular read() on the buffer anyway.
1385  *      - Lots of nasty vm tricks, that are neither fast nor flexible (it
1386  *        has restriction limitations on both ends of the pipe).
1387  *
1388  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1389  *
1390  */
1391 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1392                 unsigned long, nr_segs, unsigned int, flags)
1393 {
1394         struct iovec iovstack[UIO_FASTIOV];
1395         struct iovec *iov = iovstack;
1396         struct iov_iter iter;
1397         ssize_t error;
1398         struct fd f;
1399         int type;
1400
1401         if (unlikely(flags & ~SPLICE_F_ALL))
1402                 return -EINVAL;
1403
1404         f = fdget(fd);
1405         error = vmsplice_type(f, &type);
1406         if (error)
1407                 return error;
1408
1409         error = import_iovec(type, uiov, nr_segs,
1410                              ARRAY_SIZE(iovstack), &iov, &iter);
1411         if (error < 0)
1412                 goto out_fdput;
1413
1414         if (!iov_iter_count(&iter))
1415                 error = 0;
1416         else if (type == ITER_SOURCE)
1417                 error = vmsplice_to_pipe(f.file, &iter, flags);
1418         else
1419                 error = vmsplice_to_user(f.file, &iter, flags);
1420
1421         kfree(iov);
1422 out_fdput:
1423         fdput(f);
1424         return error;
1425 }
1426
1427 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1428                 int, fd_out, loff_t __user *, off_out,
1429                 size_t, len, unsigned int, flags)
1430 {
1431         struct fd in, out;
1432         long error;
1433
1434         if (unlikely(!len))
1435                 return 0;
1436
1437         if (unlikely(flags & ~SPLICE_F_ALL))
1438                 return -EINVAL;
1439
1440         error = -EBADF;
1441         in = fdget(fd_in);
1442         if (in.file) {
1443                 out = fdget(fd_out);
1444                 if (out.file) {
1445                         error = __do_splice(in.file, off_in, out.file, off_out,
1446                                                 len, flags);
1447                         fdput(out);
1448                 }
1449                 fdput(in);
1450         }
1451         return error;
1452 }
1453
1454 /*
1455  * Make sure there's data to read. Wait for input if we can, otherwise
1456  * return an appropriate error.
1457  */
1458 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1459 {
1460         int ret;
1461
1462         /*
1463          * Check the pipe occupancy without the inode lock first. This function
1464          * is speculative anyways, so missing one is ok.
1465          */
1466         if (!pipe_empty(pipe->head, pipe->tail))
1467                 return 0;
1468
1469         ret = 0;
1470         pipe_lock(pipe);
1471
1472         while (pipe_empty(pipe->head, pipe->tail)) {
1473                 if (signal_pending(current)) {
1474                         ret = -ERESTARTSYS;
1475                         break;
1476                 }
1477                 if (!pipe->writers)
1478                         break;
1479                 if (flags & SPLICE_F_NONBLOCK) {
1480                         ret = -EAGAIN;
1481                         break;
1482                 }
1483                 pipe_wait_readable(pipe);
1484         }
1485
1486         pipe_unlock(pipe);
1487         return ret;
1488 }
1489
1490 /*
1491  * Make sure there's writeable room. Wait for room if we can, otherwise
1492  * return an appropriate error.
1493  */
1494 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1495 {
1496         int ret;
1497
1498         /*
1499          * Check pipe occupancy without the inode lock first. This function
1500          * is speculative anyways, so missing one is ok.
1501          */
1502         if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1503                 return 0;
1504
1505         ret = 0;
1506         pipe_lock(pipe);
1507
1508         while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1509                 if (!pipe->readers) {
1510                         send_sig(SIGPIPE, current, 0);
1511                         ret = -EPIPE;
1512                         break;
1513                 }
1514                 if (flags & SPLICE_F_NONBLOCK) {
1515                         ret = -EAGAIN;
1516                         break;
1517                 }
1518                 if (signal_pending(current)) {
1519                         ret = -ERESTARTSYS;
1520                         break;
1521                 }
1522                 pipe_wait_writable(pipe);
1523         }
1524
1525         pipe_unlock(pipe);
1526         return ret;
1527 }
1528
1529 /*
1530  * Splice contents of ipipe to opipe.
1531  */
1532 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1533                                struct pipe_inode_info *opipe,
1534                                size_t len, unsigned int flags)
1535 {
1536         struct pipe_buffer *ibuf, *obuf;
1537         unsigned int i_head, o_head;
1538         unsigned int i_tail, o_tail;
1539         unsigned int i_mask, o_mask;
1540         int ret = 0;
1541         bool input_wakeup = false;
1542
1543
1544 retry:
1545         ret = ipipe_prep(ipipe, flags);
1546         if (ret)
1547                 return ret;
1548
1549         ret = opipe_prep(opipe, flags);
1550         if (ret)
1551                 return ret;
1552
1553         /*
1554          * Potential ABBA deadlock, work around it by ordering lock
1555          * grabbing by pipe info address. Otherwise two different processes
1556          * could deadlock (one doing tee from A -> B, the other from B -> A).
1557          */
1558         pipe_double_lock(ipipe, opipe);
1559
1560         i_tail = ipipe->tail;
1561         i_mask = ipipe->ring_size - 1;
1562         o_head = opipe->head;
1563         o_mask = opipe->ring_size - 1;
1564
1565         do {
1566                 size_t o_len;
1567
1568                 if (!opipe->readers) {
1569                         send_sig(SIGPIPE, current, 0);
1570                         if (!ret)
1571                                 ret = -EPIPE;
1572                         break;
1573                 }
1574
1575                 i_head = ipipe->head;
1576                 o_tail = opipe->tail;
1577
1578                 if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1579                         break;
1580
1581                 /*
1582                  * Cannot make any progress, because either the input
1583                  * pipe is empty or the output pipe is full.
1584                  */
1585                 if (pipe_empty(i_head, i_tail) ||
1586                     pipe_full(o_head, o_tail, opipe->max_usage)) {
1587                         /* Already processed some buffers, break */
1588                         if (ret)
1589                                 break;
1590
1591                         if (flags & SPLICE_F_NONBLOCK) {
1592                                 ret = -EAGAIN;
1593                                 break;
1594                         }
1595
1596                         /*
1597                          * We raced with another reader/writer and haven't
1598                          * managed to process any buffers.  A zero return
1599                          * value means EOF, so retry instead.
1600                          */
1601                         pipe_unlock(ipipe);
1602                         pipe_unlock(opipe);
1603                         goto retry;
1604                 }
1605
1606                 ibuf = &ipipe->bufs[i_tail & i_mask];
1607                 obuf = &opipe->bufs[o_head & o_mask];
1608
1609                 if (len >= ibuf->len) {
1610                         /*
1611                          * Simply move the whole buffer from ipipe to opipe
1612                          */
1613                         *obuf = *ibuf;
1614                         ibuf->ops = NULL;
1615                         i_tail++;
1616                         ipipe->tail = i_tail;
1617                         input_wakeup = true;
1618                         o_len = obuf->len;
1619                         o_head++;
1620                         opipe->head = o_head;
1621                 } else {
1622                         /*
1623                          * Get a reference to this pipe buffer,
1624                          * so we can copy the contents over.
1625                          */
1626                         if (!pipe_buf_get(ipipe, ibuf)) {
1627                                 if (ret == 0)
1628                                         ret = -EFAULT;
1629                                 break;
1630                         }
1631                         *obuf = *ibuf;
1632
1633                         /*
1634                          * Don't inherit the gift and merge flags, we need to
1635                          * prevent multiple steals of this page.
1636                          */
1637                         obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1638                         obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1639
1640                         obuf->len = len;
1641                         ibuf->offset += len;
1642                         ibuf->len -= len;
1643                         o_len = len;
1644                         o_head++;
1645                         opipe->head = o_head;
1646                 }
1647                 ret += o_len;
1648                 len -= o_len;
1649         } while (len);
1650
1651         pipe_unlock(ipipe);
1652         pipe_unlock(opipe);
1653
1654         /*
1655          * If we put data in the output pipe, wakeup any potential readers.
1656          */
1657         if (ret > 0)
1658                 wakeup_pipe_readers(opipe);
1659
1660         if (input_wakeup)
1661                 wakeup_pipe_writers(ipipe);
1662
1663         return ret;
1664 }
1665
1666 /*
1667  * Link contents of ipipe to opipe.
1668  */
1669 static int link_pipe(struct pipe_inode_info *ipipe,
1670                      struct pipe_inode_info *opipe,
1671                      size_t len, unsigned int flags)
1672 {
1673         struct pipe_buffer *ibuf, *obuf;
1674         unsigned int i_head, o_head;
1675         unsigned int i_tail, o_tail;
1676         unsigned int i_mask, o_mask;
1677         int ret = 0;
1678
1679         /*
1680          * Potential ABBA deadlock, work around it by ordering lock
1681          * grabbing by pipe info address. Otherwise two different processes
1682          * could deadlock (one doing tee from A -> B, the other from B -> A).
1683          */
1684         pipe_double_lock(ipipe, opipe);
1685
1686         i_tail = ipipe->tail;
1687         i_mask = ipipe->ring_size - 1;
1688         o_head = opipe->head;
1689         o_mask = opipe->ring_size - 1;
1690
1691         do {
1692                 if (!opipe->readers) {
1693                         send_sig(SIGPIPE, current, 0);
1694                         if (!ret)
1695                                 ret = -EPIPE;
1696                         break;
1697                 }
1698
1699                 i_head = ipipe->head;
1700                 o_tail = opipe->tail;
1701
1702                 /*
1703                  * If we have iterated all input buffers or run out of
1704                  * output room, break.
1705                  */
1706                 if (pipe_empty(i_head, i_tail) ||
1707                     pipe_full(o_head, o_tail, opipe->max_usage))
1708                         break;
1709
1710                 ibuf = &ipipe->bufs[i_tail & i_mask];
1711                 obuf = &opipe->bufs[o_head & o_mask];
1712
1713                 /*
1714                  * Get a reference to this pipe buffer,
1715                  * so we can copy the contents over.
1716                  */
1717                 if (!pipe_buf_get(ipipe, ibuf)) {
1718                         if (ret == 0)
1719                                 ret = -EFAULT;
1720                         break;
1721                 }
1722
1723                 *obuf = *ibuf;
1724
1725                 /*
1726                  * Don't inherit the gift and merge flag, we need to prevent
1727                  * multiple steals of this page.
1728                  */
1729                 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1730                 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1731
1732                 if (obuf->len > len)
1733                         obuf->len = len;
1734                 ret += obuf->len;
1735                 len -= obuf->len;
1736
1737                 o_head++;
1738                 opipe->head = o_head;
1739                 i_tail++;
1740         } while (len);
1741
1742         pipe_unlock(ipipe);
1743         pipe_unlock(opipe);
1744
1745         /*
1746          * If we put data in the output pipe, wakeup any potential readers.
1747          */
1748         if (ret > 0)
1749                 wakeup_pipe_readers(opipe);
1750
1751         return ret;
1752 }
1753
1754 /*
1755  * This is a tee(1) implementation that works on pipes. It doesn't copy
1756  * any data, it simply references the 'in' pages on the 'out' pipe.
1757  * The 'flags' used are the SPLICE_F_* variants, currently the only
1758  * applicable one is SPLICE_F_NONBLOCK.
1759  */
1760 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1761 {
1762         struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1763         struct pipe_inode_info *opipe = get_pipe_info(out, true);
1764         int ret = -EINVAL;
1765
1766         if (unlikely(!(in->f_mode & FMODE_READ) ||
1767                      !(out->f_mode & FMODE_WRITE)))
1768                 return -EBADF;
1769
1770         /*
1771          * Duplicate the contents of ipipe to opipe without actually
1772          * copying the data.
1773          */
1774         if (ipipe && opipe && ipipe != opipe) {
1775                 if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1776                         flags |= SPLICE_F_NONBLOCK;
1777
1778                 /*
1779                  * Keep going, unless we encounter an error. The ipipe/opipe
1780                  * ordering doesn't really matter.
1781                  */
1782                 ret = ipipe_prep(ipipe, flags);
1783                 if (!ret) {
1784                         ret = opipe_prep(opipe, flags);
1785                         if (!ret)
1786                                 ret = link_pipe(ipipe, opipe, len, flags);
1787                 }
1788         }
1789
1790         return ret;
1791 }
1792
1793 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1794 {
1795         struct fd in, out;
1796         int error;
1797
1798         if (unlikely(flags & ~SPLICE_F_ALL))
1799                 return -EINVAL;
1800
1801         if (unlikely(!len))
1802                 return 0;
1803
1804         error = -EBADF;
1805         in = fdget(fdin);
1806         if (in.file) {
1807                 out = fdget(fdout);
1808                 if (out.file) {
1809                         error = do_tee(in.file, out.file, len, flags);
1810                         fdput(out);
1811                 }
1812                 fdput(in);
1813         }
1814
1815         return error;
1816 }