lib/iov_iter.c
// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers ubuf and kbuf alike */
#define iterate_buf(i, n, base, len, off, __p, STEP) {          \
        size_t __maybe_unused off = 0;                          \
        len = n;                                                \
        base = __p + i->iov_offset;                             \
        len -= (STEP);                                          \
        i->iov_offset += len;                                   \
        n = len;                                                \
}

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, base, len, off, __p, STEP) {        \
        size_t off = 0;                                         \
        size_t skip = i->iov_offset;                            \
        do {                                                    \
                len = min(n, __p->iov_len - skip);              \
                if (likely(len)) {                              \
                        base = __p->iov_base + skip;            \
                        len -= (STEP);                          \
                        off += len;                             \
                        skip += len;                            \
                        n -= len;                               \
                        if (skip < __p->iov_len)                \
                                break;                          \
                }                                               \
                __p++;                                          \
                skip = 0;                                       \
        } while (n);                                            \
        i->iov_offset = skip;                                   \
        n = off;                                                \
}

#define iterate_bvec(i, n, base, len, off, p, STEP) {           \
        size_t off = 0;                                         \
        unsigned skip = i->iov_offset;                          \
        while (n) {                                             \
                unsigned offset = p->bv_offset + skip;          \
                unsigned left;                                  \
                void *kaddr = kmap_local_page(p->bv_page +      \
                                        offset / PAGE_SIZE);    \
                base = kaddr + offset % PAGE_SIZE;              \
                len = min(min(n, (size_t)(p->bv_len - skip)),   \
                     (size_t)(PAGE_SIZE - offset % PAGE_SIZE)); \
                left = (STEP);                                  \
                kunmap_local(kaddr);                            \
                len -= left;                                    \
                off += len;                                     \
                skip += len;                                    \
                if (skip == p->bv_len) {                        \
                        skip = 0;                               \
                        p++;                                    \
                }                                               \
                n -= len;                                       \
                if (left)                                       \
                        break;                                  \
        }                                                       \
        i->iov_offset = skip;                                   \
        n = off;                                                \
}

#define iterate_xarray(i, n, base, len, __off, STEP) {          \
        __label__ __out;                                        \
        size_t __off = 0;                                       \
        struct folio *folio;                                    \
        loff_t start = i->xarray_start + i->iov_offset;         \
        pgoff_t index = start / PAGE_SIZE;                      \
        XA_STATE(xas, i->xarray, index);                        \
                                                                \
        len = PAGE_SIZE - offset_in_page(start);                \
        rcu_read_lock();                                        \
        xas_for_each(&xas, folio, ULONG_MAX) {                  \
                unsigned left;                                  \
                size_t offset;                                  \
                if (xas_retry(&xas, folio))                     \
                        continue;                               \
                if (WARN_ON(xa_is_value(folio)))                \
                        break;                                  \
                if (WARN_ON(folio_test_hugetlb(folio)))         \
                        break;                                  \
                offset = offset_in_folio(folio, start + __off); \
                while (offset < folio_size(folio)) {            \
                        base = kmap_local_folio(folio, offset); \
                        len = min(n, len);                      \
                        left = (STEP);                          \
                        kunmap_local(base);                     \
                        len -= left;                            \
                        __off += len;                           \
                        n -= len;                               \
                        if (left || n == 0)                     \
                                goto __out;                     \
                        offset += len;                          \
                        len = PAGE_SIZE;                        \
                }                                               \
        }                                                       \
__out:                                                          \
        rcu_read_unlock();                                      \
        i->iov_offset += __off;                                 \
        n = __off;                                              \
}

#define __iterate_and_advance(i, n, base, len, off, I, K) {     \
        if (unlikely(i->count < n))                             \
                n = i->count;                                   \
        if (likely(n)) {                                        \
                if (likely(iter_is_ubuf(i))) {                  \
                        void __user *base;                      \
                        size_t len;                             \
                        iterate_buf(i, n, base, len, off,       \
                                                i->ubuf, (I))   \
                } else if (likely(iter_is_iovec(i))) {          \
                        const struct iovec *iov = iter_iov(i);  \
                        void __user *base;                      \
                        size_t len;                             \
                        iterate_iovec(i, n, base, len, off,     \
                                                iov, (I))       \
                        i->nr_segs -= iov - iter_iov(i);        \
                        i->__iov = iov;                         \
                } else if (iov_iter_is_bvec(i)) {               \
                        const struct bio_vec *bvec = i->bvec;   \
                        void *base;                             \
                        size_t len;                             \
                        iterate_bvec(i, n, base, len, off,      \
                                                bvec, (K))      \
                        i->nr_segs -= bvec - i->bvec;           \
                        i->bvec = bvec;                         \
                } else if (iov_iter_is_kvec(i)) {               \
                        const struct kvec *kvec = i->kvec;      \
                        void *base;                             \
                        size_t len;                             \
                        iterate_iovec(i, n, base, len, off,     \
                                                kvec, (K))      \
                        i->nr_segs -= kvec - i->kvec;           \
                        i->kvec = kvec;                         \
                } else if (iov_iter_is_xarray(i)) {             \
                        void *base;                             \
                        size_t len;                             \
                        iterate_xarray(i, n, base, len, off,    \
                                                        (K))    \
                }                                               \
                i->count -= n;                                  \
        }                                                       \
}
#define iterate_and_advance(i, n, base, len, off, I, K) \
        __iterate_and_advance(i, n, base, len, off, I, ((void)(K),0))

static int copyout(void __user *to, const void *from, size_t n)
{
        if (should_fail_usercopy())
                return n;
        if (access_ok(to, n)) {
                instrument_copy_to_user(to, from, n);
                n = raw_copy_to_user(to, from, n);
        }
        return n;
}

static int copyout_nofault(void __user *to, const void *from, size_t n)
{
        long res;

        if (should_fail_usercopy())
                return n;

        res = copy_to_user_nofault(to, from, n);

        return res < 0 ? n : res;
}

static int copyin(void *to, const void __user *from, size_t n)
{
        size_t res = n;

        if (should_fail_usercopy())
                return n;
        if (access_ok(from, n)) {
                instrument_copy_from_user_before(to, from, n);
                res = raw_copy_from_user(to, from, n);
                instrument_copy_from_user_after(to, from, n, res);
        }
        return res;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int p_head = pipe->head;
        unsigned int p_tail = pipe->tail;
        unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
        unsigned int i_head = i->head;
        unsigned int idx;

        if (i->last_offset) {
                struct pipe_buffer *p;
                if (unlikely(p_occupancy == 0))
                        goto Bad;       // pipe must be non-empty
                if (unlikely(i_head != p_head - 1))
                        goto Bad;       // must be at the last buffer...

                p = pipe_buf(pipe, i_head);
                if (unlikely(p->offset + p->len != abs(i->last_offset)))
                        goto Bad;       // ... at the end of segment
        } else {
                if (i_head != p_head)
                        goto Bad;       // must be right after the last buffer
        }
        return true;
Bad:
        printk(KERN_ERR "idx = %d, offset = %d\n", i_head, i->last_offset);
        printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
                        p_head, p_tail, pipe->ring_size);
        for (idx = 0; idx < pipe->ring_size; idx++)
                printk(KERN_ERR "[%p %p %d %d]\n",
                        pipe->bufs[idx].ops,
                        pipe->bufs[idx].page,
                        pipe->bufs[idx].offset,
                        pipe->bufs[idx].len);
        WARN_ON(1);
        return false;
}
#else
#define sanity(i) true
#endif

static struct page *push_anon(struct pipe_inode_info *pipe, unsigned size)
{
        struct page *page = alloc_page(GFP_USER);
        if (page) {
                struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
                *buf = (struct pipe_buffer) {
                        .ops = &default_pipe_buf_ops,
                        .page = page,
                        .offset = 0,
                        .len = size
                };
        }
        return page;
}

static void push_page(struct pipe_inode_info *pipe, struct page *page,
                        unsigned int offset, unsigned int size)
{
        struct pipe_buffer *buf = pipe_buf(pipe, pipe->head++);
        *buf = (struct pipe_buffer) {
                .ops = &page_cache_pipe_buf_ops,
                .page = page,
                .offset = offset,
                .len = size
        };
        get_page(page);
}

static inline int last_offset(const struct pipe_buffer *buf)
{
        if (buf->ops == &default_pipe_buf_ops)
                return buf->len;        // buf->offset is 0 for those
        else
                return -(buf->offset + buf->len);
}

static struct page *append_pipe(struct iov_iter *i, size_t size,
                                unsigned int *off)
{
        struct pipe_inode_info *pipe = i->pipe;
        int offset = i->last_offset;
        struct pipe_buffer *buf;
        struct page *page;

        if (offset > 0 && offset < PAGE_SIZE) {
                // some space in the last buffer; add to it
                buf = pipe_buf(pipe, pipe->head - 1);
                size = min_t(size_t, size, PAGE_SIZE - offset);
                buf->len += size;
                i->last_offset += size;
                i->count -= size;
                *off = offset;
                return buf->page;
        }
        // OK, we need a new buffer
        *off = 0;
        size = min_t(size_t, size, PAGE_SIZE);
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return NULL;
        page = push_anon(pipe, size);
        if (!page)
                return NULL;
        i->head = pipe->head - 1;
        i->last_offset = size;
        i->count -= size;
        return page;
}

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        struct pipe_inode_info *pipe = i->pipe;
        unsigned int head = pipe->head;

        if (unlikely(bytes > i->count))
                bytes = i->count;

        if (unlikely(!bytes))
                return 0;

        if (!sanity(i))
                return 0;

        if (offset && i->last_offset == -offset) { // could we merge it?
                struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
                if (buf->page == page) {
                        buf->len += bytes;
                        i->last_offset -= bytes;
                        i->count -= bytes;
                        return bytes;
                }
        }
        if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
                return 0;

        push_page(pipe, page, offset, bytes);
        i->last_offset = -(offset + bytes);
        i->head = head;
        i->count -= bytes;
        return bytes;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t n = min(size, iov_iter_count(i));
                n -= fault_in_readable(i->ubuf + i->iov_offset, n);
                return size - n;
        } else if (iter_is_iovec(i)) {
                size_t count = min(size, iov_iter_count(i));
                const struct iovec *p;
                size_t skip;

                size -= count;
                for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
                        size_t len = min(count, p->iov_len - skip);
                        size_t ret;

                        if (unlikely(!len))
                                continue;
                        ret = fault_in_readable(p->iov_base + skip, len);
                        count -= len - ret;
                        if (ret)
                                break;
                }
                return count + size;
        }
        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);
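
/*
 * Example (editor's sketch, not part of the original source): buffered-write
 * paths typically prefault the source iterator before taking page locks, so
 * that a later atomic copy cannot deadlock on a page fault.  This mirrors
 * the pattern used in generic_perform_write():
 *
 *      if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
 *              status = -EFAULT;
 *              break;
 *      }
 */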

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t n = min(size, iov_iter_count(i));
                n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n);
                return size - n;
        } else if (iter_is_iovec(i)) {
                size_t count = min(size, iov_iter_count(i));
                const struct iovec *p;
                size_t skip;

                size -= count;
                for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
                        size_t len = min(count, p->iov_len - skip);
                        size_t ret;

                        if (unlikely(!len))
                                continue;
                        ret = fault_in_safe_writeable(p->iov_base + skip, len);
                        count -= len - ret;
                        if (ret)
                                break;
                }
                return count + size;
        }
        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        *i = (struct iov_iter) {
                .iter_type = ITER_IOVEC,
                .nofault = false,
                .user_backed = true,
                .data_source = direction,
                .__iov = iov,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };
}
EXPORT_SYMBOL(iov_iter_init);
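
/*
 * Example (editor's sketch): a read(2)-style path wrapping a single user
 * iovec.  "ubuf", "ulen" and "kbuf" are illustrative names; ITER_DEST is
 * the READ direction.
 *
 *      struct iovec iov = { .iov_base = ubuf, .iov_len = ulen };
 *      struct iov_iter iter;
 *
 *      iov_iter_init(&iter, ITER_DEST, &iov, 1, ulen);
 *      if (copy_to_iter(kbuf, ulen, &iter) != ulen)
 *              return -EFAULT;
 */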

// returns the offset in the partial buffer (if any)
static inline unsigned int pipe_npages(const struct iov_iter *i, int *npages)
{
        struct pipe_inode_info *pipe = i->pipe;
        int used = pipe->head - pipe->tail;
        int off = i->last_offset;

        *npages = max((int)pipe->max_usage - used, 0);

        if (off > 0 && off < PAGE_SIZE) { // anon and not full
                (*npages)++;
                return off;
        }
        return 0;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
                                struct iov_iter *i)
{
        unsigned int off, chunk;

        if (unlikely(bytes > i->count))
                bytes = i->count;
        if (unlikely(!bytes))
                return 0;

        if (!sanity(i))
                return 0;

        for (size_t n = bytes; n; n -= chunk) {
                struct page *page = append_pipe(i, n, &off);
                chunk = min_t(size_t, n, PAGE_SIZE - off);
                if (!page)
                        return bytes - n;
                memcpy_to_page(page, off, addr, chunk);
                addr += chunk;
        }
        return bytes;
}

static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
                              __wsum sum, size_t off)
{
        __wsum next = csum_partial_copy_nocheck(from, to, len);
        return csum_block_add(sum, next, off);
}

static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
                                         struct iov_iter *i, __wsum *sump)
{
        __wsum sum = *sump;
        size_t off = 0;
        unsigned int chunk, r;

        if (unlikely(bytes > i->count))
                bytes = i->count;
        if (unlikely(!bytes))
                return 0;

        if (!sanity(i))
                return 0;

        while (bytes) {
                struct page *page = append_pipe(i, bytes, &r);
                char *p;

                if (!page)
                        break;
                chunk = min_t(size_t, bytes, PAGE_SIZE - r);
                p = kmap_local_page(page);
                sum = csum_and_memcpy(p + r, addr + off, chunk, sum, off);
                kunmap_local(p);
                off += chunk;
                bytes -= chunk;
        }
        *sump = sum;
        return off;
}

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_pipe_to_iter(addr, bytes, i);
        if (user_backed_iter(i))
                might_fault();
        iterate_and_advance(i, bytes, base, len, off,
                copyout(base, addr + off, len),
                memcpy(base, addr + off, len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_to_iter);
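
/*
 * Usage note (editor's addition): _copy_to_iter() is normally reached via
 * the copy_to_iter() wrapper in <linux/uio.h> and may return fewer bytes
 * than requested (e.g. on a user-page fault), so callers must handle a
 * short copy rather than assume the full length, for instance:
 *
 *      size_t copied = copy_to_iter(src, len, &iter);
 *
 *      if (copied != len)
 *              return copied ? copied : -EFAULT;
 */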

#ifdef CONFIG_ARCH_HAS_COPY_MC
static int copyout_mc(void __user *to, const void *from, size_t n)
{
        if (access_ok(to, n)) {
                instrument_copy_to_user(to, from, n);
                n = copy_mc_to_user((__force void *) to, from, n);
        }
        return n;
}

static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
                                struct iov_iter *i)
{
        size_t xfer = 0;
        unsigned int off, chunk;

        if (unlikely(bytes > i->count))
                bytes = i->count;
        if (unlikely(!bytes))
                return 0;

        if (!sanity(i))
                return 0;

        while (bytes) {
                struct page *page = append_pipe(i, bytes, &off);
                unsigned long rem;
                char *p;

                if (!page)
                        break;
                chunk = min_t(size_t, bytes, PAGE_SIZE - off);
                p = kmap_local_page(page);
                rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
                chunk -= rem;
                kunmap_local(p);
                xfer += chunk;
                bytes -= chunk;
                if (rem) {
                        iov_iter_revert(i, rem);
                        break;
                }
        }
        return xfer;
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC, read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_mc_pipe_to_iter(addr, bytes, i);
        if (user_backed_iter(i))
                might_fault();
        __iterate_and_advance(i, bytes, base, len, off,
                copyout_mc(base, addr + off, len),
                copy_mc_to_kernel(base, addr + off, len)
        )

        return bytes;
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();
        iterate_and_advance(i, bytes, base, len, off,
                copyin(addr + off, base, len),
                memcpy(addr + off, base, len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_from_iter);
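
/*
 * Example (editor's sketch): the mirror-image write(2) path, draining a
 * user-backed source iterator into a kernel buffer.  ITER_SOURCE is the
 * WRITE direction; names are illustrative.
 *
 *      struct iovec iov = { .iov_base = ubuf, .iov_len = ulen };
 *      struct iov_iter iter;
 *
 *      iov_iter_init(&iter, ITER_SOURCE, &iov, 1, ulen);
 *      if (copy_from_iter(kbuf, ulen, &iter) != ulen)
 *              return -EFAULT;
 */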

size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        iterate_and_advance(i, bytes, base, len, off,
                __copy_from_user_inatomic_nocache(addr + off, base, len),
                memcpy(addr + off, base, len)
        )

        return bytes;
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It differs from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed
 * for all iterator types, whereas _copy_from_iter_nocache() only attempts
 * to bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        iterate_and_advance(i, bytes, base, len, off,
                __copy_from_user_flushcache(addr + off, base, len),
                memcpy_flushcache(addr + off, base, len)
        )

        return bytes;
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        struct page *head;
        size_t v = n + offset;

        /*
         * The general case needs to access the page order in order
         * to compute the page size.
         * However, we mostly deal with order-0 pages and thus can
         * avoid a possible cache line miss for requests that fit all
         * page orders.
         */
        if (n <= v && v <= PAGE_SIZE)
                return true;

        head = compound_head(page);
        v += (page - head) << PAGE_SHIFT;

        if (WARN_ON(n > v || v > page_size(head)))
                return false;
        return true;
}

size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t res = 0;
        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_page_to_iter_pipe(page, offset, bytes, i);
        page += offset / PAGE_SIZE; // first subpage
        offset %= PAGE_SIZE;
        while (1) {
                void *kaddr = kmap_local_page(page);
                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
                n = _copy_to_iter(kaddr + offset, n, i);
                kunmap_local(kaddr);
                res += n;
                bytes -= n;
                if (!bytes || !n)
                        break;
                offset += n;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return res;
}
EXPORT_SYMBOL(copy_page_to_iter);
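
/*
 * Example (editor's sketch): a ->read_iter()-style loop feeding page-cache
 * pages into the destination iterator; the helper handles highmem mapping
 * and compound pages internally.  "pos", "count" and "to" are illustrative.
 *
 *      size_t n = copy_page_to_iter(page, offset_in_page(pos),
 *                                   min_t(size_t, count,
 *                                         PAGE_SIZE - offset_in_page(pos)),
 *                                   to);
 *      if (!n && iov_iter_count(to))
 *              return -EFAULT; // nothing copied but space remained
 */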

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
                                 struct iov_iter *i)
{
        size_t res = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;
        if (unlikely(iov_iter_is_pipe(i)))
                return copy_page_to_iter_pipe(page, offset, bytes, i);
        page += offset / PAGE_SIZE; // first subpage
        offset %= PAGE_SIZE;
        while (1) {
                void *kaddr = kmap_local_page(page);
                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);

                iterate_and_advance(i, n, base, len, off,
                        copyout_nofault(base, kaddr + offset + off, len),
                        memcpy(base, kaddr + offset + off, len)
                )
                kunmap_local(kaddr);
                res += n;
                bytes -= n;
                if (!bytes || !n)
                        break;
                offset += n;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return res;
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t res = 0;
        if (!page_copy_sane(page, offset, bytes))
                return 0;
        page += offset / PAGE_SIZE; // first subpage
        offset %= PAGE_SIZE;
        while (1) {
                void *kaddr = kmap_local_page(page);
                size_t n = min(bytes, (size_t)PAGE_SIZE - offset);
                n = _copy_from_iter(kaddr + offset, n, i);
                kunmap_local(kaddr);
                res += n;
                bytes -= n;
                if (!bytes || !n)
                        break;
                offset += n;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return res;
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
        unsigned int chunk, off;

        if (unlikely(bytes > i->count))
                bytes = i->count;
        if (unlikely(!bytes))
                return 0;

        if (!sanity(i))
                return 0;

        for (size_t n = bytes; n; n -= chunk) {
                struct page *page = append_pipe(i, n, &off);
                char *p;

                if (!page)
                        return bytes - n;
                chunk = min_t(size_t, n, PAGE_SIZE - off);
                p = kmap_local_page(page);
                memset(p + off, 0, chunk);
                kunmap_local(p);
        }
        return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        if (unlikely(iov_iter_is_pipe(i)))
                return pipe_zero(bytes, i);
        iterate_and_advance(i, bytes, base, len, count,
                clear_user(base, len),
                memset(base, 0, len)
        )

        return bytes;
}
EXPORT_SYMBOL(iov_iter_zero);
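
/*
 * Example (editor's sketch): read paths can use iov_iter_zero() to satisfy
 * reads of file holes without allocating or mapping any pages:
 *
 *      if (range_is_hole)
 *              copied = iov_iter_zero(min_t(size_t, hole_len,
 *                                           iov_iter_count(to)), to);
 */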

size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
                                  struct iov_iter *i)
{
        char *kaddr = kmap_atomic(page), *p = kaddr + offset;
        if (!page_copy_sane(page, offset, bytes)) {
                kunmap_atomic(kaddr);
                return 0;
        }
        if (WARN_ON_ONCE(!i->data_source)) {
                kunmap_atomic(kaddr);
                return 0;
        }
        iterate_and_advance(i, bytes, base, len, off,
                copyin(p + off, base, len),
                memcpy(p + off, base, len)
        )
        kunmap_atomic(kaddr);
        return bytes;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

static void pipe_advance(struct iov_iter *i, size_t size)
{
        struct pipe_inode_info *pipe = i->pipe;
        int off = i->last_offset;

        if (!off && !size) {
                pipe_discard_from(pipe, i->start_head); // discard everything
                return;
        }
        i->count -= size;
        while (1) {
                struct pipe_buffer *buf = pipe_buf(pipe, i->head);
                if (off) /* make it relative to the beginning of buffer */
                        size += abs(off) - buf->offset;
                if (size <= buf->len) {
                        buf->len = size;
                        i->last_offset = last_offset(buf);
                        break;
                }
                size -= buf->len;
                i->head++;
                off = 0;
        }
        pipe_discard_from(pipe, i->head + 1); // discard everything past this one
}

static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
        const struct bio_vec *bvec, *end;

        if (!i->count)
                return;
        i->count -= size;

        size += i->iov_offset;

        for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
                if (likely(size < bvec->bv_len))
                        break;
                size -= bvec->bv_len;
        }
        i->iov_offset = size;
        i->nr_segs -= bvec - i->bvec;
        i->bvec = bvec;
}

static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
        const struct iovec *iov, *end;

        if (!i->count)
                return;
        i->count -= size;

        size += i->iov_offset; // from beginning of current segment
        for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
                if (likely(size < iov->iov_len))
                        break;
                size -= iov->iov_len;
        }
        i->iov_offset = size;
        i->nr_segs -= iov - iter_iov(i);
        i->__iov = iov;
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
        if (unlikely(i->count < size))
                size = i->count;
        if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
                i->iov_offset += size;
                i->count -= size;
        } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* iovec and kvec have identical layouts */
                iov_iter_iovec_advance(i, size);
        } else if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
        } else if (iov_iter_is_pipe(i)) {
                pipe_advance(i, size);
        } else if (iov_iter_is_discard(i)) {
                i->count -= size;
        }
}
EXPORT_SYMBOL(iov_iter_advance);
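
/*
 * Example (editor's sketch): skipping over an already-consumed prefix,
 * e.g. a protocol header, before copying the payload.  "struct my_hdr",
 * "payload" and "payload_len" are illustrative.
 *
 *      iov_iter_advance(&iter, sizeof(struct my_hdr));
 *      copied = copy_from_iter(payload, payload_len, &iter);
 */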

void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;
        i->count += unroll;
        if (unlikely(iov_iter_is_pipe(i))) {
                struct pipe_inode_info *pipe = i->pipe;
                unsigned int head = pipe->head;

                while (head > i->start_head) {
                        struct pipe_buffer *b = pipe_buf(pipe, --head);
                        if (unroll < b->len) {
                                b->len -= unroll;
                                i->last_offset = last_offset(b);
                                i->head = head;
                                return;
                        }
                        unroll -= b->len;
                        pipe_buf_release(pipe, b);
                        pipe->head--;
                }
                i->last_offset = 0;
                i->head = head;
                return;
        }
        if (unlikely(iov_iter_is_discard(i)))
                return;
        if (unroll <= i->iov_offset) {
                i->iov_offset -= unroll;
                return;
        }
        unroll -= i->iov_offset;
        if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                BUG(); /* We should never go beyond the start of the specified
                        * range since we might then be straying into pages that
                        * aren't pinned.
                        */
        } else if (iov_iter_is_bvec(i)) {
                const struct bio_vec *bvec = i->bvec;
                while (1) {
                        size_t n = (--bvec)->bv_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->bvec = bvec;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        } else { /* same logic for iovec and kvec */
                const struct iovec *iov = iter_iov(i);
                while (1) {
                        size_t n = (--iov)->iov_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->__iov = iov;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);
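
/*
 * Example (editor's sketch): undoing iterator consumption after a failed
 * operation so the caller can retry from the original position:
 *
 *      size_t copied = copy_from_iter(buf, len, &iter);
 *
 *      if (submit_failed) {
 *              iov_iter_revert(&iter, copied);
 *              goto retry;
 *      }
 */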

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        if (i->nr_segs > 1) {
                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                        return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
                if (iov_iter_is_bvec(i))
                        return min(i->count, i->bvec->bv_len - i->iov_offset);
        }
        return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        *i = (struct iov_iter){
                .iter_type = ITER_KVEC,
                .data_source = direction,
                .kvec = kvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };
}
EXPORT_SYMBOL(iov_iter_kvec);
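
/*
 * Example (editor's sketch): wrapping a kernel buffer so it can be handed
 * to interfaces that take an iov_iter, e.g. a ->read_iter() invoked from
 * kernel context.  "kbuf" and "klen" are illustrative.
 *
 *      struct kvec kv = { .iov_base = kbuf, .iov_len = klen };
 *      struct iov_iter iter;
 *
 *      iov_iter_kvec(&iter, ITER_DEST, &kv, 1, klen);
 */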

void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        WARN_ON(direction & ~(READ | WRITE));
        *i = (struct iov_iter){
                .iter_type = ITER_BVEC,
                .data_source = direction,
                .bvec = bvec,
                .nr_segs = nr_segs,
                .iov_offset = 0,
                .count = count
        };
}
EXPORT_SYMBOL(iov_iter_bvec);
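
/*
 * Example (editor's sketch): describing a page fragment as the data source,
 * as network and block callers commonly do.  bvec_set_page() is the helper
 * from <linux/bvec.h>; its availability in a given tree is an assumption
 * here.
 *
 *      struct bio_vec bv;
 *      struct iov_iter iter;
 *
 *      bvec_set_page(&bv, page, len, offset);
 *      iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, len);
 */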

void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
                        struct pipe_inode_info *pipe,
                        size_t count)
{
        BUG_ON(direction != READ);
        WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
        *i = (struct iov_iter){
                .iter_type = ITER_PIPE,
                .data_source = false,
                .pipe = pipe,
                .head = pipe->head,
                .start_head = pipe->head,
                .last_offset = 0,
                .count = count
        };
}
EXPORT_SYMBOL(iov_iter_pipe);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The caller *must* prevent the
 * pages from being evicted, either by taking a ref on them or by locking
 * them.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
                     struct xarray *xarray, loff_t start, size_t count)
{
        BUG_ON(direction & ~1);
        *i = (struct iov_iter) {
                .iter_type = ITER_XARRAY,
                .data_source = direction,
                .xarray = xarray,
                .xarray_start = start,
                .count = count,
                .iov_offset = 0
        };
}
EXPORT_SYMBOL(iov_iter_xarray);
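
/*
 * Example (editor's sketch): wrapping a range of an inode's page cache, as
 * network filesystems do when reading into the cache; the caller must hold
 * refs on (or locks over) the pages for the duration.  "pos" and "len" are
 * illustrative.
 *
 *      iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, len);
 */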

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        BUG_ON(direction != READ);
        *i = (struct iov_iter){
                .iter_type = ITER_DISCARD,
                .data_source = false,
                .count = count,
                .iov_offset = 0
        };
}
EXPORT_SYMBOL(iov_iter_discard);
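
/*
 * Example (editor's sketch): draining data nobody wants to keep.  Copies
 * into a discard iterator succeed and advance the count without storing
 * anything:
 *
 *      struct iov_iter iter;
 *
 *      iov_iter_discard(&iter, ITER_DEST, n);
 *      copy_to_iter(buf, n, &iter);    // n bytes consumed, nothing stored
 */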

static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
                                   unsigned len_mask)
{
        size_t size = i->count;
        size_t skip = i->iov_offset;
        unsigned k;

        for (k = 0; k < i->nr_segs; k++, skip = 0) {
                const struct iovec *iov = iter_iov(i) + k;
                size_t len = iov->iov_len - skip;

                if (len > size)
                        len = size;
                if (len & len_mask)
                        return false;
                if ((unsigned long)(iov->iov_base + skip) & addr_mask)
                        return false;

                size -= len;
                if (!size)
                        break;
        }
        return true;
}

static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
                                  unsigned len_mask)
{
        size_t size = i->count;
        unsigned skip = i->iov_offset;
        unsigned k;

        for (k = 0; k < i->nr_segs; k++, skip = 0) {
                size_t len = i->bvec[k].bv_len - skip;

                if (len > size)
                        len = size;
                if (len & len_mask)
                        return false;
                if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask)
                        return false;

                size -= len;
                if (!size)
                        break;
        }
        return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segment
 *      are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                         unsigned len_mask)
{
        if (likely(iter_is_ubuf(i))) {
                if (i->count & len_mask)
                        return false;
                if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask)
                        return false;
                return true;
        }

        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_aligned_iovec(i, addr_mask, len_mask);

        if (iov_iter_is_bvec(i))
                return iov_iter_aligned_bvec(i, addr_mask, len_mask);

        if (iov_iter_is_pipe(i)) {
                size_t size = i->count;

                if (size & len_mask)
                        return false;
                if (size && i->last_offset > 0) {
                        if (i->last_offset & addr_mask)
                                return false;
                }

                return true;
        }

        if (iov_iter_is_xarray(i)) {
                if (i->count & len_mask)
                        return false;
                if ((i->xarray_start + i->iov_offset) & addr_mask)
                        return false;
        }

        return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
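
/*
 * Example (editor's sketch): direct-I/O style validation that every segment
 * is block-aligned in both address and length; "bsize" would typically be
 * the logical block size of the target device:
 *
 *      if (!iov_iter_is_aligned(iter, bsize - 1, bsize - 1))
 *              return -EINVAL;
 */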
1237
1238 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1239 {
1240         unsigned long res = 0;
1241         size_t size = i->count;
1242         size_t skip = i->iov_offset;
1243         unsigned k;
1244
1245         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1246                 const struct iovec *iov = iter_iov(i) + k;
1247                 size_t len = iov->iov_len - skip;
1248                 if (len) {
1249                         res |= (unsigned long)iov->iov_base + skip;
1250                         if (len > size)
1251                                 len = size;
1252                         res |= len;
1253                         size -= len;
1254                         if (!size)
1255                                 break;
1256                 }
1257         }
1258         return res;
1259 }
1260
1261 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1262 {
1263         unsigned res = 0;
1264         size_t size = i->count;
1265         unsigned skip = i->iov_offset;
1266         unsigned k;
1267
1268         for (k = 0; k < i->nr_segs; k++, skip = 0) {
1269                 size_t len = i->bvec[k].bv_len - skip;
1270                 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1271                 if (len > size)
1272                         len = size;
1273                 res |= len;
1274                 size -= len;
1275                 if (!size)
1276                         break;
1277         }
1278         return res;
1279 }
1280
1281 unsigned long iov_iter_alignment(const struct iov_iter *i)
1282 {
1283         if (likely(iter_is_ubuf(i))) {
1284                 size_t size = i->count;
1285                 if (size)
1286                         return ((unsigned long)i->ubuf + i->iov_offset) | size;
1287                 return 0;
1288         }
1289
1290         /* iovec and kvec have identical layouts */
1291         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1292                 return iov_iter_alignment_iovec(i);
1293
1294         if (iov_iter_is_bvec(i))
1295                 return iov_iter_alignment_bvec(i);
1296
1297         if (iov_iter_is_pipe(i)) {
1298                 size_t size = i->count;
1299
1300                 if (size && i->last_offset > 0)
1301                         return size | i->last_offset;
1302                 return size;
1303         }
1304
1305         if (iov_iter_is_xarray(i))
1306                 return (i->xarray_start + i->iov_offset) | i->count;
1307
1308         return 0;
1309 }
1310 EXPORT_SYMBOL(iov_iter_alignment);
1311
1312 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1313 {
1314         unsigned long res = 0;
1315         unsigned long v = 0;
1316         size_t size = i->count;
1317         unsigned k;
1318
1319         if (iter_is_ubuf(i))
1320                 return 0;
1321
1322         if (WARN_ON(!iter_is_iovec(i)))
1323                 return ~0U;
1324
1325         for (k = 0; k < i->nr_segs; k++) {
1326                 const struct iovec *iov = iter_iov(i) + k;
1327                 if (iov->iov_len) {
1328                         unsigned long base = (unsigned long)iov->iov_base;
1329                         if (v) // if not the first one
1330                                 res |= base | v; // this start | previous end
1331                         v = base + iov->iov_len;
1332                         if (size <= iov->iov_len)
1333                                 break;
1334                         size -= iov->iov_len;
1335                 }
1336         }
1337         return res;
1338 }
1339 EXPORT_SYMBOL(iov_iter_gap_alignment);
1340
1341 static int want_pages_array(struct page ***res, size_t size,
1342                             size_t start, unsigned int maxpages)
1343 {
1344         unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE);
1345
1346         if (count > maxpages)
1347                 count = maxpages;
1348         WARN_ON(!count);        // caller should've prevented that
1349         if (!*res) {
1350                 *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
1351                 if (!*res)
1352                         return 0;
1353         }
1354         return count;
1355 }
1356
1357 static ssize_t pipe_get_pages(struct iov_iter *i,
1358                    struct page ***pages, size_t maxsize, unsigned maxpages,
1359                    size_t *start)
1360 {
1361         unsigned int npages, count, off, chunk;
1362         struct page **p;
1363         size_t left;
1364
1365         if (!sanity(i))
1366                 return -EFAULT;
1367
1368         *start = off = pipe_npages(i, &npages);
1369         if (!npages)
1370                 return -EFAULT;
1371         count = want_pages_array(pages, maxsize, off, min(npages, maxpages));
1372         if (!count)
1373                 return -ENOMEM;
1374         p = *pages;
1375         for (npages = 0, left = maxsize ; npages < count; npages++, left -= chunk) {
1376                 struct page *page = append_pipe(i, left, &off);
1377                 if (!page)
1378                         break;
1379                 chunk = min_t(size_t, left, PAGE_SIZE - off);
1380                 get_page(*p++ = page);
1381         }
1382         if (!npages)
1383                 return -EFAULT;
1384         return maxsize - left;
1385 }
1386
1387 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1388                                           pgoff_t index, unsigned int nr_pages)
1389 {
1390         XA_STATE(xas, xa, index);
1391         struct page *page;
1392         unsigned int ret = 0;
1393
1394         rcu_read_lock();
1395         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1396                 if (xas_retry(&xas, page))
1397                         continue;
1398
1399                 /* Has the page moved or been split? */
1400                 if (unlikely(page != xas_reload(&xas))) {
1401                         xas_reset(&xas);
1402                         continue;
1403                 }
1404
1405                 pages[ret] = find_subpage(page, xas.xa_index);
1406                 get_page(pages[ret]);
1407                 if (++ret == nr_pages)
1408                         break;
1409         }
1410         rcu_read_unlock();
1411         return ret;
1412 }
1413
1414 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1415                                      struct page ***pages, size_t maxsize,
1416                                      unsigned maxpages, size_t *_start_offset)
1417 {
1418         unsigned nr, offset, count;
1419         pgoff_t index;
1420         loff_t pos;
1421
1422         pos = i->xarray_start + i->iov_offset;
1423         index = pos >> PAGE_SHIFT;
1424         offset = pos & ~PAGE_MASK;
1425         *_start_offset = offset;
1426
1427         count = want_pages_array(pages, maxsize, offset, maxpages);
1428         if (!count)
1429                 return -ENOMEM;
1430         nr = iter_xarray_populate_pages(*pages, i->xarray, index, count);
1431         if (nr == 0)
1432                 return 0;
1433
1434         maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
1435         i->iov_offset += maxsize;
1436         i->count -= maxsize;
1437         return maxsize;
1438 }
1439
1440 /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */
1441 static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
1442 {
1443         size_t skip;
1444         long k;
1445
1446         if (iter_is_ubuf(i))
1447                 return (unsigned long)i->ubuf + i->iov_offset;
1448
1449         for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1450                 const struct iovec *iov = iter_iov(i) + k;
1451                 size_t len = iov->iov_len - skip;
1452
1453                 if (unlikely(!len))
1454                         continue;
1455                 if (*size > len)
1456                         *size = len;
1457                 return (unsigned long)iov->iov_base + skip;
1458         }
1459         BUG(); // if it had been empty, we wouldn't get called
1460 }
1461
1462 /* must be called on a non-empty ITER_BVEC iterator */
1463 static struct page *first_bvec_segment(const struct iov_iter *i,
1464                                        size_t *size, size_t *start)
1465 {
1466         struct page *page;
1467         size_t skip = i->iov_offset, len;
1468
1469         len = i->bvec->bv_len - skip;
1470         if (*size > len)
1471                 *size = len;
1472         skip += i->bvec->bv_offset;
1473         page = i->bvec->bv_page + skip / PAGE_SIZE;
1474         *start = skip % PAGE_SIZE;
1475         return page;
1476 }
1477
1478 static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
1479                    struct page ***pages, size_t maxsize,
1480                    unsigned int maxpages, size_t *start,
1481                    iov_iter_extraction_t extraction_flags)
1482 {
1483         unsigned int n, gup_flags = 0;
1484
1485         if (maxsize > i->count)
1486                 maxsize = i->count;
1487         if (!maxsize)
1488                 return 0;
1489         if (maxsize > MAX_RW_COUNT)
1490                 maxsize = MAX_RW_COUNT;
1491         if (extraction_flags & ITER_ALLOW_P2PDMA)
1492                 gup_flags |= FOLL_PCI_P2PDMA;
1493
1494         if (likely(user_backed_iter(i))) {
1495                 unsigned long addr;
1496                 int res;
1497
1498                 if (iov_iter_rw(i) != WRITE)
1499                         gup_flags |= FOLL_WRITE;
1500                 if (i->nofault)
1501                         gup_flags |= FOLL_NOFAULT;
1502
1503                 addr = first_iovec_segment(i, &maxsize);
1504                 *start = addr % PAGE_SIZE;
1505                 addr &= PAGE_MASK;
1506                 n = want_pages_array(pages, maxsize, *start, maxpages);
1507                 if (!n)
1508                         return -ENOMEM;
1509                 res = get_user_pages_fast(addr, n, gup_flags, *pages);
1510                 if (unlikely(res <= 0))
1511                         return res;
1512                 maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
1513                 iov_iter_advance(i, maxsize);
1514                 return maxsize;
1515         }
1516         if (iov_iter_is_bvec(i)) {
1517                 struct page **p;
1518                 struct page *page;
1519
1520                 page = first_bvec_segment(i, &maxsize, start);
1521                 n = want_pages_array(pages, maxsize, *start, maxpages);
1522                 if (!n)
1523                         return -ENOMEM;
1524                 p = *pages;
1525                 for (int k = 0; k < n; k++)
1526                         get_page(p[k] = page + k);
1527                 maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
1528                 i->count -= maxsize;
1529                 i->iov_offset += maxsize;
1530                 if (i->iov_offset == i->bvec->bv_len) {
1531                         i->iov_offset = 0;
1532                         i->bvec++;
1533                         i->nr_segs--;
1534                 }
1535                 return maxsize;
1536         }
1537         if (iov_iter_is_pipe(i))
1538                 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1539         if (iov_iter_is_xarray(i))
1540                 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1541         return -EFAULT;
1542 }
1543
1544 ssize_t iov_iter_get_pages(struct iov_iter *i,
1545                    struct page **pages, size_t maxsize, unsigned maxpages,
1546                    size_t *start, iov_iter_extraction_t extraction_flags)
1547 {
1548         if (!maxpages)
1549                 return 0;
1550         BUG_ON(!pages);
1551
1552         return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages,
1553                                           start, extraction_flags);
1554 }
1555 EXPORT_SYMBOL_GPL(iov_iter_get_pages);
1556
1557 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
1558                 size_t maxsize, unsigned maxpages, size_t *start)
1559 {
1560         return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0);
1561 }
1562 EXPORT_SYMBOL(iov_iter_get_pages2);
1563
1564 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1565                    struct page ***pages, size_t maxsize,
1566                    size_t *start, iov_iter_extraction_t extraction_flags)
1567 {
1568         ssize_t len;
1569
1570         *pages = NULL;
1571
1572         len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start,
1573                                          extraction_flags);
1574         if (len <= 0) {
1575                 kvfree(*pages);
1576                 *pages = NULL;
1577         }
1578         return len;
1579 }
1580 EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc);
1581
1582 ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
1583                 struct page ***pages, size_t maxsize, size_t *start)
1584 {
1585         return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0);
1586 }
1587 EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
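
/*
 * Example (illustrative sketch, not a caller from this file): a typical
 * user of iov_iter_get_pages_alloc2() consumes the returned pages, drops
 * the page references taken above and frees the kvmalloc()'ed array; the
 * do_io_to_page() helper is hypothetical.  The call may cover fewer bytes
 * than requested, so real callers loop.
 *
 *	struct page **pages;
 *	size_t offset;
 *	ssize_t got = iov_iter_get_pages_alloc2(iter, &pages, SIZE_MAX, &offset);
 *
 *	if (got > 0) {
 *		size_t npages = DIV_ROUND_UP(offset + got, PAGE_SIZE);
 *
 *		for (size_t n = 0; n < npages; n++) {
 *			do_io_to_page(pages[n]);	// hypothetical
 *			put_page(pages[n]);
 *		}
 *		kvfree(pages);
 *	}
 */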
1588
1589 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1590                                struct iov_iter *i)
1591 {
1592         __wsum sum, next;
1593         sum = *csum;
1594         if (WARN_ON_ONCE(!i->data_source))
1595                 return 0;
1596
1597         iterate_and_advance(i, bytes, base, len, off, ({
1598                 next = csum_and_copy_from_user(base, addr + off, len);
1599                 sum = csum_block_add(sum, next, off);
1600                 next ? 0 : len;
1601         }), ({
1602                 sum = csum_and_memcpy(addr + off, base, len, sum, off);
1603         })
1604         )
1605         *csum = sum;
1606         return bytes;
1607 }
1608 EXPORT_SYMBOL(csum_and_copy_from_iter);
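
/*
 * Example (sketch): receive-side checksumming keeps a running __wsum; a
 * short return (copy fault) is detected by comparing against the requested
 * byte count:
 *
 *	__wsum csum = 0;
 *	size_t n = csum_and_copy_from_iter(to, bytes, &csum, iter);
 *
 *	if (n != bytes)
 *		return -EFAULT;
 */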
1609
1610 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1611                              struct iov_iter *i)
1612 {
1613         struct csum_state *csstate = _csstate;
1614         __wsum sum, next;
1615
1616         if (WARN_ON_ONCE(i->data_source))
1617                 return 0;
1618         if (unlikely(iov_iter_is_discard(i))) {
1619                 // can't use csum_and_memcpy() for that one - data is not copied
1620                 csstate->csum = csum_block_add(csstate->csum,
1621                                                csum_partial(addr, bytes, 0),
1622                                                csstate->off);
1623                 csstate->off += bytes;
1624                 return bytes;
1625         }
1626
1627         sum = csum_shift(csstate->csum, csstate->off);
1628         if (unlikely(iov_iter_is_pipe(i)))
1629                 bytes = csum_and_copy_to_pipe_iter(addr, bytes, i, &sum);
1630         else iterate_and_advance(i, bytes, base, len, off, ({
1631                 next = csum_and_copy_to_user(addr + off, base, len);
1632                 sum = csum_block_add(sum, next, off);
1633                 next ? 0 : len;
1634         }), ({
1635                 sum = csum_and_memcpy(base, addr + off, len, sum, off);
1636         })
1637         )
1638         csstate->csum = csum_shift(sum, csstate->off);
1639         csstate->off += bytes;
1640         return bytes;
1641 }
1642 EXPORT_SYMBOL(csum_and_copy_to_iter);
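
/*
 * Example (sketch, in the style of the networking callers): the running
 * checksum and the offset it was folded at travel together in a
 * struct csum_state, so partial copies can be chained:
 *
 *	struct csum_state csstate = { .csum = 0, .off = 0 };
 *
 *	copied  = csum_and_copy_to_iter(hdr, hdr_len, &csstate, iter);
 *	copied += csum_and_copy_to_iter(body, body_len, &csstate, iter);
 *	// csstate.csum now covers both regions; csstate.off advanced by copied
 */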
1643
1644 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1645                 struct iov_iter *i)
1646 {
1647 #ifdef CONFIG_CRYPTO_HASH
1648         struct ahash_request *hash = hashp;
1649         struct scatterlist sg;
1650         size_t copied;
1651
1652         copied = copy_to_iter(addr, bytes, i);
1653         sg_init_one(&sg, addr, copied);
1654         ahash_request_set_crypt(hash, &sg, NULL, copied);
1655         crypto_ahash_update(hash);
1656         return copied;
1657 #else
1658         return 0;
1659 #endif
1660 }
1661 EXPORT_SYMBOL(hash_and_copy_to_iter);
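
/*
 * Example (sketch, assuming CONFIG_CRYPTO_HASH and an ahash request that
 * the caller has already allocated and initialised): only the bytes that
 * were actually copied are fed into the hash.
 *
 *	crypto_ahash_init(req);
 *	copied = hash_and_copy_to_iter(buf, len, req, iter);
 *	crypto_ahash_final(req, digest);
 */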
1662
1663 static int iov_npages(const struct iov_iter *i, int maxpages)
1664 {
1665         size_t skip = i->iov_offset, size = i->count;
1666         const struct iovec *p;
1667         int npages = 0;
1668
1669         for (p = iter_iov(i); size; skip = 0, p++) {
1670                 unsigned offs = offset_in_page(p->iov_base + skip);
1671                 size_t len = min(p->iov_len - skip, size);
1672
1673                 if (len) {
1674                         size -= len;
1675                         npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1676                         if (unlikely(npages > maxpages))
1677                                 return maxpages;
1678                 }
1679         }
1680         return npages;
1681 }
1682
1683 static int bvec_npages(const struct iov_iter *i, int maxpages)
1684 {
1685         size_t skip = i->iov_offset, size = i->count;
1686         const struct bio_vec *p;
1687         int npages = 0;
1688
1689         for (p = i->bvec; size; skip = 0, p++) {
1690                 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1691                 size_t len = min(p->bv_len - skip, size);
1692
1693                 size -= len;
1694                 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1695                 if (unlikely(npages > maxpages))
1696                         return maxpages;
1697         }
1698         return npages;
1699 }
1700
1701 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1702 {
1703         if (unlikely(!i->count))
1704                 return 0;
1705         if (likely(iter_is_ubuf(i))) {
1706                 unsigned offs = offset_in_page(i->ubuf + i->iov_offset);
1707                 int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
1708                 return min(npages, maxpages);
1709         }
1710         /* iovec and kvec have identical layouts */
1711         if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1712                 return iov_npages(i, maxpages);
1713         if (iov_iter_is_bvec(i))
1714                 return bvec_npages(i, maxpages);
1715         if (iov_iter_is_pipe(i)) {
1716                 int npages;
1717
1718                 if (!sanity(i))
1719                         return 0;
1720
1721                 pipe_npages(i, &npages);
1722                 return min(npages, maxpages);
1723         }
1724         if (iov_iter_is_xarray(i)) {
1725                 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1726                 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1727                 return min(npages, maxpages);
1728         }
1729         return 0;
1730 }
1731 EXPORT_SYMBOL(iov_iter_npages);
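
/*
 * Example (sketch): iov_iter_npages() is normally used to bound an
 * allocation before the pages are actually extracted, e.g. when sizing a
 * bio (bdev and opf are assumed to exist in the caller):
 *
 *	int npages = iov_iter_npages(iter, BIO_MAX_VECS);
 *	struct bio *bio = bio_alloc(bdev, npages, opf, GFP_KERNEL);
 */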
1732
1733 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1734 {
1735         *new = *old;
1736         if (unlikely(iov_iter_is_pipe(new))) {
1737                 WARN_ON(1);
1738                 return NULL;
1739         }
1740         if (iov_iter_is_bvec(new))
1741                 return new->bvec = kmemdup(new->bvec,
1742                                     new->nr_segs * sizeof(struct bio_vec),
1743                                     flags);
1744         else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
1745                 /* iovec and kvec have identical layout */
1746                 return new->__iov = kmemdup(new->__iov,
1747                                    new->nr_segs * sizeof(struct iovec),
1748                                    flags);
1749         return NULL;
1750 }
1751 EXPORT_SYMBOL(dup_iter);
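
/*
 * Example (sketch): dup_iter() stashes a private copy of an iterator for
 * asynchronous completion.  NULL means "nothing was allocated" for UBUF
 * iterators but -ENOMEM for the *vec flavours, so callers check against
 * the flavour they know they hold:
 *
 *	struct iov_iter saved;
 *	const void *dup = dup_iter(&saved, iter, GFP_KERNEL);
 *
 *	if (iov_iter_is_bvec(iter) && !dup)
 *		return -ENOMEM;
 *	...
 *	kfree(dup);	// matches the kmemdup() above; kfree(NULL) is fine
 */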
1752
1753 static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
1754                 const struct iovec __user *uvec, unsigned long nr_segs)
1755 {
1756         const struct compat_iovec __user *uiov =
1757                 (const struct compat_iovec __user *)uvec;
1758         int ret = -EFAULT, i;
1759
1760         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1761                 return -EFAULT;
1762
1763         for (i = 0; i < nr_segs; i++) {
1764                 compat_uptr_t buf;
1765                 compat_ssize_t len;
1766
1767                 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1768                 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1769
1770                 /* check for compat_size_t not fitting in compat_ssize_t .. */
1771                 if (len < 0) {
1772                         ret = -EINVAL;
1773                         goto uaccess_end;
1774                 }
1775                 iov[i].iov_base = compat_ptr(buf);
1776                 iov[i].iov_len = len;
1777         }
1778
1779         ret = 0;
1780 uaccess_end:
1781         user_access_end();
1782         return ret;
1783 }
1784
1785 static int copy_iovec_from_user(struct iovec *iov,
1786                 const struct iovec __user *uiov, unsigned long nr_segs)
1787 {
1788         int ret = -EFAULT;
1789
1790         if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1791                 return -EFAULT;
1792
1793         do {
1794                 void __user *buf;
1795                 ssize_t len;
1796
1797                 unsafe_get_user(len, &uiov->iov_len, uaccess_end);
1798                 unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
1799
1800                 /* check for size_t not fitting in ssize_t .. */
1801                 if (unlikely(len < 0)) {
1802                         ret = -EINVAL;
1803                         goto uaccess_end;
1804                 }
1805                 iov->iov_base = buf;
1806                 iov->iov_len = len;
1807
1808                 uiov++; iov++;
1809         } while (--nr_segs);
1810
1811         ret = 0;
1812 uaccess_end:
1813         user_access_end();
1814         return ret;
1815 }
1816
1817 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1818                 unsigned long nr_segs, unsigned long fast_segs,
1819                 struct iovec *fast_iov, bool compat)
1820 {
1821         struct iovec *iov = fast_iov;
1822         int ret;
1823
1824         /*
1825          * SuS says "The readv() function *may* fail if the iovcnt argument was
1826          * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1827          * traditionally returned zero for zero segments, so...
1828          */
1829         if (nr_segs == 0)
1830                 return iov;
1831         if (nr_segs > UIO_MAXIOV)
1832                 return ERR_PTR(-EINVAL);
1833         if (nr_segs > fast_segs) {
1834                 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1835                 if (!iov)
1836                         return ERR_PTR(-ENOMEM);
1837         }
1838
1839         if (unlikely(compat))
1840                 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1841         else
1842                 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1843         if (ret) {
1844                 if (iov != fast_iov)
1845                         kfree(iov);
1846                 return ERR_PTR(ret);
1847         }
1848
1849         return iov;
1850 }
1851
1852 /*
1853  * Single segment iovec supplied by the user, import it as ITER_UBUF.
1854  */
1855 static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
1856                                    struct iovec **iovp, struct iov_iter *i,
1857                                    bool compat)
1858 {
1859         struct iovec *iov = *iovp;
1860         ssize_t ret;
1861
1862         if (compat)
1863                 ret = copy_compat_iovec_from_user(iov, uvec, 1);
1864         else
1865                 ret = copy_iovec_from_user(iov, uvec, 1);
1866         if (unlikely(ret))
1867                 return ret;
1868
1869         ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
1870         if (unlikely(ret))
1871                 return ret;
1872         *iovp = NULL;
1873         return i->count;
1874 }
1875
1876 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1877                  unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1878                  struct iov_iter *i, bool compat)
1879 {
1880         ssize_t total_len = 0;
1881         unsigned long seg;
1882         struct iovec *iov;
1883
1884         if (nr_segs == 1)
1885                 return __import_iovec_ubuf(type, uvec, iovp, i, compat);
1886
1887         iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1888         if (IS_ERR(iov)) {
1889                 *iovp = NULL;
1890                 return PTR_ERR(iov);
1891         }
1892
1893         /*
1894          * According to the Single Unix Specification we should return EINVAL if
1895          * an element length is < 0 when cast to ssize_t or if the total length
1896          * would overflow the ssize_t return value of the system call.
1897          *
1898          * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1899          * overflow case.
1900          */
1901         for (seg = 0; seg < nr_segs; seg++) {
1902                 ssize_t len = (ssize_t)iov[seg].iov_len;
1903
1904                 if (!access_ok(iov[seg].iov_base, len)) {
1905                         if (iov != *iovp)
1906                                 kfree(iov);
1907                         *iovp = NULL;
1908                         return -EFAULT;
1909                 }
1910
1911                 if (len > MAX_RW_COUNT - total_len) {
1912                         len = MAX_RW_COUNT - total_len;
1913                         iov[seg].iov_len = len;
1914                 }
1915                 total_len += len;
1916         }
1917
1918         iov_iter_init(i, type, iov, nr_segs, total_len);
1919         if (iov == *iovp)
1920                 *iovp = NULL;
1921         else
1922                 *iovp = iov;
1923         return total_len;
1924 }
1925
1926 /**
1927  * import_iovec() - Copy an array of &struct iovec from userspace
1928  *     into the kernel, check that it is valid, and initialize a new
1929  *     &struct iov_iter iterator to access it.
1930  *
1931  * @type: One of %READ or %WRITE.
1932  * @uvec: Pointer to the userspace array.
1933  * @nr_segs: Number of elements in userspace array.
1934  * @fast_segs: Number of elements in *@iovp.
1935  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1936  *     on-stack) kernel array.
1937  * @i: Pointer to iterator that will be initialized on success.
1938  *
1939  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1940  * then this function places %NULL in *@iovp on return. Otherwise, a new
1941  * array will be allocated and the result placed in *@iovp. This means that
1942  * the caller may call kfree() on *@iovp regardless of whether the small
1943  * on-stack array was used or not (and regardless of whether this function
1944  * returns an error or not).
1945  *
1946  * Return: Negative error code on error, bytes imported on success
1947  */
1948 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1949                  unsigned nr_segs, unsigned fast_segs,
1950                  struct iovec **iovp, struct iov_iter *i)
1951 {
1952         return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1953                               in_compat_syscall());
1954 }
1955 EXPORT_SYMBOL(import_iovec);
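
/*
 * Example (sketch of a readv()-style caller; do_read_iter() is
 * hypothetical, everything else is the usual pattern):
 *
 *	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
 *	struct iov_iter iter;
 *	ssize_t ret;
 *
 *	ret = import_iovec(ITER_DEST, uvec, nr_segs, ARRAY_SIZE(iovstack),
 *			   &iov, &iter);
 *	if (ret < 0)
 *		return ret;
 *	ret = do_read_iter(file, &iter);
 *	kfree(iov);	// safe whether or not the on-stack array was used
 *	return ret;
 */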
1956
1957 int import_single_range(int rw, void __user *buf, size_t len,
1958                  struct iovec *iov, struct iov_iter *i)
1959 {
1960         if (len > MAX_RW_COUNT)
1961                 len = MAX_RW_COUNT;
1962         if (unlikely(!access_ok(buf, len)))
1963                 return -EFAULT;
1964
1965         iov_iter_ubuf(i, rw, buf, len);
1966         return 0;
1967 }
1968 EXPORT_SYMBOL(import_single_range);
1969
1970 int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
1971 {
1972         if (len > MAX_RW_COUNT)
1973                 len = MAX_RW_COUNT;
1974         if (unlikely(!access_ok(buf, len)))
1975                 return -EFAULT;
1976
1977         iov_iter_ubuf(i, rw, buf, len);
1978         return 0;
1979 }
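
/*
 * Example (sketch): for a single user buffer both helpers reduce to
 * iov_iter_ubuf(); import_ubuf() is the variant that needs no struct iovec:
 *
 *	struct iov_iter iter;
 *	int ret = import_ubuf(ITER_SOURCE, buf, len, &iter);
 *
 *	if (ret)
 *		return ret;	// -EFAULT if the range fails access_ok()
 */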
1980
1981 /**
1982  * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
1983  *     iov_iter_save_state() was called.
1984  *
1985  * @i: &struct iov_iter to restore
1986  * @state: state to restore from
1987  *
1988  * Used after iov_iter_save_state() to bring restore @i, if operations may
1989  * have advanced it.
1990  *
1991  * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
1992  */
1993 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
1994 {
1995         if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
1996                          !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
1997                 return;
1998         i->iov_offset = state->iov_offset;
1999         i->count = state->count;
2000         if (iter_is_ubuf(i))
2001                 return;
2002         /*
2003          * For the *vec iters, nr_segs + iov is constant - if we increment
2004          * the vec, then we also decrement the nr_segs count. Hence we don't
2005          * need to track both of these, just one is enough and we can deduce
2006          * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
2007          * size, so we can just increment the iov pointer as they are unionized.
2008          * ITER_BVEC _may_ be the same size on some archs, but on others it is
2009          * not. Be safe and handle it separately.
2010          */
2011         BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
2012         if (iov_iter_is_bvec(i))
2013                 i->bvec -= state->nr_segs - i->nr_segs;
2014         else
2015                 i->__iov -= state->nr_segs - i->nr_segs;
2016         i->nr_segs = state->nr_segs;
2017 }
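
/*
 * Example (sketch): save/restore brackets an operation that may consume the
 * iterator even when it fails and must be retried; do_write_iter() is
 * hypothetical:
 *
 *	struct iov_iter_state state;
 *
 *	iov_iter_save_state(iter, &state);
 *	ret = do_write_iter(file, iter);
 *	if (ret == -EAGAIN) {
 *		iov_iter_restore(iter, &state);
 *		goto retry;
 *	}
 */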
2018
2019 /*
2020  * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
2021  * get references on the pages, nor does it get a pin on them.
2022  */
2023 static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
2024                                              struct page ***pages, size_t maxsize,
2025                                              unsigned int maxpages,
2026                                              iov_iter_extraction_t extraction_flags,
2027                                              size_t *offset0)
2028 {
2029         struct page *page, **p;
2030         unsigned int nr = 0, offset;
2031         loff_t pos = i->xarray_start + i->iov_offset;
2032         pgoff_t index = pos >> PAGE_SHIFT;
2033         XA_STATE(xas, i->xarray, index);
2034
2035         offset = pos & ~PAGE_MASK;
2036         *offset0 = offset;
2037
2038         maxpages = want_pages_array(pages, maxsize, offset, maxpages);
2039         if (!maxpages)
2040                 return -ENOMEM;
2041         p = *pages;
2042
2043         rcu_read_lock();
2044         for (page = xas_load(&xas); page; page = xas_next(&xas)) {
2045                 if (xas_retry(&xas, page))
2046                         continue;
2047
2048                 /* Has the page moved or been split? */
2049                 if (unlikely(page != xas_reload(&xas))) {
2050                         xas_reset(&xas);
2051                         continue;
2052                 }
2053
2054                 p[nr++] = find_subpage(page, xas.xa_index);
2055                 if (nr == maxpages)
2056                         break;
2057         }
2058         rcu_read_unlock();
2059
2060         maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
2061         iov_iter_advance(i, maxsize);
2062         return maxsize;
2063 }
2064
2065 /*
2066  * Extract a list of contiguous pages from an ITER_BVEC iterator.  This does
2067  * not get references on the pages, nor does it get a pin on them.
2068  */
2069 static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
2070                                            struct page ***pages, size_t maxsize,
2071                                            unsigned int maxpages,
2072                                            iov_iter_extraction_t extraction_flags,
2073                                            size_t *offset0)
2074 {
2075         struct page **p, *page;
2076         size_t skip = i->iov_offset, offset;
2077         int k;
2078
2079         for (;;) {
2080                 if (i->nr_segs == 0)
2081                         return 0;
2082                 maxsize = min(maxsize, i->bvec->bv_len - skip);
2083                 if (maxsize)
2084                         break;
2085                 i->iov_offset = 0;
2086                 i->nr_segs--;
2087                 i->bvec++;
2088                 skip = 0;
2089         }
2090
2091         skip += i->bvec->bv_offset;
2092         page = i->bvec->bv_page + skip / PAGE_SIZE;
2093         offset = skip % PAGE_SIZE;
2094         *offset0 = offset;
2095
2096         maxpages = want_pages_array(pages, maxsize, offset, maxpages);
2097         if (!maxpages)
2098                 return -ENOMEM;
2099         p = *pages;
2100         for (k = 0; k < maxpages; k++)
2101                 p[k] = page + k;
2102
2103         maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
2104         iov_iter_advance(i, maxsize);
2105         return maxsize;
2106 }
2107
2108 /*
2109  * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
2110  * This does not get references on the pages, nor does it get a pin on them.
2111  */
2112 static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
2113                                            struct page ***pages, size_t maxsize,
2114                                            unsigned int maxpages,
2115                                            iov_iter_extraction_t extraction_flags,
2116                                            size_t *offset0)
2117 {
2118         struct page **p, *page;
2119         const void *kaddr;
2120         size_t skip = i->iov_offset, offset, len;
2121         int k;
2122
2123         for (;;) {
2124                 if (i->nr_segs == 0)
2125                         return 0;
2126                 maxsize = min(maxsize, i->kvec->iov_len - skip);
2127                 if (maxsize)
2128                         break;
2129                 i->iov_offset = 0;
2130                 i->nr_segs--;
2131                 i->kvec++;
2132                 skip = 0;
2133         }
2134
2135         kaddr = i->kvec->iov_base + skip;
2136         offset = (unsigned long)kaddr & ~PAGE_MASK;
2137         *offset0 = offset;
2138
2139         maxpages = want_pages_array(pages, maxsize, offset, maxpages);
2140         if (!maxpages)
2141                 return -ENOMEM;
2142         p = *pages;
2143
2144         kaddr -= offset;
2145         len = offset + maxsize;
2146         for (k = 0; k < maxpages; k++) {
2147                 size_t seg = min_t(size_t, len, PAGE_SIZE);
2148
2149                 if (is_vmalloc_or_module_addr(kaddr))
2150                         page = vmalloc_to_page(kaddr);
2151                 else
2152                         page = virt_to_page(kaddr);
2153
2154                 p[k] = page;
2155                 len -= seg;
2156                 kaddr += PAGE_SIZE;
2157         }
2158
2159         maxsize = min_t(size_t, maxsize, maxpages * PAGE_SIZE - offset);
2160         iov_iter_advance(i, maxsize);
2161         return maxsize;
2162 }
2163
2164 /*
2165  * Extract a list of contiguous pages from a user iterator and get a pin on
2166  * each of them.  This should only be used if the iterator is user-backed
2167  * (ITER_IOVEC/ITER_UBUF).
2168  *
2169  * It does not get refs on the pages, but the pages must be unpinned by the
2170  * caller once the transfer is complete.
2171  *
2172  * This is safe to be used where background IO/DMA *is* going to be modifying
2173  * the buffer; using a pin rather than a ref forces fork() to give the
2174  * child a copy of the page.
2175  */
2176 static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
2177                                            struct page ***pages,
2178                                            size_t maxsize,
2179                                            unsigned int maxpages,
2180                                            iov_iter_extraction_t extraction_flags,
2181                                            size_t *offset0)
2182 {
2183         unsigned long addr;
2184         unsigned int gup_flags = 0;
2185         size_t offset;
2186         int res;
2187
2188         if (i->data_source == ITER_DEST)
2189                 gup_flags |= FOLL_WRITE;
2190         if (extraction_flags & ITER_ALLOW_P2PDMA)
2191                 gup_flags |= FOLL_PCI_P2PDMA;
2192         if (i->nofault)
2193                 gup_flags |= FOLL_NOFAULT;
2194
2195         addr = first_iovec_segment(i, &maxsize);
2196         *offset0 = offset = addr % PAGE_SIZE;
2197         addr &= PAGE_MASK;
2198         maxpages = want_pages_array(pages, maxsize, offset, maxpages);
2199         if (!maxpages)
2200                 return -ENOMEM;
2201         res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages);
2202         if (unlikely(res <= 0))
2203                 return res;
2204         maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset);
2205         iov_iter_advance(i, maxsize);
2206         return maxsize;
2207 }
2208
2209 /**
2210  * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
2211  * @i: The iterator to extract from
2212  * @pages: Where to return the list of pages
2213  * @maxsize: The maximum amount of iterator to extract
2214  * @maxpages: The maximum size of the list of pages
2215  * @extraction_flags: Flags to qualify request
2216  * @offset0: Where to return the starting offset into (*@pages)[0]
2217  *
2218  * Extract a list of contiguous pages from the current point of the iterator,
2219  * advancing the iterator.  The maximum number of pages and the maximum amount
2220  * of page contents can be set.
2221  *
2222  * If *@pages is NULL, a page list will be allocated to the required size and
2223  * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
2224  * that the caller allocated a page list at least @maxpages in size and this
2225  * will be filled in.
2226  *
2227  * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
2228  * be allowed on the pages extracted.
2229  *
2230  * The iov_iter_extract_will_pin() function can be used to query how cleanup
2231  * should be performed.
2232  *
2233  * Extra refs or pins on the pages may be obtained as follows:
2234  *
2235  *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
2236  *      added to the pages, but refs will not be taken.
2237  *      iov_iter_extract_will_pin() will return true.
2238  *
2239  *  (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are
2240  *      merely listed; no extra refs or pins are obtained.
2241  *      iov_iter_extract_will_pin() will return false.
2242  *
2243  * Note also:
2244  *
2245  *  (*) Use with ITER_DISCARD is not supported as that has no content.
2246  *
2247  * On success, the function sets *@pages to the new pagelist, if allocated, and
2248  * sets *offset0 to the offset into the first page.
2249  *
2250  * It may also return -ENOMEM or -EFAULT.
2251  */
2252 ssize_t iov_iter_extract_pages(struct iov_iter *i,
2253                                struct page ***pages,
2254                                size_t maxsize,
2255                                unsigned int maxpages,
2256                                iov_iter_extraction_t extraction_flags,
2257                                size_t *offset0)
2258 {
2259         maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT);
2260         if (!maxsize)
2261                 return 0;
2262
2263         if (likely(user_backed_iter(i)))
2264                 return iov_iter_extract_user_pages(i, pages, maxsize,
2265                                                    maxpages, extraction_flags,
2266                                                    offset0);
2267         if (iov_iter_is_kvec(i))
2268                 return iov_iter_extract_kvec_pages(i, pages, maxsize,
2269                                                    maxpages, extraction_flags,
2270                                                    offset0);
2271         if (iov_iter_is_bvec(i))
2272                 return iov_iter_extract_bvec_pages(i, pages, maxsize,
2273                                                    maxpages, extraction_flags,
2274                                                    offset0);
2275         if (iov_iter_is_xarray(i))
2276                 return iov_iter_extract_xarray_pages(i, pages, maxsize,
2277                                                      maxpages, extraction_flags,
2278                                                      offset0);
2279         return -EFAULT;
2280 }
2281 EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
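
/*
 * Example (sketch): a direct-I/O style caller extracts pages, kicks off the
 * transfer and cleans up according to iov_iter_extract_will_pin();
 * start_dma() is hypothetical:
 *
 *	struct page **pages = NULL;
 *	size_t off;
 *	ssize_t len = iov_iter_extract_pages(iter, &pages, maxsize, INT_MAX,
 *					     0, &off);
 *
 *	if (len > 0) {
 *		size_t npages = DIV_ROUND_UP(off + len, PAGE_SIZE);
 *
 *		start_dma(pages, npages, off, len);	// hypothetical
 *		if (iov_iter_extract_will_pin(iter))
 *			unpin_user_pages(pages, npages);	// after completion
 *		kvfree(pages);
 *	}
 */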