iterate_bvec(): expand bvec.h macro forest, massage a bit
[linux-block.git] / lib / iov_iter.c
// SPDX-License-Identifier: GPL-2.0-only
#include <crypto/hash.h>
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <net/checksum.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>

#define PIPE_PARANOIA /* for now */

/* covers iovec and kvec alike */
#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
	size_t left;					\
	size_t wanted = n;				\
	do {						\
		__v.iov_len = min(n, __p->iov_len - skip);	\
		if (likely(__v.iov_len)) {		\
			__v.iov_base = __p->iov_base + skip;	\
			left = (STEP);			\
			__v.iov_len -= left;		\
			skip += __v.iov_len;		\
			n -= __v.iov_len;		\
			if (skip < __p->iov_len)	\
				break;			\
		}					\
		__p++;					\
		skip = 0;				\
	} while (n);					\
	n = wanted - n;					\
}

#define iterate_bvec(i, n, __v, p, skip, STEP) {	\
	size_t wanted = n;				\
	while (n) {					\
		unsigned offset = p->bv_offset + skip;	\
		__v.bv_offset = offset % PAGE_SIZE;	\
		__v.bv_page = p->bv_page + offset / PAGE_SIZE;	\
		__v.bv_len = min(min(n, p->bv_len - skip),	\
				 (size_t)(PAGE_SIZE - offset % PAGE_SIZE));	\
		(void)(STEP);				\
		skip += __v.bv_len;			\
		if (skip == p->bv_len) {		\
			skip = 0;			\
			p++;				\
		}					\
		n -= __v.bv_len;			\
	}						\
	n = wanted - n;					\
}

#define iterate_xarray(i, n, __v, skip, STEP) {		\
	struct page *head = NULL;			\
	size_t wanted = n, seg, offset;			\
	loff_t start = i->xarray_start + skip;		\
	pgoff_t index = start >> PAGE_SHIFT;		\
	int j;						\
							\
	XA_STATE(xas, i->xarray, index);		\
							\
	rcu_read_lock();				\
	xas_for_each(&xas, head, ULONG_MAX) {		\
		if (xas_retry(&xas, head))		\
			continue;			\
		if (WARN_ON(xa_is_value(head)))		\
			break;				\
		if (WARN_ON(PageHuge(head)))		\
			break;				\
		for (j = (head->index < index) ? index - head->index : 0; \
		     j < thp_nr_pages(head); j++) {	\
			__v.bv_page = head + j;		\
			offset = (i->xarray_start + skip) & ~PAGE_MASK;	\
			seg = PAGE_SIZE - offset;	\
			__v.bv_offset = offset;		\
			__v.bv_len = min(n, seg);	\
			(void)(STEP);			\
			n -= __v.bv_len;		\
			skip += __v.bv_len;		\
			if (n == 0)			\
				break;			\
		}					\
		if (n == 0)				\
			break;				\
	}						\
	rcu_read_unlock();				\
	n = wanted - n;					\
}

#define iterate_and_advance(i, n, v, I, B, K, X) {	\
	if (unlikely(i->count < n))			\
		n = i->count;				\
	if (likely(n)) {				\
		size_t skip = i->iov_offset;		\
		if (likely(iter_is_iovec(i))) {		\
			const struct iovec *iov = i->iov;	\
			struct iovec v;			\
			iterate_iovec(i, n, v, iov, skip, (I))	\
			i->nr_segs -= iov - i->iov;	\
			i->iov = iov;			\
		} else if (iov_iter_is_bvec(i)) {	\
			const struct bio_vec *bvec = i->bvec;	\
			struct bio_vec v;		\
			iterate_bvec(i, n, v, bvec, skip, (B))	\
			i->nr_segs -= bvec - i->bvec;	\
			i->bvec = bvec;			\
		} else if (iov_iter_is_kvec(i)) {	\
			const struct kvec *kvec = i->kvec;	\
			struct kvec v;			\
			iterate_iovec(i, n, v, kvec, skip,	\
						((void)(K),0))	\
			i->nr_segs -= kvec - i->kvec;	\
			i->kvec = kvec;			\
		} else if (iov_iter_is_xarray(i)) {	\
			struct bio_vec v;		\
			iterate_xarray(i, n, v, skip, (X))	\
		}					\
		i->count -= n;				\
		i->iov_offset = skip;			\
	}						\
}

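/*
 * Illustrative sketch, not part of the upstream file: a minimal
 * instantiation of iterate_and_advance() showing how the four step
 * expressions plug in.  "v" names the per-chunk view (struct iovec for
 * user memory, struct bio_vec for pages, struct kvec for kernel memory,
 * struct bio_vec again for xarray pages).  The iovec step must evaluate
 * to the number of bytes it failed to consume, so a constant 0 simply
 * consumes every chunk; the function name is hypothetical.
 */
static __maybe_unused size_t iterate_and_advance_example(struct iov_iter *i,
							  size_t bytes)
{
	iterate_and_advance(i, bytes, v,
		0,		/* user chunk: v.iov_base / v.iov_len */
		(void)v,	/* page chunk: v.bv_page / v.bv_offset / v.bv_len */
		(void)v,	/* kernel chunk: v.iov_base / v.iov_len */
		(void)v		/* xarray page chunk, same shape as bvec */
	)
	/* on return, "bytes" holds the amount actually advanced */
	return bytes;
}
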
static int copyout(void __user *to, const void *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(to, n)) {
		instrument_copy_to_user(to, from, n);
		n = raw_copy_to_user(to, from, n);
	}
	return n;
}

static int copyin(void *to, const void __user *from, size_t n)
{
	if (should_fail_usercopy())
		return n;
	if (access_ok(from, n)) {
		instrument_copy_from_user(to, from, n);
		n = raw_copy_from_user(to, from, n);
	}
	return n;
}

62a8067a 151static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
4f18cd31
AV
152 struct iov_iter *i)
153{
154 size_t skip, copy, left, wanted;
155 const struct iovec *iov;
156 char __user *buf;
157 void *kaddr, *from;
158
159 if (unlikely(bytes > i->count))
160 bytes = i->count;
161
162 if (unlikely(!bytes))
163 return 0;
164
09fc68dc 165 might_fault();
4f18cd31
AV
166 wanted = bytes;
167 iov = i->iov;
168 skip = i->iov_offset;
169 buf = iov->iov_base + skip;
170 copy = min(bytes, iov->iov_len - skip);
171
3fa6c507 172 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
4f18cd31
AV
173 kaddr = kmap_atomic(page);
174 from = kaddr + offset;
175
176 /* first chunk, usually the only one */
09fc68dc 177 left = copyout(buf, from, copy);
4f18cd31
AV
178 copy -= left;
179 skip += copy;
180 from += copy;
181 bytes -= copy;
182
183 while (unlikely(!left && bytes)) {
184 iov++;
185 buf = iov->iov_base;
186 copy = min(bytes, iov->iov_len);
09fc68dc 187 left = copyout(buf, from, copy);
4f18cd31
AV
188 copy -= left;
189 skip = copy;
190 from += copy;
191 bytes -= copy;
192 }
193 if (likely(!bytes)) {
194 kunmap_atomic(kaddr);
195 goto done;
196 }
197 offset = from - kaddr;
198 buf += copy;
199 kunmap_atomic(kaddr);
200 copy = min(bytes, iov->iov_len - skip);
201 }
202 /* Too bad - revert to non-atomic kmap */
3fa6c507 203
4f18cd31
AV
204 kaddr = kmap(page);
205 from = kaddr + offset;
09fc68dc 206 left = copyout(buf, from, copy);
4f18cd31
AV
207 copy -= left;
208 skip += copy;
209 from += copy;
210 bytes -= copy;
211 while (unlikely(!left && bytes)) {
212 iov++;
213 buf = iov->iov_base;
214 copy = min(bytes, iov->iov_len);
09fc68dc 215 left = copyout(buf, from, copy);
4f18cd31
AV
216 copy -= left;
217 skip = copy;
218 from += copy;
219 bytes -= copy;
220 }
221 kunmap(page);
3fa6c507 222
4f18cd31 223done:
81055e58
AV
224 if (skip == iov->iov_len) {
225 iov++;
226 skip = 0;
227 }
4f18cd31
AV
228 i->count -= wanted - bytes;
229 i->nr_segs -= iov - i->iov;
230 i->iov = iov;
231 i->iov_offset = skip;
232 return wanted - bytes;
233}
4f18cd31 234
62a8067a 235static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
f0d1bec9
AV
236 struct iov_iter *i)
237{
238 size_t skip, copy, left, wanted;
239 const struct iovec *iov;
240 char __user *buf;
241 void *kaddr, *to;
242
243 if (unlikely(bytes > i->count))
244 bytes = i->count;
245
246 if (unlikely(!bytes))
247 return 0;
248
09fc68dc 249 might_fault();
f0d1bec9
AV
250 wanted = bytes;
251 iov = i->iov;
252 skip = i->iov_offset;
253 buf = iov->iov_base + skip;
254 copy = min(bytes, iov->iov_len - skip);
255
3fa6c507 256 if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
f0d1bec9
AV
257 kaddr = kmap_atomic(page);
258 to = kaddr + offset;
259
260 /* first chunk, usually the only one */
09fc68dc 261 left = copyin(to, buf, copy);
f0d1bec9
AV
262 copy -= left;
263 skip += copy;
264 to += copy;
265 bytes -= copy;
266
267 while (unlikely(!left && bytes)) {
268 iov++;
269 buf = iov->iov_base;
270 copy = min(bytes, iov->iov_len);
09fc68dc 271 left = copyin(to, buf, copy);
f0d1bec9
AV
272 copy -= left;
273 skip = copy;
274 to += copy;
275 bytes -= copy;
276 }
277 if (likely(!bytes)) {
278 kunmap_atomic(kaddr);
279 goto done;
280 }
281 offset = to - kaddr;
282 buf += copy;
283 kunmap_atomic(kaddr);
284 copy = min(bytes, iov->iov_len - skip);
285 }
286 /* Too bad - revert to non-atomic kmap */
3fa6c507 287
f0d1bec9
AV
288 kaddr = kmap(page);
289 to = kaddr + offset;
09fc68dc 290 left = copyin(to, buf, copy);
f0d1bec9
AV
291 copy -= left;
292 skip += copy;
293 to += copy;
294 bytes -= copy;
295 while (unlikely(!left && bytes)) {
296 iov++;
297 buf = iov->iov_base;
298 copy = min(bytes, iov->iov_len);
09fc68dc 299 left = copyin(to, buf, copy);
f0d1bec9
AV
300 copy -= left;
301 skip = copy;
302 to += copy;
303 bytes -= copy;
304 }
305 kunmap(page);
3fa6c507 306
f0d1bec9 307done:
81055e58
AV
308 if (skip == iov->iov_len) {
309 iov++;
310 skip = 0;
311 }
f0d1bec9
AV
312 i->count -= wanted - bytes;
313 i->nr_segs -= iov - i->iov;
314 i->iov = iov;
315 i->iov_offset = skip;
316 return wanted - bytes;
317}
f0d1bec9 318
241699cd
AV
319#ifdef PIPE_PARANOIA
320static bool sanity(const struct iov_iter *i)
321{
322 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
323 unsigned int p_head = pipe->head;
324 unsigned int p_tail = pipe->tail;
325 unsigned int p_mask = pipe->ring_size - 1;
326 unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
327 unsigned int i_head = i->head;
328 unsigned int idx;
329
241699cd
AV
330 if (i->iov_offset) {
331 struct pipe_buffer *p;
8cefc107 332 if (unlikely(p_occupancy == 0))
241699cd 333 goto Bad; // pipe must be non-empty
8cefc107 334 if (unlikely(i_head != p_head - 1))
241699cd
AV
335 goto Bad; // must be at the last buffer...
336
8cefc107 337 p = &pipe->bufs[i_head & p_mask];
241699cd
AV
338 if (unlikely(p->offset + p->len != i->iov_offset))
339 goto Bad; // ... at the end of segment
340 } else {
8cefc107 341 if (i_head != p_head)
241699cd
AV
342 goto Bad; // must be right after the last buffer
343 }
344 return true;
345Bad:
8cefc107
DH
346 printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
347 printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
348 p_head, p_tail, pipe->ring_size);
349 for (idx = 0; idx < pipe->ring_size; idx++)
241699cd
AV
350 printk(KERN_ERR "[%p %p %d %d]\n",
351 pipe->bufs[idx].ops,
352 pipe->bufs[idx].page,
353 pipe->bufs[idx].offset,
354 pipe->bufs[idx].len);
355 WARN_ON(1);
356 return false;
357}
358#else
359#define sanity(i) true
360#endif
361
241699cd
AV
362static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
363 struct iov_iter *i)
364{
365 struct pipe_inode_info *pipe = i->pipe;
366 struct pipe_buffer *buf;
8cefc107
DH
367 unsigned int p_tail = pipe->tail;
368 unsigned int p_mask = pipe->ring_size - 1;
369 unsigned int i_head = i->head;
241699cd 370 size_t off;
241699cd
AV
371
372 if (unlikely(bytes > i->count))
373 bytes = i->count;
374
375 if (unlikely(!bytes))
376 return 0;
377
378 if (!sanity(i))
379 return 0;
380
381 off = i->iov_offset;
8cefc107 382 buf = &pipe->bufs[i_head & p_mask];
241699cd
AV
383 if (off) {
384 if (offset == off && buf->page == page) {
385 /* merge with the last one */
386 buf->len += bytes;
387 i->iov_offset += bytes;
388 goto out;
389 }
8cefc107
DH
390 i_head++;
391 buf = &pipe->bufs[i_head & p_mask];
241699cd 392 }
6718b6f8 393 if (pipe_full(i_head, p_tail, pipe->max_usage))
241699cd 394 return 0;
8cefc107 395
241699cd 396 buf->ops = &page_cache_pipe_buf_ops;
8cefc107
DH
397 get_page(page);
398 buf->page = page;
241699cd
AV
399 buf->offset = offset;
400 buf->len = bytes;
8cefc107
DH
401
402 pipe->head = i_head + 1;
241699cd 403 i->iov_offset = offset + bytes;
8cefc107 404 i->head = i_head;
241699cd
AV
405out:
406 i->count -= bytes;
407 return bytes;
408}
409
171a0203
AA
410/*
411 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
412 * bytes. For each iovec, fault in each page that constitutes the iovec.
413 *
414 * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
415 * because it is an invalid address).
416 */
8409a0d2 417int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
171a0203 418{
0e8f0d67 419 if (iter_is_iovec(i)) {
8409a0d2
AV
420 const struct iovec *p;
421 size_t skip;
422
423 if (bytes > i->count)
424 bytes = i->count;
425 for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
426 size_t len = min(bytes, p->iov_len - skip);
427 int err;
428
429 if (unlikely(!len))
430 continue;
431 err = fault_in_pages_readable(p->iov_base + skip, len);
171a0203 432 if (unlikely(err))
8409a0d2
AV
433 return err;
434 bytes -= len;
435 }
171a0203
AA
436 }
437 return 0;
438}
d4690f1e 439EXPORT_SYMBOL(iov_iter_fault_in_readable);
171a0203 440
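/*
 * Illustrative sketch, not in the upstream file: the usual caller pattern
 * around iov_iter_fault_in_readable() is to pre-fault the user pages
 * before taking page or mapping locks, so that a later copy done with
 * page faults disabled is unlikely to come up short.  The function name
 * is hypothetical.
 */
static __maybe_unused int example_prefault(struct iov_iter *i, size_t bytes)
{
	if (unlikely(iov_iter_fault_in_readable(i, bytes)))
		return -EFAULT;
	/*
	 * The caller would now lock the destination page and copy with
	 * pagefault_disable() in effect, retrying on a short copy.
	 */
	return 0;
}
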
aa563d7b 441void iov_iter_init(struct iov_iter *i, unsigned int direction,
71d8e532
AV
442 const struct iovec *iov, unsigned long nr_segs,
443 size_t count)
444{
aa563d7b 445 WARN_ON(direction & ~(READ | WRITE));
8cd54c1c
AV
446 WARN_ON_ONCE(uaccess_kernel());
447 *i = (struct iov_iter) {
448 .iter_type = ITER_IOVEC,
449 .data_source = direction,
450 .iov = iov,
451 .nr_segs = nr_segs,
452 .iov_offset = 0,
453 .count = count
454 };
71d8e532
AV
455}
456EXPORT_SYMBOL(iov_iter_init);
7b2c99d1 457
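/*
 * Illustrative sketch, not in the upstream file: a read(2)-style helper
 * that wraps an already-validated kernel copy of a user iovec array in an
 * iov_iter and fills it from a kernel buffer.  READ means the iterator is
 * the destination of the data.  Names are hypothetical; copy_to_iter() is
 * the linux/uio.h wrapper around _copy_to_iter() below.
 */
static __maybe_unused ssize_t example_fill_user_iovec(const struct iovec *iov,
		unsigned long nr_segs, const void *kbuf, size_t len)
{
	struct iov_iter iter;
	size_t copied;

	iov_iter_init(&iter, READ, iov, nr_segs, len);
	copied = copy_to_iter(kbuf, len, &iter);
	return copied ? copied : -EFAULT;	/* 0 means nothing was writable */
}
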
241699cd
AV
458static inline bool allocated(struct pipe_buffer *buf)
459{
460 return buf->ops == &default_pipe_buf_ops;
461}
462
8cefc107
DH
463static inline void data_start(const struct iov_iter *i,
464 unsigned int *iter_headp, size_t *offp)
241699cd 465{
8cefc107
DH
466 unsigned int p_mask = i->pipe->ring_size - 1;
467 unsigned int iter_head = i->head;
241699cd 468 size_t off = i->iov_offset;
8cefc107
DH
469
470 if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
471 off == PAGE_SIZE)) {
472 iter_head++;
241699cd
AV
473 off = 0;
474 }
8cefc107 475 *iter_headp = iter_head;
241699cd
AV
476 *offp = off;
477}
478
479static size_t push_pipe(struct iov_iter *i, size_t size,
8cefc107 480 int *iter_headp, size_t *offp)
241699cd
AV
481{
482 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
483 unsigned int p_tail = pipe->tail;
484 unsigned int p_mask = pipe->ring_size - 1;
485 unsigned int iter_head;
241699cd 486 size_t off;
241699cd
AV
487 ssize_t left;
488
489 if (unlikely(size > i->count))
490 size = i->count;
491 if (unlikely(!size))
492 return 0;
493
494 left = size;
8cefc107
DH
495 data_start(i, &iter_head, &off);
496 *iter_headp = iter_head;
241699cd
AV
497 *offp = off;
498 if (off) {
499 left -= PAGE_SIZE - off;
500 if (left <= 0) {
8cefc107 501 pipe->bufs[iter_head & p_mask].len += size;
241699cd
AV
502 return size;
503 }
8cefc107
DH
504 pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
505 iter_head++;
241699cd 506 }
6718b6f8 507 while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
8cefc107 508 struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
241699cd
AV
509 struct page *page = alloc_page(GFP_USER);
510 if (!page)
511 break;
8cefc107
DH
512
513 buf->ops = &default_pipe_buf_ops;
514 buf->page = page;
515 buf->offset = 0;
516 buf->len = min_t(ssize_t, left, PAGE_SIZE);
517 left -= buf->len;
518 iter_head++;
519 pipe->head = iter_head;
520
521 if (left == 0)
241699cd 522 return size;
241699cd
AV
523 }
524 return size - left;
525}
526
527static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
528 struct iov_iter *i)
529{
530 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
531 unsigned int p_mask = pipe->ring_size - 1;
532 unsigned int i_head;
241699cd 533 size_t n, off;
241699cd
AV
534
535 if (!sanity(i))
536 return 0;
537
8cefc107 538 bytes = n = push_pipe(i, bytes, &i_head, &off);
241699cd
AV
539 if (unlikely(!n))
540 return 0;
8cefc107 541 do {
241699cd 542 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
8cefc107
DH
543 memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
544 i->head = i_head;
241699cd
AV
545 i->iov_offset = off + chunk;
546 n -= chunk;
547 addr += chunk;
8cefc107
DH
548 off = 0;
549 i_head++;
550 } while (n);
241699cd
AV
551 i->count -= bytes;
552 return bytes;
553}
554
f9152895
AV
555static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
556 __wsum sum, size_t off)
557{
cc44c17b 558 __wsum next = csum_partial_copy_nocheck(from, to, len);
f9152895
AV
559 return csum_block_add(sum, next, off);
560}
561
78e1f386 562static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
52cbd23a
WB
563 struct csum_state *csstate,
564 struct iov_iter *i)
78e1f386
AV
565{
566 struct pipe_inode_info *pipe = i->pipe;
8cefc107 567 unsigned int p_mask = pipe->ring_size - 1;
52cbd23a
WB
568 __wsum sum = csstate->csum;
569 size_t off = csstate->off;
8cefc107 570 unsigned int i_head;
78e1f386 571 size_t n, r;
78e1f386
AV
572
573 if (!sanity(i))
574 return 0;
575
8cefc107 576 bytes = n = push_pipe(i, bytes, &i_head, &r);
78e1f386
AV
577 if (unlikely(!n))
578 return 0;
8cefc107 579 do {
78e1f386 580 size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
8cefc107 581 char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
f9152895 582 sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
78e1f386 583 kunmap_atomic(p);
8cefc107 584 i->head = i_head;
78e1f386
AV
585 i->iov_offset = r + chunk;
586 n -= chunk;
587 off += chunk;
588 addr += chunk;
8cefc107
DH
589 r = 0;
590 i_head++;
591 } while (n);
78e1f386 592 i->count -= bytes;
52cbd23a
WB
593 csstate->csum = sum;
594 csstate->off = off;
78e1f386
AV
595 return bytes;
596}
597
aa28de27 598size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
62a8067a 599{
36f7a8a4 600 const char *from = addr;
00e23707 601 if (unlikely(iov_iter_is_pipe(i)))
241699cd 602 return copy_pipe_to_iter(addr, bytes, i);
09fc68dc
AV
603 if (iter_is_iovec(i))
604 might_fault();
3d4d3e48 605 iterate_and_advance(i, bytes, v,
09fc68dc 606 copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
3d4d3e48 607 memcpy_to_page(v.bv_page, v.bv_offset,
a280455f 608 (from += v.bv_len) - v.bv_len, v.bv_len),
7ff50620
DH
609 memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
610 memcpy_to_page(v.bv_page, v.bv_offset,
611 (from += v.bv_len) - v.bv_len, v.bv_len)
3d4d3e48 612 )
62a8067a 613
3d4d3e48 614 return bytes;
c35e0248 615}
aa28de27 616EXPORT_SYMBOL(_copy_to_iter);
c35e0248 617
ec6347bb
DW
618#ifdef CONFIG_ARCH_HAS_COPY_MC
619static int copyout_mc(void __user *to, const void *from, size_t n)
8780356e 620{
96d4f267 621 if (access_ok(to, n)) {
d0ef4c36 622 instrument_copy_to_user(to, from, n);
ec6347bb 623 n = copy_mc_to_user((__force void *) to, from, n);
8780356e
DW
624 }
625 return n;
626}
627
ec6347bb 628static unsigned long copy_mc_to_page(struct page *page, size_t offset,
8780356e
DW
629 const char *from, size_t len)
630{
631 unsigned long ret;
632 char *to;
633
634 to = kmap_atomic(page);
ec6347bb 635 ret = copy_mc_to_kernel(to + offset, from, len);
8780356e
DW
636 kunmap_atomic(to);
637
638 return ret;
639}
640
ec6347bb 641static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
ca146f6f
DW
642 struct iov_iter *i)
643{
644 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
645 unsigned int p_mask = pipe->ring_size - 1;
646 unsigned int i_head;
ca146f6f 647 size_t n, off, xfer = 0;
ca146f6f
DW
648
649 if (!sanity(i))
650 return 0;
651
8cefc107 652 bytes = n = push_pipe(i, bytes, &i_head, &off);
ca146f6f
DW
653 if (unlikely(!n))
654 return 0;
8cefc107 655 do {
ca146f6f
DW
656 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
657 unsigned long rem;
658
ec6347bb 659 rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
8cefc107
DH
660 off, addr, chunk);
661 i->head = i_head;
ca146f6f
DW
662 i->iov_offset = off + chunk - rem;
663 xfer += chunk - rem;
664 if (rem)
665 break;
666 n -= chunk;
667 addr += chunk;
8cefc107
DH
668 off = 0;
669 i_head++;
670 } while (n);
ca146f6f
DW
671 i->count -= xfer;
672 return xfer;
673}
674
/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC, read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
 *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
 *   a short copy.
 */
ec6347bb 698size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
8780356e
DW
699{
700 const char *from = addr;
701 unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
702
00e23707 703 if (unlikely(iov_iter_is_pipe(i)))
ec6347bb 704 return copy_mc_pipe_to_iter(addr, bytes, i);
8780356e
DW
705 if (iter_is_iovec(i))
706 might_fault();
707 iterate_and_advance(i, bytes, v,
ec6347bb
DW
708 copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
709 v.iov_len),
8780356e 710 ({
ec6347bb
DW
711 rem = copy_mc_to_page(v.bv_page, v.bv_offset,
712 (from += v.bv_len) - v.bv_len, v.bv_len);
8780356e
DW
713 if (rem) {
714 curr_addr = (unsigned long) from;
715 bytes = curr_addr - s_addr - rem;
716 return bytes;
717 }
718 }),
719 ({
ec6347bb
DW
720 rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
721 - v.iov_len, v.iov_len);
8780356e
DW
722 if (rem) {
723 curr_addr = (unsigned long) from;
724 bytes = curr_addr - s_addr - rem;
725 return bytes;
726 }
7ff50620
DH
727 }),
728 ({
729 rem = copy_mc_to_page(v.bv_page, v.bv_offset,
730 (from += v.bv_len) - v.bv_len, v.bv_len);
731 if (rem) {
732 curr_addr = (unsigned long) from;
733 bytes = curr_addr - s_addr - rem;
734 rcu_read_unlock();
3d14ec1f
DH
735 i->iov_offset += bytes;
736 i->count -= bytes;
7ff50620
DH
737 return bytes;
738 }
8780356e
DW
739 })
740 )
741
742 return bytes;
743}
ec6347bb
DW
744EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
745#endif /* CONFIG_ARCH_HAS_COPY_MC */
8780356e 746
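/*
 * Illustrative sketch, not in the upstream file: how a pmem/dax style
 * read might consume _copy_mc_to_iter(), treating a zero return as the
 * poison having been hit before any progress was made.  Only meaningful
 * when CONFIG_ARCH_HAS_COPY_MC is set; names are hypothetical.
 */
#ifdef CONFIG_ARCH_HAS_COPY_MC
static __maybe_unused ssize_t example_mc_read(void *kaddr, size_t len,
					      struct iov_iter *i)
{
	size_t copied = _copy_mc_to_iter(kaddr, len, i);

	if (!copied)
		return -EIO;	/* machine check before any bytes moved */
	return copied;		/* possibly short; the caller decides */
}
#endif
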
aa28de27 747size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
c35e0248 748{
0dbca9a4 749 char *to = addr;
00e23707 750 if (unlikely(iov_iter_is_pipe(i))) {
241699cd
AV
751 WARN_ON(1);
752 return 0;
753 }
09fc68dc
AV
754 if (iter_is_iovec(i))
755 might_fault();
0dbca9a4 756 iterate_and_advance(i, bytes, v,
09fc68dc 757 copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
0dbca9a4 758 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
a280455f 759 v.bv_offset, v.bv_len),
7ff50620
DH
760 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
761 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
762 v.bv_offset, v.bv_len)
0dbca9a4
AV
763 )
764
765 return bytes;
c35e0248 766}
aa28de27 767EXPORT_SYMBOL(_copy_from_iter);
c35e0248 768
aa28de27 769size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
aa583096
AV
770{
771 char *to = addr;
00e23707 772 if (unlikely(iov_iter_is_pipe(i))) {
241699cd
AV
773 WARN_ON(1);
774 return 0;
775 }
aa583096 776 iterate_and_advance(i, bytes, v,
3f763453 777 __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
aa583096
AV
778 v.iov_base, v.iov_len),
779 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
780 v.bv_offset, v.bv_len),
7ff50620
DH
781 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
782 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
783 v.bv_offset, v.bv_len)
aa583096
AV
784 )
785
786 return bytes;
787}
aa28de27 788EXPORT_SYMBOL(_copy_from_iter_nocache);
aa583096 789
0aed55af 790#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 */
6a37e940 805size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
0aed55af
DW
806{
807 char *to = addr;
00e23707 808 if (unlikely(iov_iter_is_pipe(i))) {
0aed55af
DW
809 WARN_ON(1);
810 return 0;
811 }
812 iterate_and_advance(i, bytes, v,
813 __copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
814 v.iov_base, v.iov_len),
815 memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
816 v.bv_offset, v.bv_len),
817 memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
7ff50620
DH
818 v.iov_len),
819 memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
820 v.bv_offset, v.bv_len)
0aed55af
DW
821 )
822
823 return bytes;
824}
6a37e940 825EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
0aed55af
DW
826#endif
827
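/*
 * Illustrative sketch, not in the upstream file: a dax-style write path
 * would use _copy_from_iter_flushcache() so the stores reach persistent
 * media without relying on a later cache flush.  Only meaningful with
 * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE; names are hypothetical.
 */
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
static __maybe_unused size_t example_pmem_write(void *pmem_addr, size_t len,
						struct iov_iter *i)
{
	/* returns the number of bytes actually copied (and flushed) */
	return _copy_from_iter_flushcache(pmem_addr, len, i);
}
#endif
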
72e809ed
AV
828static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
829{
6daef95b
ED
830 struct page *head;
831 size_t v = n + offset;
832
833 /*
834 * The general case needs to access the page order in order
835 * to compute the page size.
836 * However, we mostly deal with order-0 pages and thus can
837 * avoid a possible cache line miss for requests that fit all
838 * page orders.
839 */
840 if (n <= v && v <= PAGE_SIZE)
841 return true;
842
843 head = compound_head(page);
844 v += (page - head) << PAGE_SHIFT;
a90bcb86 845
a50b854e 846 if (likely(n <= v && v <= (page_size(head))))
72e809ed
AV
847 return true;
848 WARN_ON(1);
849 return false;
850}
cbbd26b8 851
08aa6479 852static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
62a8067a
AV
853 struct iov_iter *i)
854{
28f38db7
AV
855 if (likely(iter_is_iovec(i)))
856 return copy_page_to_iter_iovec(page, offset, bytes, i);
857 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
d271524a
AV
858 void *kaddr = kmap_atomic(page);
859 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
860 kunmap_atomic(kaddr);
861 return wanted;
28f38db7
AV
862 }
863 if (iov_iter_is_pipe(i))
864 return copy_page_to_iter_pipe(page, offset, bytes, i);
865 if (unlikely(iov_iter_is_discard(i))) {
a506abc7
AV
866 if (unlikely(i->count < bytes))
867 bytes = i->count;
868 i->count -= bytes;
9ea9ce04 869 return bytes;
28f38db7
AV
870 }
871 WARN_ON(1);
872 return 0;
62a8067a 873}
08aa6479
AV
874
875size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
876 struct iov_iter *i)
877{
878 size_t res = 0;
879 if (unlikely(!page_copy_sane(page, offset, bytes)))
880 return 0;
881 page += offset / PAGE_SIZE; // first subpage
882 offset %= PAGE_SIZE;
883 while (1) {
884 size_t n = __copy_page_to_iter(page, offset,
885 min(bytes, (size_t)PAGE_SIZE - offset), i);
886 res += n;
887 bytes -= n;
888 if (!bytes || !n)
889 break;
890 offset += n;
891 if (offset == PAGE_SIZE) {
892 page++;
893 offset = 0;
894 }
895 }
896 return res;
897}
62a8067a
AV
898EXPORT_SYMBOL(copy_page_to_iter);
899
900size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
901 struct iov_iter *i)
902{
72e809ed
AV
903 if (unlikely(!page_copy_sane(page, offset, bytes)))
904 return 0;
28f38db7
AV
905 if (likely(iter_is_iovec(i)))
906 return copy_page_from_iter_iovec(page, offset, bytes, i);
907 if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
d271524a 908 void *kaddr = kmap_atomic(page);
aa28de27 909 size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
d271524a
AV
910 kunmap_atomic(kaddr);
911 return wanted;
28f38db7
AV
912 }
913 WARN_ON(1);
914 return 0;
62a8067a
AV
915}
916EXPORT_SYMBOL(copy_page_from_iter);
917
241699cd
AV
918static size_t pipe_zero(size_t bytes, struct iov_iter *i)
919{
920 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
921 unsigned int p_mask = pipe->ring_size - 1;
922 unsigned int i_head;
241699cd 923 size_t n, off;
241699cd
AV
924
925 if (!sanity(i))
926 return 0;
927
8cefc107 928 bytes = n = push_pipe(i, bytes, &i_head, &off);
241699cd
AV
929 if (unlikely(!n))
930 return 0;
931
8cefc107 932 do {
241699cd 933 size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
8cefc107
DH
934 memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
935 i->head = i_head;
241699cd
AV
936 i->iov_offset = off + chunk;
937 n -= chunk;
8cefc107
DH
938 off = 0;
939 i_head++;
940 } while (n);
241699cd
AV
941 i->count -= bytes;
942 return bytes;
943}
944
c35e0248
MW
945size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
946{
00e23707 947 if (unlikely(iov_iter_is_pipe(i)))
241699cd 948 return pipe_zero(bytes, i);
8442fa46 949 iterate_and_advance(i, bytes, v,
09fc68dc 950 clear_user(v.iov_base, v.iov_len),
a280455f 951 memzero_page(v.bv_page, v.bv_offset, v.bv_len),
7ff50620
DH
952 memset(v.iov_base, 0, v.iov_len),
953 memzero_page(v.bv_page, v.bv_offset, v.bv_len)
8442fa46
AV
954 )
955
956 return bytes;
c35e0248
MW
957}
958EXPORT_SYMBOL(iov_iter_zero);
959
f0b65f39
AV
960size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
961 struct iov_iter *i)
62a8067a 962{
04a31165 963 char *kaddr = kmap_atomic(page), *p = kaddr + offset;
72e809ed
AV
964 if (unlikely(!page_copy_sane(page, offset, bytes))) {
965 kunmap_atomic(kaddr);
966 return 0;
967 }
9ea9ce04 968 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
241699cd
AV
969 kunmap_atomic(kaddr);
970 WARN_ON(1);
971 return 0;
972 }
f0b65f39 973 iterate_and_advance(i, bytes, v,
09fc68dc 974 copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
04a31165 975 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
a280455f 976 v.bv_offset, v.bv_len),
7ff50620
DH
977 memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
978 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
979 v.bv_offset, v.bv_len)
04a31165
AV
980 )
981 kunmap_atomic(kaddr);
982 return bytes;
62a8067a 983}
f0b65f39 984EXPORT_SYMBOL(copy_page_from_iter_atomic);
62a8067a 985
b9dc6f65
AV
986static inline void pipe_truncate(struct iov_iter *i)
987{
988 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
989 unsigned int p_tail = pipe->tail;
990 unsigned int p_head = pipe->head;
991 unsigned int p_mask = pipe->ring_size - 1;
992
993 if (!pipe_empty(p_head, p_tail)) {
994 struct pipe_buffer *buf;
995 unsigned int i_head = i->head;
b9dc6f65 996 size_t off = i->iov_offset;
8cefc107 997
b9dc6f65 998 if (off) {
8cefc107
DH
999 buf = &pipe->bufs[i_head & p_mask];
1000 buf->len = off - buf->offset;
1001 i_head++;
b9dc6f65 1002 }
8cefc107
DH
1003 while (p_head != i_head) {
1004 p_head--;
1005 pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
b9dc6f65 1006 }
8cefc107
DH
1007
1008 pipe->head = p_head;
b9dc6f65
AV
1009 }
1010}
1011
241699cd
AV
1012static void pipe_advance(struct iov_iter *i, size_t size)
1013{
1014 struct pipe_inode_info *pipe = i->pipe;
241699cd 1015 if (size) {
b9dc6f65 1016 struct pipe_buffer *buf;
8cefc107
DH
1017 unsigned int p_mask = pipe->ring_size - 1;
1018 unsigned int i_head = i->head;
b9dc6f65 1019 size_t off = i->iov_offset, left = size;
8cefc107 1020
241699cd 1021 if (off) /* make it relative to the beginning of buffer */
8cefc107 1022 left += off - pipe->bufs[i_head & p_mask].offset;
241699cd 1023 while (1) {
8cefc107 1024 buf = &pipe->bufs[i_head & p_mask];
b9dc6f65 1025 if (left <= buf->len)
241699cd 1026 break;
b9dc6f65 1027 left -= buf->len;
8cefc107 1028 i_head++;
241699cd 1029 }
8cefc107 1030 i->head = i_head;
b9dc6f65 1031 i->iov_offset = buf->offset + left;
241699cd 1032 }
b9dc6f65
AV
1033 i->count -= size;
1034 /* ... and discard everything past that point */
1035 pipe_truncate(i);
241699cd
AV
1036}
1037
54c8195b
PB
1038static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
1039{
1040 struct bvec_iter bi;
1041
1042 bi.bi_size = i->count;
1043 bi.bi_bvec_done = i->iov_offset;
1044 bi.bi_idx = 0;
1045 bvec_iter_advance(i->bvec, &bi, size);
1046
1047 i->bvec += bi.bi_idx;
1048 i->nr_segs -= bi.bi_idx;
1049 i->count = bi.bi_size;
1050 i->iov_offset = bi.bi_bvec_done;
1051}
1052
185ac4d4
AV
1053static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1054{
1055 const struct iovec *iov, *end;
1056
1057 if (!i->count)
1058 return;
1059 i->count -= size;
1060
1061 size += i->iov_offset; // from beginning of current segment
1062 for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1063 if (likely(size < iov->iov_len))
1064 break;
1065 size -= iov->iov_len;
1066 }
1067 i->iov_offset = size;
1068 i->nr_segs -= iov - i->iov;
1069 i->iov = iov;
1070}
1071
62a8067a
AV
1072void iov_iter_advance(struct iov_iter *i, size_t size)
1073{
3b3fc051
AV
1074 if (unlikely(i->count < size))
1075 size = i->count;
185ac4d4
AV
1076 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1077 /* iovec and kvec have identical layouts */
1078 iov_iter_iovec_advance(i, size);
1079 } else if (iov_iter_is_bvec(i)) {
1080 iov_iter_bvec_advance(i, size);
1081 } else if (iov_iter_is_pipe(i)) {
241699cd 1082 pipe_advance(i, size);
185ac4d4 1083 } else if (unlikely(iov_iter_is_xarray(i))) {
7ff50620
DH
1084 i->iov_offset += size;
1085 i->count -= size;
185ac4d4
AV
1086 } else if (iov_iter_is_discard(i)) {
1087 i->count -= size;
54c8195b 1088 }
62a8067a
AV
1089}
1090EXPORT_SYMBOL(iov_iter_advance);
1091
27c0e374
AV
1092void iov_iter_revert(struct iov_iter *i, size_t unroll)
1093{
1094 if (!unroll)
1095 return;
5b47d59a
AV
1096 if (WARN_ON(unroll > MAX_RW_COUNT))
1097 return;
27c0e374 1098 i->count += unroll;
00e23707 1099 if (unlikely(iov_iter_is_pipe(i))) {
27c0e374 1100 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
1101 unsigned int p_mask = pipe->ring_size - 1;
1102 unsigned int i_head = i->head;
27c0e374
AV
1103 size_t off = i->iov_offset;
1104 while (1) {
8cefc107
DH
1105 struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1106 size_t n = off - b->offset;
27c0e374 1107 if (unroll < n) {
4fa55cef 1108 off -= unroll;
27c0e374
AV
1109 break;
1110 }
1111 unroll -= n;
8cefc107 1112 if (!unroll && i_head == i->start_head) {
27c0e374
AV
1113 off = 0;
1114 break;
1115 }
8cefc107
DH
1116 i_head--;
1117 b = &pipe->bufs[i_head & p_mask];
1118 off = b->offset + b->len;
27c0e374
AV
1119 }
1120 i->iov_offset = off;
8cefc107 1121 i->head = i_head;
27c0e374
AV
1122 pipe_truncate(i);
1123 return;
1124 }
9ea9ce04
DH
1125 if (unlikely(iov_iter_is_discard(i)))
1126 return;
27c0e374
AV
1127 if (unroll <= i->iov_offset) {
1128 i->iov_offset -= unroll;
1129 return;
1130 }
1131 unroll -= i->iov_offset;
7ff50620
DH
1132 if (iov_iter_is_xarray(i)) {
1133 BUG(); /* We should never go beyond the start of the specified
1134 * range since we might then be straying into pages that
1135 * aren't pinned.
1136 */
1137 } else if (iov_iter_is_bvec(i)) {
27c0e374
AV
1138 const struct bio_vec *bvec = i->bvec;
1139 while (1) {
1140 size_t n = (--bvec)->bv_len;
1141 i->nr_segs++;
1142 if (unroll <= n) {
1143 i->bvec = bvec;
1144 i->iov_offset = n - unroll;
1145 return;
1146 }
1147 unroll -= n;
1148 }
1149 } else { /* same logics for iovec and kvec */
1150 const struct iovec *iov = i->iov;
1151 while (1) {
1152 size_t n = (--iov)->iov_len;
1153 i->nr_segs++;
1154 if (unroll <= n) {
1155 i->iov = iov;
1156 i->iov_offset = n - unroll;
1157 return;
1158 }
1159 unroll -= n;
1160 }
1161 }
1162}
1163EXPORT_SYMBOL(iov_iter_revert);
1164
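/*
 * Illustrative sketch, not in the upstream file: the advance/revert pair
 * lets a caller consume iterator space speculatively and hand it back if
 * the operation fails, which is how ->write_iter() implementations
 * typically restore the iterator on error.  Names are hypothetical.
 */
static __maybe_unused ssize_t example_speculative_copy(void *dst, size_t len,
							struct iov_iter *i,
							bool consumer_failed)
{
	size_t copied = copy_from_iter(dst, len, i);	/* advances the iter */

	if (consumer_failed) {
		/* give the bytes back so the caller can retry later */
		iov_iter_revert(i, copied);
		return -EAGAIN;
	}
	return copied;
}
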
62a8067a
AV
1165/*
1166 * Return the count of just the current iov_iter segment.
1167 */
1168size_t iov_iter_single_seg_count(const struct iov_iter *i)
1169{
28f38db7
AV
1170 if (i->nr_segs > 1) {
1171 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1172 return min(i->count, i->iov->iov_len - i->iov_offset);
1173 if (iov_iter_is_bvec(i))
1174 return min(i->count, i->bvec->bv_len - i->iov_offset);
1175 }
1176 return i->count;
62a8067a
AV
1177}
1178EXPORT_SYMBOL(iov_iter_single_seg_count);
1179
aa563d7b 1180void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
05afcb77 1181 const struct kvec *kvec, unsigned long nr_segs,
abb78f87
AV
1182 size_t count)
1183{
aa563d7b 1184 WARN_ON(direction & ~(READ | WRITE));
8cd54c1c
AV
1185 *i = (struct iov_iter){
1186 .iter_type = ITER_KVEC,
1187 .data_source = direction,
1188 .kvec = kvec,
1189 .nr_segs = nr_segs,
1190 .iov_offset = 0,
1191 .count = count
1192 };
abb78f87
AV
1193}
1194EXPORT_SYMBOL(iov_iter_kvec);
1195
aa563d7b 1196void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
05afcb77
AV
1197 const struct bio_vec *bvec, unsigned long nr_segs,
1198 size_t count)
1199{
aa563d7b 1200 WARN_ON(direction & ~(READ | WRITE));
8cd54c1c
AV
1201 *i = (struct iov_iter){
1202 .iter_type = ITER_BVEC,
1203 .data_source = direction,
1204 .bvec = bvec,
1205 .nr_segs = nr_segs,
1206 .iov_offset = 0,
1207 .count = count
1208 };
05afcb77
AV
1209}
1210EXPORT_SYMBOL(iov_iter_bvec);
1211
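/*
 * Illustrative sketch, not in the upstream file: wrapping a kernel page
 * in a single-element bio_vec iterator, the usual way block and network
 * code hands page-backed data to the common copy helpers.  Names are
 * hypothetical.
 */
static __maybe_unused size_t example_copy_from_page(struct page *page,
		unsigned int offset, unsigned int len, void *dst)
{
	struct bio_vec bvec = {
		.bv_page	= page,
		.bv_offset	= offset,
		.bv_len		= len,
	};
	struct iov_iter iter;

	iov_iter_bvec(&iter, WRITE, &bvec, 1, len);	/* iter is the source */
	return copy_from_iter(dst, len, &iter);
}
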
aa563d7b 1212void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
241699cd
AV
1213 struct pipe_inode_info *pipe,
1214 size_t count)
1215{
aa563d7b 1216 BUG_ON(direction != READ);
8cefc107 1217 WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
8cd54c1c
AV
1218 *i = (struct iov_iter){
1219 .iter_type = ITER_PIPE,
1220 .data_source = false,
1221 .pipe = pipe,
1222 .head = pipe->head,
1223 .start_head = pipe->head,
1224 .iov_offset = 0,
1225 .count = count
1226 };
241699cd
AV
1227}
1228EXPORT_SYMBOL(iov_iter_pipe);
1229
7ff50620
DH
1230/**
1231 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1232 * @i: The iterator to initialise.
1233 * @direction: The direction of the transfer.
1234 * @xarray: The xarray to access.
1235 * @start: The start file position.
1236 * @count: The size of the I/O buffer in bytes.
1237 *
1238 * Set up an I/O iterator to either draw data out of the pages attached to an
1239 * inode or to inject data into those pages. The pages *must* be prevented
1240 * from evaporation, either by taking a ref on them or locking them by the
1241 * caller.
1242 */
1243void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1244 struct xarray *xarray, loff_t start, size_t count)
1245{
1246 BUG_ON(direction & ~1);
8cd54c1c
AV
1247 *i = (struct iov_iter) {
1248 .iter_type = ITER_XARRAY,
1249 .data_source = direction,
1250 .xarray = xarray,
1251 .xarray_start = start,
1252 .count = count,
1253 .iov_offset = 0
1254 };
7ff50620
DH
1255}
1256EXPORT_SYMBOL(iov_iter_xarray);
1257
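/*
 * Illustrative sketch, not in the upstream file: network filesystems use
 * ITER_XARRAY to have data written directly into the pages already cached
 * in a mapping's xarray (READ means the iterator is the destination).
 * "mapping", "pos" and "len" are hypothetical caller state; the pages
 * must already be pinned or locked by the caller, as the comment above
 * requires.
 */
static __maybe_unused void example_xarray_iter(struct address_space *mapping,
						loff_t pos, size_t len,
						struct iov_iter *iter)
{
	iov_iter_xarray(iter, READ, &mapping->i_pages, pos, len);
	/* iter can now be handed to copy_to_iter() / copy_page_to_iter() */
}
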
9ea9ce04
DH
1258/**
1259 * iov_iter_discard - Initialise an I/O iterator that discards data
1260 * @i: The iterator to initialise.
1261 * @direction: The direction of the transfer.
1262 * @count: The size of the I/O buffer in bytes.
1263 *
1264 * Set up an I/O iterator that just discards everything that's written to it.
1265 * It's only available as a READ iterator.
1266 */
1267void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1268{
1269 BUG_ON(direction != READ);
8cd54c1c
AV
1270 *i = (struct iov_iter){
1271 .iter_type = ITER_DISCARD,
1272 .data_source = false,
1273 .count = count,
1274 .iov_offset = 0
1275 };
9ea9ce04
DH
1276}
1277EXPORT_SYMBOL(iov_iter_discard);
1278
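/*
 * Illustrative sketch, not in the upstream file: a discard iterator lets
 * code that only knows how to "copy to an iterator" throw data away, for
 * example when draining bytes a caller asked to skip.  Names are
 * hypothetical.
 */
static __maybe_unused size_t example_drain(const void *src, size_t len)
{
	struct iov_iter iter;

	iov_iter_discard(&iter, READ, len);
	return copy_to_iter(src, len, &iter);	/* consumes len bytes */
}
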
9221d2e3 1279static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
62a8067a 1280{
04a31165
AV
1281 unsigned long res = 0;
1282 size_t size = i->count;
9221d2e3
AV
1283 size_t skip = i->iov_offset;
1284 unsigned k;
1285
1286 for (k = 0; k < i->nr_segs; k++, skip = 0) {
1287 size_t len = i->iov[k].iov_len - skip;
1288 if (len) {
1289 res |= (unsigned long)i->iov[k].iov_base + skip;
1290 if (len > size)
1291 len = size;
1292 res |= len;
1293 size -= len;
1294 if (!size)
1295 break;
1296 }
1297 }
1298 return res;
1299}
04a31165 1300
9221d2e3
AV
1301static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1302{
1303 unsigned res = 0;
1304 size_t size = i->count;
1305 unsigned skip = i->iov_offset;
1306 unsigned k;
1307
1308 for (k = 0; k < i->nr_segs; k++, skip = 0) {
1309 size_t len = i->bvec[k].bv_len - skip;
1310 res |= (unsigned long)i->bvec[k].bv_offset + skip;
1311 if (len > size)
1312 len = size;
1313 res |= len;
1314 size -= len;
1315 if (!size)
1316 break;
1317 }
1318 return res;
1319}
1320
1321unsigned long iov_iter_alignment(const struct iov_iter *i)
1322{
1323 /* iovec and kvec have identical layouts */
1324 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1325 return iov_iter_alignment_iovec(i);
1326
1327 if (iov_iter_is_bvec(i))
1328 return iov_iter_alignment_bvec(i);
1329
1330 if (iov_iter_is_pipe(i)) {
e0ff126e 1331 unsigned int p_mask = i->pipe->ring_size - 1;
9221d2e3 1332 size_t size = i->count;
e0ff126e 1333
8cefc107 1334 if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
241699cd
AV
1335 return size | i->iov_offset;
1336 return size;
1337 }
9221d2e3
AV
1338
1339 if (iov_iter_is_xarray(i))
3d14ec1f 1340 return (i->xarray_start + i->iov_offset) | i->count;
9221d2e3
AV
1341
1342 return 0;
62a8067a
AV
1343}
1344EXPORT_SYMBOL(iov_iter_alignment);
1345
357f435d
AV
1346unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1347{
33844e66 1348 unsigned long res = 0;
610c7a71 1349 unsigned long v = 0;
357f435d 1350 size_t size = i->count;
610c7a71 1351 unsigned k;
357f435d 1352
610c7a71 1353 if (WARN_ON(!iter_is_iovec(i)))
241699cd 1354 return ~0U;
241699cd 1355
610c7a71
AV
1356 for (k = 0; k < i->nr_segs; k++) {
1357 if (i->iov[k].iov_len) {
1358 unsigned long base = (unsigned long)i->iov[k].iov_base;
1359 if (v) // if not the first one
1360 res |= base | v; // this start | previous end
1361 v = base + i->iov[k].iov_len;
1362 if (size <= i->iov[k].iov_len)
1363 break;
1364 size -= i->iov[k].iov_len;
1365 }
1366 }
33844e66 1367 return res;
357f435d
AV
1368}
1369EXPORT_SYMBOL(iov_iter_gap_alignment);
1370
e76b6312 1371static inline ssize_t __pipe_get_pages(struct iov_iter *i,
241699cd
AV
1372 size_t maxsize,
1373 struct page **pages,
8cefc107 1374 int iter_head,
241699cd
AV
1375 size_t *start)
1376{
1377 struct pipe_inode_info *pipe = i->pipe;
8cefc107
DH
1378 unsigned int p_mask = pipe->ring_size - 1;
1379 ssize_t n = push_pipe(i, maxsize, &iter_head, start);
241699cd
AV
1380 if (!n)
1381 return -EFAULT;
1382
1383 maxsize = n;
1384 n += *start;
1689c73a 1385 while (n > 0) {
8cefc107
DH
1386 get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1387 iter_head++;
241699cd
AV
1388 n -= PAGE_SIZE;
1389 }
1390
1391 return maxsize;
1392}
1393
1394static ssize_t pipe_get_pages(struct iov_iter *i,
1395 struct page **pages, size_t maxsize, unsigned maxpages,
1396 size_t *start)
1397{
8cefc107 1398 unsigned int iter_head, npages;
241699cd 1399 size_t capacity;
241699cd
AV
1400
1401 if (!sanity(i))
1402 return -EFAULT;
1403
8cefc107
DH
1404 data_start(i, &iter_head, start);
1405 /* Amount of free space: some of this one + all after this one */
1406 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1407 capacity = min(npages, maxpages) * PAGE_SIZE - *start;
241699cd 1408
8cefc107 1409 return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
241699cd
AV
1410}
1411
7ff50620
DH
1412static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1413 pgoff_t index, unsigned int nr_pages)
1414{
1415 XA_STATE(xas, xa, index);
1416 struct page *page;
1417 unsigned int ret = 0;
1418
1419 rcu_read_lock();
1420 for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1421 if (xas_retry(&xas, page))
1422 continue;
1423
1424 /* Has the page moved or been split? */
1425 if (unlikely(page != xas_reload(&xas))) {
1426 xas_reset(&xas);
1427 continue;
1428 }
1429
1430 pages[ret] = find_subpage(page, xas.xa_index);
1431 get_page(pages[ret]);
1432 if (++ret == nr_pages)
1433 break;
1434 }
1435 rcu_read_unlock();
1436 return ret;
1437}
1438
1439static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1440 struct page **pages, size_t maxsize,
1441 unsigned maxpages, size_t *_start_offset)
1442{
1443 unsigned nr, offset;
1444 pgoff_t index, count;
1445 size_t size = maxsize, actual;
1446 loff_t pos;
1447
1448 if (!size || !maxpages)
1449 return 0;
1450
1451 pos = i->xarray_start + i->iov_offset;
1452 index = pos >> PAGE_SHIFT;
1453 offset = pos & ~PAGE_MASK;
1454 *_start_offset = offset;
1455
1456 count = 1;
1457 if (size > PAGE_SIZE - offset) {
1458 size -= PAGE_SIZE - offset;
1459 count += size >> PAGE_SHIFT;
1460 size &= ~PAGE_MASK;
1461 if (size)
1462 count++;
1463 }
1464
1465 if (count > maxpages)
1466 count = maxpages;
1467
1468 nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1469 if (nr == 0)
1470 return 0;
1471
1472 actual = PAGE_SIZE * nr;
1473 actual -= offset;
1474 if (nr == count && size > 0) {
1475 unsigned last_offset = (nr > 1) ? 0 : offset;
1476 actual -= PAGE_SIZE - (last_offset + size);
1477 }
1478 return actual;
1479}
1480
3d671ca6
AV
1481/* must be done on non-empty ITER_IOVEC one */
1482static unsigned long first_iovec_segment(const struct iov_iter *i,
1483 size_t *size, size_t *start,
1484 size_t maxsize, unsigned maxpages)
1485{
1486 size_t skip;
1487 long k;
1488
1489 for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1490 unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1491 size_t len = i->iov[k].iov_len - skip;
1492
1493 if (unlikely(!len))
1494 continue;
1495 if (len > maxsize)
1496 len = maxsize;
1497 len += (*start = addr % PAGE_SIZE);
1498 if (len > maxpages * PAGE_SIZE)
1499 len = maxpages * PAGE_SIZE;
1500 *size = len;
1501 return addr & PAGE_MASK;
1502 }
1503 BUG(); // if it had been empty, we wouldn't get called
1504}
1505
1506/* must be done on non-empty ITER_BVEC one */
1507static struct page *first_bvec_segment(const struct iov_iter *i,
1508 size_t *size, size_t *start,
1509 size_t maxsize, unsigned maxpages)
1510{
1511 struct page *page;
1512 size_t skip = i->iov_offset, len;
1513
1514 len = i->bvec->bv_len - skip;
1515 if (len > maxsize)
1516 len = maxsize;
1517 skip += i->bvec->bv_offset;
1518 page = i->bvec->bv_page + skip / PAGE_SIZE;
1519 len += (*start = skip % PAGE_SIZE);
1520 if (len > maxpages * PAGE_SIZE)
1521 len = maxpages * PAGE_SIZE;
1522 *size = len;
1523 return page;
1524}
1525
62a8067a 1526ssize_t iov_iter_get_pages(struct iov_iter *i,
2c80929c 1527 struct page **pages, size_t maxsize, unsigned maxpages,
62a8067a
AV
1528 size_t *start)
1529{
3d671ca6
AV
1530 size_t len;
1531 int n, res;
1532
e5393fae
AV
1533 if (maxsize > i->count)
1534 maxsize = i->count;
3d671ca6
AV
1535 if (!maxsize)
1536 return 0;
e5393fae 1537
3d671ca6
AV
1538 if (likely(iter_is_iovec(i))) {
1539 unsigned long addr;
e5393fae 1540
3d671ca6 1541 addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
e5393fae 1542 n = DIV_ROUND_UP(len, PAGE_SIZE);
73b0140b
IW
1543 res = get_user_pages_fast(addr, n,
1544 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0,
1545 pages);
e5393fae
AV
1546 if (unlikely(res < 0))
1547 return res;
1548 return (res == n ? len : res * PAGE_SIZE) - *start;
3d671ca6
AV
1549 }
1550 if (iov_iter_is_bvec(i)) {
1551 struct page *page;
1552
1553 page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1554 n = DIV_ROUND_UP(len, PAGE_SIZE);
1555 while (n--)
1556 get_page(*pages++ = page++);
1557 return len - *start;
1558 }
1559 if (iov_iter_is_pipe(i))
1560 return pipe_get_pages(i, pages, maxsize, maxpages, start);
1561 if (iov_iter_is_xarray(i))
1562 return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1563 return -EFAULT;
62a8067a
AV
1564}
1565EXPORT_SYMBOL(iov_iter_get_pages);
1566
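/*
 * Illustrative sketch, not in the upstream file: a direct-I/O style user
 * of iov_iter_get_pages(), pinning at most a handful of pages and then
 * advancing the iterator past the bytes that were covered.  Names and the
 * fixed array size are hypothetical.
 */
static __maybe_unused ssize_t example_pin_pages(struct iov_iter *i)
{
	struct page *pages[16];
	size_t offset;
	ssize_t bytes;

	bytes = iov_iter_get_pages(i, pages, iov_iter_count(i),
				   ARRAY_SIZE(pages), &offset);
	if (bytes > 0)
		iov_iter_advance(i, bytes);	/* get_pages does not advance */
	/* the caller maps/uses the pages, then put_page() on each of them */
	return bytes;
}
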
1b17f1f2
AV
1567static struct page **get_pages_array(size_t n)
1568{
752ade68 1569 return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1b17f1f2
AV
1570}
1571
241699cd
AV
1572static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1573 struct page ***pages, size_t maxsize,
1574 size_t *start)
1575{
1576 struct page **p;
8cefc107 1577 unsigned int iter_head, npages;
d7760d63 1578 ssize_t n;
241699cd
AV
1579
1580 if (!sanity(i))
1581 return -EFAULT;
1582
8cefc107
DH
1583 data_start(i, &iter_head, start);
1584 /* Amount of free space: some of this one + all after this one */
1585 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
241699cd
AV
1586 n = npages * PAGE_SIZE - *start;
1587 if (maxsize > n)
1588 maxsize = n;
1589 else
1590 npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1591 p = get_pages_array(npages);
1592 if (!p)
1593 return -ENOMEM;
8cefc107 1594 n = __pipe_get_pages(i, maxsize, p, iter_head, start);
241699cd
AV
1595 if (n > 0)
1596 *pages = p;
1597 else
1598 kvfree(p);
1599 return n;
1600}
1601
7ff50620
DH
1602static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1603 struct page ***pages, size_t maxsize,
1604 size_t *_start_offset)
1605{
1606 struct page **p;
1607 unsigned nr, offset;
1608 pgoff_t index, count;
1609 size_t size = maxsize, actual;
1610 loff_t pos;
1611
1612 if (!size)
1613 return 0;
1614
1615 pos = i->xarray_start + i->iov_offset;
1616 index = pos >> PAGE_SHIFT;
1617 offset = pos & ~PAGE_MASK;
1618 *_start_offset = offset;
1619
1620 count = 1;
1621 if (size > PAGE_SIZE - offset) {
1622 size -= PAGE_SIZE - offset;
1623 count += size >> PAGE_SHIFT;
1624 size &= ~PAGE_MASK;
1625 if (size)
1626 count++;
1627 }
1628
1629 p = get_pages_array(count);
1630 if (!p)
1631 return -ENOMEM;
1632 *pages = p;
1633
1634 nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1635 if (nr == 0)
1636 return 0;
1637
1638 actual = PAGE_SIZE * nr;
1639 actual -= offset;
1640 if (nr == count && size > 0) {
1641 unsigned last_offset = (nr > 1) ? 0 : offset;
1642 actual -= PAGE_SIZE - (last_offset + size);
1643 }
1644 return actual;
1645}
1646
62a8067a
AV
1647ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1648 struct page ***pages, size_t maxsize,
1649 size_t *start)
1650{
1b17f1f2 1651 struct page **p;
3d671ca6
AV
1652 size_t len;
1653 int n, res;
1b17f1f2
AV
1654
1655 if (maxsize > i->count)
1656 maxsize = i->count;
3d671ca6
AV
1657 if (!maxsize)
1658 return 0;
1b17f1f2 1659
3d671ca6
AV
1660 if (likely(iter_is_iovec(i))) {
1661 unsigned long addr;
1b17f1f2 1662
3d671ca6 1663 addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1b17f1f2
AV
1664 n = DIV_ROUND_UP(len, PAGE_SIZE);
1665 p = get_pages_array(n);
1666 if (!p)
1667 return -ENOMEM;
73b0140b
IW
1668 res = get_user_pages_fast(addr, n,
1669 iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p);
1b17f1f2
AV
1670 if (unlikely(res < 0)) {
1671 kvfree(p);
1672 return res;
1673 }
1674 *pages = p;
1675 return (res == n ? len : res * PAGE_SIZE) - *start;
3d671ca6
AV
1676 }
1677 if (iov_iter_is_bvec(i)) {
1678 struct page *page;
1679
1680 page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1681 n = DIV_ROUND_UP(len, PAGE_SIZE);
1682 *pages = p = get_pages_array(n);
1b17f1f2
AV
1683 if (!p)
1684 return -ENOMEM;
3d671ca6
AV
1685 while (n--)
1686 get_page(*p++ = page++);
1687 return len - *start;
1688 }
1689 if (iov_iter_is_pipe(i))
1690 return pipe_get_pages_alloc(i, pages, maxsize, start);
1691 if (iov_iter_is_xarray(i))
1692 return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1693 return -EFAULT;
62a8067a
AV
1694}
1695EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1696
a604ec7e
AV
1697size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1698 struct iov_iter *i)
1699{
1700 char *to = addr;
1701 __wsum sum, next;
1702 size_t off = 0;
a604ec7e 1703 sum = *csum;
9ea9ce04 1704 if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
241699cd
AV
1705 WARN_ON(1);
1706 return 0;
1707 }
a604ec7e 1708 iterate_and_advance(i, bytes, v, ({
cbbd26b8 1709 next = csum_and_copy_from_user(v.iov_base,
a604ec7e 1710 (to += v.iov_len) - v.iov_len,
c693cc46
AV
1711 v.iov_len);
1712 if (next) {
a604ec7e
AV
1713 sum = csum_block_add(sum, next, off);
1714 off += v.iov_len;
1715 }
c693cc46 1716 next ? 0 : v.iov_len;
a604ec7e
AV
1717 }), ({
1718 char *p = kmap_atomic(v.bv_page);
f9152895
AV
1719 sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1720 p + v.bv_offset, v.bv_len,
1721 sum, off);
a604ec7e 1722 kunmap_atomic(p);
a604ec7e
AV
1723 off += v.bv_len;
1724 }),({
f9152895
AV
1725 sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1726 v.iov_base, v.iov_len,
1727 sum, off);
a604ec7e 1728 off += v.iov_len;
7ff50620
DH
1729 }), ({
1730 char *p = kmap_atomic(v.bv_page);
1731 sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1732 p + v.bv_offset, v.bv_len,
1733 sum, off);
1734 kunmap_atomic(p);
1735 off += v.bv_len;
a604ec7e
AV
1736 })
1737 )
1738 *csum = sum;
1739 return bytes;
1740}
1741EXPORT_SYMBOL(csum_and_copy_from_iter);
1742
52cbd23a 1743size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
a604ec7e
AV
1744 struct iov_iter *i)
1745{
52cbd23a 1746 struct csum_state *csstate = _csstate;
36f7a8a4 1747 const char *from = addr;
a604ec7e 1748 __wsum sum, next;
52cbd23a 1749 size_t off;
78e1f386
AV
1750
1751 if (unlikely(iov_iter_is_pipe(i)))
52cbd23a 1752 return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
78e1f386 1753
594e450b
AV
1754 sum = csum_shift(csstate->csum, csstate->off);
1755 off = 0;
78e1f386 1756 if (unlikely(iov_iter_is_discard(i))) {
241699cd
AV
1757 WARN_ON(1); /* for now */
1758 return 0;
1759 }
a604ec7e 1760 iterate_and_advance(i, bytes, v, ({
a604ec7e 1761 next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
cbbd26b8 1762 v.iov_base,
c693cc46
AV
1763 v.iov_len);
1764 if (next) {
a604ec7e
AV
1765 sum = csum_block_add(sum, next, off);
1766 off += v.iov_len;
1767 }
c693cc46 1768 next ? 0 : v.iov_len;
a604ec7e
AV
1769 }), ({
1770 char *p = kmap_atomic(v.bv_page);
f9152895
AV
1771 sum = csum_and_memcpy(p + v.bv_offset,
1772 (from += v.bv_len) - v.bv_len,
1773 v.bv_len, sum, off);
a604ec7e 1774 kunmap_atomic(p);
a604ec7e
AV
1775 off += v.bv_len;
1776 }),({
f9152895
AV
1777 sum = csum_and_memcpy(v.iov_base,
1778 (from += v.iov_len) - v.iov_len,
1779 v.iov_len, sum, off);
a604ec7e 1780 off += v.iov_len;
7ff50620
DH
1781 }), ({
1782 char *p = kmap_atomic(v.bv_page);
1783 sum = csum_and_memcpy(p + v.bv_offset,
1784 (from += v.bv_len) - v.bv_len,
1785 v.bv_len, sum, off);
1786 kunmap_atomic(p);
1787 off += v.bv_len;
a604ec7e
AV
1788 })
1789 )
594e450b
AV
1790 csstate->csum = csum_shift(sum, csstate->off);
1791 csstate->off += bytes;
a604ec7e
AV
1792 return bytes;
1793}
1794EXPORT_SYMBOL(csum_and_copy_to_iter);
1795
d05f4435
SG
1796size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1797 struct iov_iter *i)
1798{
7999096f 1799#ifdef CONFIG_CRYPTO_HASH
d05f4435
SG
1800 struct ahash_request *hash = hashp;
1801 struct scatterlist sg;
1802 size_t copied;
1803
1804 copied = copy_to_iter(addr, bytes, i);
1805 sg_init_one(&sg, addr, copied);
1806 ahash_request_set_crypt(hash, &sg, NULL, copied);
1807 crypto_ahash_update(hash);
1808 return copied;
27fad74a
Y
1809#else
1810 return 0;
1811#endif
d05f4435
SG
1812}
1813EXPORT_SYMBOL(hash_and_copy_to_iter);
1814
66531c65 1815static int iov_npages(const struct iov_iter *i, int maxpages)
62a8067a 1816{
66531c65
AV
1817 size_t skip = i->iov_offset, size = i->count;
1818 const struct iovec *p;
e0f2dc40
AV
1819 int npages = 0;
1820
66531c65
AV
1821 for (p = i->iov; size; skip = 0, p++) {
1822 unsigned offs = offset_in_page(p->iov_base + skip);
1823 size_t len = min(p->iov_len - skip, size);
e0f2dc40 1824
66531c65
AV
1825 if (len) {
1826 size -= len;
1827 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1828 if (unlikely(npages > maxpages))
1829 return maxpages;
1830 }
1831 }
1832 return npages;
1833}
1834
1835static int bvec_npages(const struct iov_iter *i, int maxpages)
1836{
1837 size_t skip = i->iov_offset, size = i->count;
1838 const struct bio_vec *p;
1839 int npages = 0;
1840
1841 for (p = i->bvec; size; skip = 0, p++) {
1842 unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1843 size_t len = min(p->bv_len - skip, size);
1844
1845 size -= len;
1846 npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1847 if (unlikely(npages > maxpages))
1848 return maxpages;
1849 }
1850 return npages;
1851}
1852
1853int iov_iter_npages(const struct iov_iter *i, int maxpages)
1854{
1855 if (unlikely(!i->count))
1856 return 0;
1857 /* iovec and kvec have identical layouts */
1858 if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1859 return iov_npages(i, maxpages);
1860 if (iov_iter_is_bvec(i))
1861 return bvec_npages(i, maxpages);
1862 if (iov_iter_is_pipe(i)) {
8cefc107 1863 unsigned int iter_head;
66531c65 1864 int npages;
241699cd 1865 size_t off;
241699cd
AV
1866
1867 if (!sanity(i))
1868 return 0;
1869
8cefc107 1870 data_start(i, &iter_head, &off);
241699cd 1871 /* some of this one + all after this one */
66531c65
AV
1872 npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1873 return min(npages, maxpages);
1874 }
1875 if (iov_iter_is_xarray(i)) {
e4f8df86
AV
1876 unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1877 int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
66531c65
AV
1878 return min(npages, maxpages);
1879 }
1880 return 0;
62a8067a 1881}
f67da30c 1882EXPORT_SYMBOL(iov_iter_npages);
4b8164b9
AV
1883
1884const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1885{
1886 *new = *old;
00e23707 1887 if (unlikely(iov_iter_is_pipe(new))) {
241699cd
AV
1888 WARN_ON(1);
1889 return NULL;
1890 }
7ff50620 1891 if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
9ea9ce04 1892 return NULL;
00e23707 1893 if (iov_iter_is_bvec(new))
4b8164b9
AV
1894 return new->bvec = kmemdup(new->bvec,
1895 new->nr_segs * sizeof(struct bio_vec),
1896 flags);
1897 else
1898 /* iovec and kvec have identical layout */
1899 return new->iov = kmemdup(new->iov,
1900 new->nr_segs * sizeof(struct iovec),
1901 flags);
1902}
1903EXPORT_SYMBOL(dup_iter);
bc917be8 1904
bfdc5970
CH
1905static int copy_compat_iovec_from_user(struct iovec *iov,
1906 const struct iovec __user *uvec, unsigned long nr_segs)
1907{
1908 const struct compat_iovec __user *uiov =
1909 (const struct compat_iovec __user *)uvec;
1910 int ret = -EFAULT, i;
1911
a959a978 1912 if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
bfdc5970
CH
1913 return -EFAULT;
1914
1915 for (i = 0; i < nr_segs; i++) {
1916 compat_uptr_t buf;
1917 compat_ssize_t len;
1918
1919 unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1920 unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1921
1922 /* check for compat_size_t not fitting in compat_ssize_t .. */
1923 if (len < 0) {
1924 ret = -EINVAL;
1925 goto uaccess_end;
1926 }
1927 iov[i].iov_base = compat_ptr(buf);
1928 iov[i].iov_len = len;
1929 }
1930
1931 ret = 0;
1932uaccess_end:
1933 user_access_end();
1934 return ret;
1935}
1936
1937static int copy_iovec_from_user(struct iovec *iov,
1938 const struct iovec __user *uvec, unsigned long nr_segs)
fb041b59
DL
1939{
1940 unsigned long seg;
fb041b59 1941
bfdc5970
CH
1942 if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1943 return -EFAULT;
1944 for (seg = 0; seg < nr_segs; seg++) {
1945 if ((ssize_t)iov[seg].iov_len < 0)
1946 return -EINVAL;
fb041b59
DL
1947 }
1948
bfdc5970
CH
1949 return 0;
1950}
1951
1952struct iovec *iovec_from_user(const struct iovec __user *uvec,
1953 unsigned long nr_segs, unsigned long fast_segs,
1954 struct iovec *fast_iov, bool compat)
1955{
1956 struct iovec *iov = fast_iov;
1957 int ret;
1958
fb041b59 1959 /*
bfdc5970
CH
1960 * SuS says "The readv() function *may* fail if the iovcnt argument was
1961 * less than or equal to 0, or greater than {IOV_MAX}. Linux has
1962 * traditionally returned zero for zero segments, so...
fb041b59 1963 */
bfdc5970
CH
1964 if (nr_segs == 0)
1965 return iov;
1966 if (nr_segs > UIO_MAXIOV)
1967 return ERR_PTR(-EINVAL);
fb041b59
DL
1968 if (nr_segs > fast_segs) {
1969 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
bfdc5970
CH
1970 if (!iov)
1971 return ERR_PTR(-ENOMEM);
fb041b59 1972 }
bfdc5970
CH
1973
1974 if (compat)
1975 ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1976 else
1977 ret = copy_iovec_from_user(iov, uvec, nr_segs);
1978 if (ret) {
1979 if (iov != fast_iov)
1980 kfree(iov);
1981 return ERR_PTR(ret);
1982 }
1983
1984 return iov;
1985}
1986
1987ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1988 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1989 struct iov_iter *i, bool compat)
1990{
1991 ssize_t total_len = 0;
1992 unsigned long seg;
1993 struct iovec *iov;
1994
1995 iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1996 if (IS_ERR(iov)) {
1997 *iovp = NULL;
1998 return PTR_ERR(iov);
fb041b59
DL
1999 }
2000
2001 /*
bfdc5970
CH
2002 * According to the Single Unix Specification we should return EINVAL if
2003 * an element length is < 0 when cast to ssize_t or if the total length
2004 * would overflow the ssize_t return value of the system call.
fb041b59
DL
2005 *
2006 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
2007 * overflow case.
2008 */
fb041b59 2009 for (seg = 0; seg < nr_segs; seg++) {
fb041b59
DL
2010 ssize_t len = (ssize_t)iov[seg].iov_len;
2011
bfdc5970
CH
2012 if (!access_ok(iov[seg].iov_base, len)) {
2013 if (iov != *iovp)
2014 kfree(iov);
2015 *iovp = NULL;
2016 return -EFAULT;
fb041b59 2017 }
bfdc5970
CH
2018
2019 if (len > MAX_RW_COUNT - total_len) {
2020 len = MAX_RW_COUNT - total_len;
fb041b59
DL
2021 iov[seg].iov_len = len;
2022 }
bfdc5970 2023 total_len += len;
fb041b59 2024 }
bfdc5970
CH
2025
2026 iov_iter_init(i, type, iov, nr_segs, total_len);
2027 if (iov == *iovp)
2028 *iovp = NULL;
2029 else
2030 *iovp = iov;
2031 return total_len;
fb041b59
DL
2032}
2033
/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in *@iovp.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * Return: Negative error code on error, bytes imported on success.
 */
bfdc5970 2056ssize_t import_iovec(int type, const struct iovec __user *uvec,
bc917be8 2057 unsigned nr_segs, unsigned fast_segs,
bfdc5970 2058 struct iovec **iovp, struct iov_iter *i)
bc917be8 2059{
89cd35c5
CH
2060 return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
2061 in_compat_syscall());
bc917be8
AV
2062}
2063EXPORT_SYMBOL(import_iovec);
2064
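/*
 * Illustrative sketch, not in the upstream file: the canonical
 * readv/writev-style use of import_iovec(), with a small on-stack array
 * that is replaced by a kmalloc'ed one only for large vectors.  Names are
 * hypothetical.
 */
static __maybe_unused ssize_t example_import(const struct iovec __user *uvec,
					     unsigned long nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;
	/* ... hand "iter" to the actual I/O ... */
	kfree(iov);		/* safe: NULL when the stack array was used */
	return ret;
}
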
bc917be8
AV
2065int import_single_range(int rw, void __user *buf, size_t len,
2066 struct iovec *iov, struct iov_iter *i)
2067{
2068 if (len > MAX_RW_COUNT)
2069 len = MAX_RW_COUNT;
96d4f267 2070 if (unlikely(!access_ok(buf, len)))
bc917be8
AV
2071 return -EFAULT;
2072
2073 iov->iov_base = buf;
2074 iov->iov_len = len;
2075 iov_iter_init(i, rw, iov, 1, len);
2076 return 0;
2077}
e1267585 2078EXPORT_SYMBOL(import_single_range);
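
/*
 * Illustrative sketch, not in the upstream file: import_single_range() is
 * the single-buffer analogue used by plain read(2)/write(2)-style paths.
 * Names are hypothetical.
 */
static __maybe_unused ssize_t example_import_single(void __user *buf,
						    size_t len)
{
	struct iovec iov;
	struct iov_iter iter;
	int ret = import_single_range(WRITE, buf, len, &iov, &iter);

	if (ret)
		return ret;
	/* "iter" now describes the user buffer as the data source */
	return iov_iter_count(&iter);
}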