fs: implement kernel_write using __kernel_write
[linux-block.git] / fs / read_write.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
1da177e4
LT
2/*
3 * linux/fs/read_write.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
b12fb7f4 8#include <linux/slab.h>
1da177e4 9#include <linux/stat.h>
b12fb7f4 10#include <linux/sched/xacct.h>
1da177e4
LT
11#include <linux/fcntl.h>
12#include <linux/file.h>
13#include <linux/uio.h>
0eeca283 14#include <linux/fsnotify.h>
1da177e4 15#include <linux/security.h>
630d9c47 16#include <linux/export.h>
1da177e4 17#include <linux/syscalls.h>
e28cc715 18#include <linux/pagemap.h>
d6b29d7c 19#include <linux/splice.h>
561c6731 20#include <linux/compat.h>
29732938 21#include <linux/mount.h>
2feb55f8 22#include <linux/fs.h>
06ae43f3 23#include "internal.h"
1da177e4 24
7c0f6ba6 25#include <linux/uaccess.h>
1da177e4
LT
26#include <asm/unistd.h>
27
4b6f5d20 28const struct file_operations generic_ro_fops = {
1da177e4 29 .llseek = generic_file_llseek,
aad4f8bb 30 .read_iter = generic_file_read_iter,
1da177e4 31 .mmap = generic_file_readonly_mmap,
534f2aaa 32 .splice_read = generic_file_splice_read,
1da177e4
LT
33};
34
35EXPORT_SYMBOL(generic_ro_fops);
36
ddef7ed2 37static inline bool unsigned_offsets(struct file *file)
4a3956c7 38{
cccb5a1e 39 return file->f_mode & FMODE_UNSIGNED_OFFSET;
4a3956c7
KH
40}
41
46a1c2c7
JL
42/**
43 * vfs_setpos - update the file offset for lseek
44 * @file: file structure in question
45 * @offset: file offset to seek to
46 * @maxsize: maximum file size
47 *
48 * This is a low-level filesystem helper for updating the file offset to
49 * the value specified by @offset if the given offset is valid and it is
50 * not equal to the current file offset.
51 *
52 * Return the specified offset on success and -EINVAL on invalid offset.
53 */
54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
ef3d0fd2
AK
55{
56 if (offset < 0 && !unsigned_offsets(file))
57 return -EINVAL;
58 if (offset > maxsize)
59 return -EINVAL;
60
61 if (offset != file->f_pos) {
62 file->f_pos = offset;
63 file->f_version = 0;
64 }
65 return offset;
66}
46a1c2c7 67EXPORT_SYMBOL(vfs_setpos);
ef3d0fd2 68
3a8cff4f 69/**
5760495a 70 * generic_file_llseek_size - generic llseek implementation for regular files
3a8cff4f
CH
71 * @file: file structure to seek on
72 * @offset: file offset to seek to
965c8e59 73 * @whence: type of seek
e8b96eb5
ES
74 * @size: max size of this file in file system
75 * @eof: offset used for SEEK_END position
3a8cff4f 76 *
5760495a 77 * This is a variant of generic_file_llseek that allows passing in a custom
e8b96eb5 78 * maximum file size and a custom EOF position, for e.g. hashed directories
ef3d0fd2
AK
79 *
80 * Synchronization:
5760495a 81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
ef3d0fd2
AK
82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83 * read/writes behave like SEEK_SET against seeks.
3a8cff4f 84 */
9465efc9 85loff_t
965c8e59 86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
e8b96eb5 87 loff_t maxsize, loff_t eof)
1da177e4 88{
965c8e59 89 switch (whence) {
3a8cff4f 90 case SEEK_END:
e8b96eb5 91 offset += eof;
3a8cff4f
CH
92 break;
93 case SEEK_CUR:
5b6f1eb9
AK
94 /*
95 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96 * position-querying operation. Avoid rewriting the "same"
97 * f_pos value back to the file because a concurrent read(),
98 * write() or lseek() might have altered it
99 */
100 if (offset == 0)
101 return file->f_pos;
ef3d0fd2
AK
102 /*
103 * f_lock protects against read/modify/write race with other
104 * SEEK_CURs. Note that parallel writes and reads behave
105 * like SEEK_SET.
106 */
107 spin_lock(&file->f_lock);
46a1c2c7 108 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
ef3d0fd2
AK
109 spin_unlock(&file->f_lock);
110 return offset;
982d8165
JB
111 case SEEK_DATA:
112 /*
113 * In the generic case the entire file is data, so as long as
114 * offset isn't at the end of the file then the offset is data.
115 */
fc46820b 116 if ((unsigned long long)offset >= eof)
982d8165
JB
117 return -ENXIO;
118 break;
119 case SEEK_HOLE:
120 /*
121 * There is a virtual hole at the end of the file, so as long as
122 * offset isn't i_size or larger, return i_size.
123 */
fc46820b 124 if ((unsigned long long)offset >= eof)
982d8165 125 return -ENXIO;
e8b96eb5 126 offset = eof;
982d8165 127 break;
1da177e4 128 }
3a8cff4f 129
46a1c2c7 130 return vfs_setpos(file, offset, maxsize);
5760495a
AK
131}
132EXPORT_SYMBOL(generic_file_llseek_size);
133
134/**
135 * generic_file_llseek - generic llseek implementation for regular files
136 * @file: file structure to seek on
137 * @offset: file offset to seek to
965c8e59 138 * @whence: type of seek
5760495a
AK
139 *
140 * This is a generic implemenation of ->llseek useable for all normal local
141 * filesystems. It just updates the file offset to the value specified by
546ae2d2 142 * @offset and @whence.
5760495a 143 */
965c8e59 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
5760495a
AK
145{
146 struct inode *inode = file->f_mapping->host;
147
965c8e59 148 return generic_file_llseek_size(file, offset, whence,
e8b96eb5
ES
149 inode->i_sb->s_maxbytes,
150 i_size_read(inode));
1da177e4 151}
9465efc9 152EXPORT_SYMBOL(generic_file_llseek);
1da177e4 153
1bf9d14d
AV
154/**
155 * fixed_size_llseek - llseek implementation for fixed-sized devices
156 * @file: file structure to seek on
157 * @offset: file offset to seek to
158 * @whence: type of seek
159 * @size: size of the file
160 *
161 */
162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163{
164 switch (whence) {
165 case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 return generic_file_llseek_size(file, offset, whence,
167 size, size);
168 default:
169 return -EINVAL;
170 }
171}
172EXPORT_SYMBOL(fixed_size_llseek);
173
b25472f9
AV
174/**
175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
176 * @file: file structure to seek on
177 * @offset: file offset to seek to
178 * @whence: type of seek
179 *
180 */
181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182{
183 switch (whence) {
184 case SEEK_SET: case SEEK_CUR:
185 return generic_file_llseek_size(file, offset, whence,
2feb55f8 186 OFFSET_MAX, 0);
b25472f9
AV
187 default:
188 return -EINVAL;
189 }
190}
191EXPORT_SYMBOL(no_seek_end_llseek);
192
193/**
194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195 * @file: file structure to seek on
196 * @offset: file offset to seek to
197 * @whence: type of seek
198 * @size: maximal offset allowed
199 *
200 */
201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202{
203 switch (whence) {
204 case SEEK_SET: case SEEK_CUR:
205 return generic_file_llseek_size(file, offset, whence,
206 size, 0);
207 default:
208 return -EINVAL;
209 }
210}
211EXPORT_SYMBOL(no_seek_end_llseek_size);
212
ae6afc3f
B
213/**
214 * noop_llseek - No Operation Performed llseek implementation
215 * @file: file structure to seek on
216 * @offset: file offset to seek to
965c8e59 217 * @whence: type of seek
ae6afc3f
B
218 *
219 * This is an implementation of ->llseek useable for the rare special case when
220 * userspace expects the seek to succeed but the (device) file is actually not
221 * able to perform the seek. In this case you use noop_llseek() instead of
222 * falling back to the default implementation of ->llseek.
223 */
965c8e59 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
ae6afc3f
B
225{
226 return file->f_pos;
227}
228EXPORT_SYMBOL(noop_llseek);
229
965c8e59 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
231{
232 return -ESPIPE;
233}
234EXPORT_SYMBOL(no_llseek);
235
965c8e59 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
1da177e4 237{
496ad9aa 238 struct inode *inode = file_inode(file);
16abef0e 239 loff_t retval;
1da177e4 240
5955102c 241 inode_lock(inode);
965c8e59 242 switch (whence) {
7b8e8924 243 case SEEK_END:
982d8165 244 offset += i_size_read(inode);
1da177e4 245 break;
7b8e8924 246 case SEEK_CUR:
5b6f1eb9
AK
247 if (offset == 0) {
248 retval = file->f_pos;
249 goto out;
250 }
1da177e4 251 offset += file->f_pos;
982d8165
JB
252 break;
253 case SEEK_DATA:
254 /*
255 * In the generic case the entire file is data, so as
256 * long as offset isn't at the end of the file then the
257 * offset is data.
258 */
bacb2d81
DC
259 if (offset >= inode->i_size) {
260 retval = -ENXIO;
261 goto out;
262 }
982d8165
JB
263 break;
264 case SEEK_HOLE:
265 /*
266 * There is a virtual hole at the end of the file, so
267 * as long as offset isn't i_size or larger, return
268 * i_size.
269 */
bacb2d81
DC
270 if (offset >= inode->i_size) {
271 retval = -ENXIO;
272 goto out;
273 }
982d8165
JB
274 offset = inode->i_size;
275 break;
1da177e4
LT
276 }
277 retval = -EINVAL;
cccb5a1e 278 if (offset >= 0 || unsigned_offsets(file)) {
1da177e4
LT
279 if (offset != file->f_pos) {
280 file->f_pos = offset;
281 file->f_version = 0;
282 }
283 retval = offset;
284 }
5b6f1eb9 285out:
5955102c 286 inode_unlock(inode);
1da177e4
LT
287 return retval;
288}
289EXPORT_SYMBOL(default_llseek);
290
965c8e59 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
292{
293 loff_t (*fn)(struct file *, loff_t, int);
294
295 fn = no_llseek;
296 if (file->f_mode & FMODE_LSEEK) {
72c2d531 297 if (file->f_op->llseek)
1da177e4
LT
298 fn = file->f_op->llseek;
299 }
965c8e59 300 return fn(file, offset, whence);
1da177e4
LT
301}
302EXPORT_SYMBOL(vfs_llseek);
303
76847e43 304off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
1da177e4
LT
305{
306 off_t retval;
9c225f26 307 struct fd f = fdget_pos(fd);
2903ff01
AV
308 if (!f.file)
309 return -EBADF;
1da177e4
LT
310
311 retval = -EINVAL;
965c8e59
AM
312 if (whence <= SEEK_MAX) {
313 loff_t res = vfs_llseek(f.file, offset, whence);
1da177e4
LT
314 retval = res;
315 if (res != (loff_t)retval)
316 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
317 }
9c225f26 318 fdput_pos(f);
1da177e4
LT
319 return retval;
320}
321
76847e43
DB
322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
323{
324 return ksys_lseek(fd, offset, whence);
325}
326
561c6731
AV
#ifdef CONFIG_COMPAT
/*
 * Compat lseek entry point: takes a compat_off_t offset and hands it to
 * ksys_lseek() like the native call does.
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
333
9e62ccec
MS
334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
335 defined(__ARCH_WANT_SYS_LLSEEK)
003d7ab4
HC
336SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
337 unsigned long, offset_low, loff_t __user *, result,
965c8e59 338 unsigned int, whence)
1da177e4
LT
339{
340 int retval;
d7a15f8d 341 struct fd f = fdget_pos(fd);
1da177e4 342 loff_t offset;
1da177e4 343
2903ff01
AV
344 if (!f.file)
345 return -EBADF;
1da177e4
LT
346
347 retval = -EINVAL;
965c8e59 348 if (whence > SEEK_MAX)
1da177e4
LT
349 goto out_putf;
350
2903ff01 351 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
965c8e59 352 whence);
1da177e4
LT
353
354 retval = (int)offset;
355 if (offset >= 0) {
356 retval = -EFAULT;
357 if (!copy_to_user(result, &offset, sizeof(offset)))
358 retval = 0;
359 }
360out_putf:
d7a15f8d 361 fdput_pos(f);
1da177e4
LT
362 return retval;
363}
364#endif
365
68d70d03 366int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
1da177e4
LT
367{
368 struct inode *inode;
c43e259c 369 int retval = -EINVAL;
1da177e4 370
496ad9aa 371 inode = file_inode(file);
e28cc715 372 if (unlikely((ssize_t) count < 0))
c43e259c 373 return retval;
1da177e4 374
438ab720
KS
375 /*
376 * ranged mandatory locking does not apply to streams - it makes sense
377 * only for files where position has a meaning.
378 */
379 if (ppos) {
380 loff_t pos = *ppos;
381
382 if (unlikely(pos < 0)) {
383 if (!unsigned_offsets(file))
384 return retval;
385 if (count >= -pos) /* both values are in 0..LLONG_MAX */
386 return -EOVERFLOW;
387 } else if (unlikely((loff_t) (pos + count) < 0)) {
388 if (!unsigned_offsets(file))
389 return retval;
390 }
391
392 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
393 retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
394 read_write == READ ? F_RDLCK : F_WRLCK);
395 if (retval < 0)
396 return retval;
397 }
e28cc715 398 }
438ab720 399
bc61384d 400 return security_file_permission(file,
c43e259c 401 read_write == READ ? MAY_READ : MAY_WRITE);
1da177e4
LT
402}
403
5d5d5689 404static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
293bc982
AV
405{
406 struct iovec iov = { .iov_base = buf, .iov_len = len };
407 struct kiocb kiocb;
408 struct iov_iter iter;
409 ssize_t ret;
410
411 init_sync_kiocb(&kiocb, filp);
438ab720 412 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982
AV
413 iov_iter_init(&iter, READ, &iov, 1, len);
414
bb7462b6 415 ret = call_read_iter(filp, &kiocb, &iter);
599bd19b 416 BUG_ON(ret == -EIOCBQUEUED);
438ab720
KS
417 if (ppos)
418 *ppos = kiocb.ki_pos;
293bc982
AV
419 return ret;
420}
421
6fb5032e
DK
422ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
423 loff_t *pos)
424{
6fb5032e 425 if (file->f_op->read)
3d04c8a1 426 return file->f_op->read(file, buf, count, pos);
6fb5032e 427 else if (file->f_op->read_iter)
3d04c8a1 428 return new_sync_read(file, buf, count, pos);
6fb5032e 429 else
3d04c8a1 430 return -EINVAL;
6fb5032e
DK
431}
432
bdd1d2d3 433ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
c41fbad0
CH
434{
435 mm_segment_t old_fs;
bdd1d2d3 436 ssize_t result;
c41fbad0
CH
437
438 old_fs = get_fs();
736706be 439 set_fs(KERNEL_DS);
c41fbad0 440 /* The cast to a user pointer is valid due to the set_fs() */
bdd1d2d3 441 result = vfs_read(file, (void __user *)buf, count, pos);
c41fbad0
CH
442 set_fs(old_fs);
443 return result;
444}
445EXPORT_SYMBOL(kernel_read);
6fb5032e 446
1da177e4
LT
447ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
448{
449 ssize_t ret;
450
451 if (!(file->f_mode & FMODE_READ))
452 return -EBADF;
7f7f25e8 453 if (!(file->f_mode & FMODE_CAN_READ))
1da177e4 454 return -EINVAL;
96d4f267 455 if (unlikely(!access_ok(buf, count)))
1da177e4
LT
456 return -EFAULT;
457
458 ret = rw_verify_area(READ, file, pos, count);
bc61384d
AV
459 if (!ret) {
460 if (count > MAX_RW_COUNT)
461 count = MAX_RW_COUNT;
6fb5032e 462 ret = __vfs_read(file, buf, count, pos);
c43e259c 463 if (ret > 0) {
2a12a9d7 464 fsnotify_access(file);
c43e259c 465 add_rchar(current, ret);
1da177e4 466 }
c43e259c 467 inc_syscr(current);
1da177e4
LT
468 }
469
470 return ret;
471}
472
5d5d5689 473static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
293bc982
AV
474{
475 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
476 struct kiocb kiocb;
477 struct iov_iter iter;
478 ssize_t ret;
479
480 init_sync_kiocb(&kiocb, filp);
438ab720 481 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982
AV
482 iov_iter_init(&iter, WRITE, &iov, 1, len);
483
bb7462b6 484 ret = call_write_iter(filp, &kiocb, &iter);
599bd19b 485 BUG_ON(ret == -EIOCBQUEUED);
438ab720 486 if (ret > 0 && ppos)
f765b134 487 *ppos = kiocb.ki_pos;
293bc982
AV
488 return ret;
489}
490
12e1e7af
GU
491static ssize_t __vfs_write(struct file *file, const char __user *p,
492 size_t count, loff_t *pos)
493c84c0
AV
493{
494 if (file->f_op->write)
495 return file->f_op->write(file, p, count, pos);
493c84c0
AV
496 else if (file->f_op->write_iter)
497 return new_sync_write(file, p, count, pos);
498 else
499 return -EINVAL;
500}
493c84c0 501
81238b2c 502/* caller is responsible for file_start_write/file_end_write */
73e18f7c 503ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
06ae43f3
AV
504{
505 mm_segment_t old_fs;
506 const char __user *p;
507 ssize_t ret;
508
a01ac27b
CH
509 if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
510 return -EBADF;
7f7f25e8 511 if (!(file->f_mode & FMODE_CAN_WRITE))
3e84f48e
AV
512 return -EINVAL;
513
06ae43f3 514 old_fs = get_fs();
736706be 515 set_fs(KERNEL_DS);
06ae43f3
AV
516 p = (__force const char __user *)buf;
517 if (count > MAX_RW_COUNT)
518 count = MAX_RW_COUNT;
493c84c0 519 ret = __vfs_write(file, p, count, pos);
06ae43f3
AV
520 set_fs(old_fs);
521 if (ret > 0) {
522 fsnotify_modify(file);
523 add_wchar(current, ret);
524 }
525 inc_syscw(current);
526 return ret;
527}
2ec3a12a 528
e13ec939
CH
529ssize_t kernel_write(struct file *file, const void *buf, size_t count,
530 loff_t *pos)
ac452aca 531{
81238b2c 532 ssize_t ret;
ac452aca 533
81238b2c
CH
534 ret = rw_verify_area(WRITE, file, pos, count);
535 if (ret)
536 return ret;
ac452aca 537
81238b2c
CH
538 file_start_write(file);
539 ret = __kernel_write(file, buf, count, pos);
540 file_end_write(file);
541 return ret;
ac452aca
CH
542}
543EXPORT_SYMBOL(kernel_write);
544
1da177e4
LT
545ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
546{
547 ssize_t ret;
548
549 if (!(file->f_mode & FMODE_WRITE))
550 return -EBADF;
7f7f25e8 551 if (!(file->f_mode & FMODE_CAN_WRITE))
1da177e4 552 return -EINVAL;
96d4f267 553 if (unlikely(!access_ok(buf, count)))
1da177e4
LT
554 return -EFAULT;
555
556 ret = rw_verify_area(WRITE, file, pos, count);
bc61384d
AV
557 if (!ret) {
558 if (count > MAX_RW_COUNT)
559 count = MAX_RW_COUNT;
03d95eb2 560 file_start_write(file);
493c84c0 561 ret = __vfs_write(file, buf, count, pos);
c43e259c 562 if (ret > 0) {
2a12a9d7 563 fsnotify_modify(file);
c43e259c 564 add_wchar(current, ret);
1da177e4 565 }
c43e259c 566 inc_syscw(current);
03d95eb2 567 file_end_write(file);
1da177e4
LT
568 }
569
570 return ret;
571}
572
438ab720
KS
573/* file_ppos returns &file->f_pos or NULL if file is stream */
574static inline loff_t *file_ppos(struct file *file)
1da177e4 575{
438ab720 576 return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
1da177e4
LT
577}
578
3ce4a7bf 579ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
1da177e4 580{
9c225f26 581 struct fd f = fdget_pos(fd);
1da177e4 582 ssize_t ret = -EBADF;
1da177e4 583
2903ff01 584 if (f.file) {
438ab720
KS
585 loff_t pos, *ppos = file_ppos(f.file);
586 if (ppos) {
587 pos = *ppos;
588 ppos = &pos;
589 }
590 ret = vfs_read(f.file, buf, count, ppos);
591 if (ret >= 0 && ppos)
592 f.file->f_pos = pos;
9c225f26 593 fdput_pos(f);
1da177e4 594 }
1da177e4
LT
595 return ret;
596}
1da177e4 597
3ce4a7bf
DB
598SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
599{
600 return ksys_read(fd, buf, count);
601}
602
e7a3e8b2 603ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
1da177e4 604{
9c225f26 605 struct fd f = fdget_pos(fd);
1da177e4 606 ssize_t ret = -EBADF;
1da177e4 607
2903ff01 608 if (f.file) {
438ab720
KS
609 loff_t pos, *ppos = file_ppos(f.file);
610 if (ppos) {
611 pos = *ppos;
612 ppos = &pos;
613 }
614 ret = vfs_write(f.file, buf, count, ppos);
615 if (ret >= 0 && ppos)
616 f.file->f_pos = pos;
9c225f26 617 fdput_pos(f);
1da177e4
LT
618 }
619
620 return ret;
621}
622
e7a3e8b2
DB
623SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
624 size_t, count)
625{
626 return ksys_write(fd, buf, count);
627}
628
36028d5d
DB
629ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
630 loff_t pos)
1da177e4 631{
2903ff01 632 struct fd f;
1da177e4 633 ssize_t ret = -EBADF;
1da177e4
LT
634
635 if (pos < 0)
636 return -EINVAL;
637
2903ff01
AV
638 f = fdget(fd);
639 if (f.file) {
1da177e4 640 ret = -ESPIPE;
2903ff01
AV
641 if (f.file->f_mode & FMODE_PREAD)
642 ret = vfs_read(f.file, buf, count, &pos);
643 fdput(f);
1da177e4
LT
644 }
645
646 return ret;
647}
648
36028d5d
DB
649SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
650 size_t, count, loff_t, pos)
651{
652 return ksys_pread64(fd, buf, count, pos);
653}
654
655ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
656 size_t count, loff_t pos)
1da177e4 657{
2903ff01 658 struct fd f;
1da177e4 659 ssize_t ret = -EBADF;
1da177e4
LT
660
661 if (pos < 0)
662 return -EINVAL;
663
2903ff01
AV
664 f = fdget(fd);
665 if (f.file) {
1da177e4 666 ret = -ESPIPE;
2903ff01
AV
667 if (f.file->f_mode & FMODE_PWRITE)
668 ret = vfs_write(f.file, buf, count, &pos);
669 fdput(f);
1da177e4
LT
670 }
671
672 return ret;
673}
674
36028d5d
DB
675SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
676 size_t, count, loff_t, pos)
677{
678 return ksys_pwrite64(fd, buf, count, pos);
679}
680
ac15ac06 681static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
ddef7ed2 682 loff_t *ppos, int type, rwf_t flags)
293bc982
AV
683{
684 struct kiocb kiocb;
293bc982
AV
685 ssize_t ret;
686
687 init_sync_kiocb(&kiocb, filp);
fdd2f5b7
GR
688 ret = kiocb_set_rw_flags(&kiocb, flags);
689 if (ret)
690 return ret;
438ab720 691 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982 692
0f78d06a 693 if (type == READ)
bb7462b6 694 ret = call_read_iter(filp, &kiocb, iter);
0f78d06a 695 else
bb7462b6 696 ret = call_write_iter(filp, &kiocb, iter);
599bd19b 697 BUG_ON(ret == -EIOCBQUEUED);
438ab720
KS
698 if (ppos)
699 *ppos = kiocb.ki_pos;
293bc982
AV
700 return ret;
701}
702
ee0b3e67 703/* Do it by hand, with file-ops */
ac15ac06 704static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
ddef7ed2 705 loff_t *ppos, int type, rwf_t flags)
ee0b3e67 706{
ee0b3e67
BP
707 ssize_t ret = 0;
708
97be7ebe 709 if (flags & ~RWF_HIPRI)
793b80ef
CH
710 return -EOPNOTSUPP;
711
ac15ac06
AV
712 while (iov_iter_count(iter)) {
713 struct iovec iovec = iov_iter_iovec(iter);
ee0b3e67
BP
714 ssize_t nr;
715
0f78d06a
MS
716 if (type == READ) {
717 nr = filp->f_op->read(filp, iovec.iov_base,
718 iovec.iov_len, ppos);
719 } else {
720 nr = filp->f_op->write(filp, iovec.iov_base,
721 iovec.iov_len, ppos);
722 }
ee0b3e67
BP
723
724 if (nr < 0) {
725 if (!ret)
726 ret = nr;
727 break;
728 }
729 ret += nr;
ac15ac06 730 if (nr != iovec.iov_len)
ee0b3e67 731 break;
ac15ac06 732 iov_iter_advance(iter, nr);
ee0b3e67
BP
733 }
734
735 return ret;
736}
737
ffecee4f
VN
738/**
739 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
740 * into the kernel and check that it is valid.
741 *
742 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
743 * @uvector: Pointer to the userspace array.
744 * @nr_segs: Number of elements in userspace array.
745 * @fast_segs: Number of elements in @fast_pointer.
746 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
747 * @ret_pointer: (output parameter) Pointer to a variable that will point to
748 * either @fast_pointer, a newly allocated kernel array, or NULL,
749 * depending on which array was used.
750 *
751 * This function copies an array of &struct iovec of @nr_segs from
752 * userspace into the kernel and checks that each element is valid (e.g.
753 * it does not point to a kernel address or cause overflow by being too
754 * large, etc.).
755 *
756 * As an optimization, the caller may provide a pointer to a small
757 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
758 * (the size of this array, or 0 if unused, should be given in @fast_segs).
759 *
760 * @ret_pointer will always point to the array that was used, so the
761 * caller must take care not to call kfree() on it e.g. in case the
762 * @fast_pointer array was used and it was allocated on the stack.
763 *
764 * Return: The total number of bytes covered by the iovec array on success
765 * or a negative error code on error.
766 */
eed4e51f
BP
767ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
768 unsigned long nr_segs, unsigned long fast_segs,
769 struct iovec *fast_pointer,
ac34ebb3 770 struct iovec **ret_pointer)
435f49a5 771{
eed4e51f 772 unsigned long seg;
435f49a5 773 ssize_t ret;
eed4e51f
BP
774 struct iovec *iov = fast_pointer;
775
435f49a5
LT
776 /*
777 * SuS says "The readv() function *may* fail if the iovcnt argument
778 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
779 * traditionally returned zero for zero segments, so...
780 */
eed4e51f
BP
781 if (nr_segs == 0) {
782 ret = 0;
435f49a5 783 goto out;
eed4e51f
BP
784 }
785
435f49a5
LT
786 /*
787 * First get the "struct iovec" from user memory and
788 * verify all the pointers
789 */
eed4e51f
BP
790 if (nr_segs > UIO_MAXIOV) {
791 ret = -EINVAL;
435f49a5 792 goto out;
eed4e51f
BP
793 }
794 if (nr_segs > fast_segs) {
6da2ec56 795 iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
eed4e51f
BP
796 if (iov == NULL) {
797 ret = -ENOMEM;
435f49a5 798 goto out;
eed4e51f 799 }
435f49a5 800 }
eed4e51f
BP
801 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
802 ret = -EFAULT;
435f49a5 803 goto out;
eed4e51f
BP
804 }
805
435f49a5 806 /*
eed4e51f
BP
807 * According to the Single Unix Specification we should return EINVAL
808 * if an element length is < 0 when cast to ssize_t or if the
809 * total length would overflow the ssize_t return value of the
810 * system call.
435f49a5
LT
811 *
812 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
813 * overflow case.
814 */
eed4e51f 815 ret = 0;
435f49a5
LT
816 for (seg = 0; seg < nr_segs; seg++) {
817 void __user *buf = iov[seg].iov_base;
818 ssize_t len = (ssize_t)iov[seg].iov_len;
eed4e51f
BP
819
820 /* see if we we're about to use an invalid len or if
821 * it's about to overflow ssize_t */
435f49a5 822 if (len < 0) {
eed4e51f 823 ret = -EINVAL;
435f49a5 824 goto out;
eed4e51f 825 }
ac34ebb3 826 if (type >= 0
96d4f267 827 && unlikely(!access_ok(buf, len))) {
eed4e51f 828 ret = -EFAULT;
435f49a5
LT
829 goto out;
830 }
831 if (len > MAX_RW_COUNT - ret) {
832 len = MAX_RW_COUNT - ret;
833 iov[seg].iov_len = len;
eed4e51f 834 }
eed4e51f 835 ret += len;
435f49a5 836 }
eed4e51f
BP
837out:
838 *ret_pointer = iov;
839 return ret;
840}
841
f5029855
AV
#ifdef CONFIG_COMPAT
/*
 * compat_rw_copy_check_uvector - compat counterpart of
 * rw_copy_check_uvector(): converts a user array of struct compat_iovec
 * into kernel struct iovec entries, validating each one.
 *
 * *@ret_pointer is set to @fast_pointer, or to a kmalloc'ed array once
 * that allocation succeeded (@nr_segs > @fast_segs).  It stays at
 * @fast_pointer when the allocation fails.
 *
 * Returns the total length, capped at MAX_RW_COUNT, or -EINVAL, -ENOMEM
 * or -EFAULT on failure.
 */
ssize_t compat_rw_copy_check_uvector(int type,
		const struct compat_iovec __user *uvector, unsigned long nr_segs,
		unsigned long fast_segs, struct iovec *fast_pointer,
		struct iovec **ret_pointer)
{
	struct iovec *iov = fast_pointer;
	compat_ssize_t tot_len = 0;
	int seg;

	*ret_pointer = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0)
		return 0;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	if (nr_segs > fast_segs) {
		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL)
			return -ENOMEM;
	}
	*ret_pointer = iov;

	if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
		return -EFAULT;

	/*
	 * Single unix specification:
	 * We should -EINVAL if an element length is not >= 0 and fitting an
	 * ssize_t.
	 *
	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
	 * no overflow possibility.
	 */
	for (seg = 0; seg < nr_segs; seg++, uvector++, iov++) {
		compat_uptr_t buf;
		compat_ssize_t len;

		if (__get_user(len, &uvector->iov_len) ||
		    __get_user(buf, &uvector->iov_base))
			return -EFAULT;

		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
			return -EINVAL;

		/* A negative @type skips the user-pointer check. */
		if (type >= 0 &&
		    !access_ok(compat_ptr(buf), len))
			return -EFAULT;

		if (len > MAX_RW_COUNT - tot_len)
			len = MAX_RW_COUNT - tot_len;
		tot_len += len;

		iov->iov_base = compat_ptr(buf);
		iov->iov_len = (compat_size_t) len;
	}

	return tot_len;
}
#endif
916
19c73586 917static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
ddef7ed2 918 loff_t *pos, rwf_t flags)
1da177e4 919{
1da177e4 920 size_t tot_len;
7687a7a4 921 ssize_t ret = 0;
1da177e4 922
edab5fe3
CH
923 if (!(file->f_mode & FMODE_READ))
924 return -EBADF;
925 if (!(file->f_mode & FMODE_CAN_READ))
926 return -EINVAL;
927
7687a7a4 928 tot_len = iov_iter_count(iter);
0504c074
AV
929 if (!tot_len)
930 goto out;
19c73586 931 ret = rw_verify_area(READ, file, pos, tot_len);
e28cc715 932 if (ret < 0)
19c73586 933 return ret;
1da177e4 934
19c73586
CH
935 if (file->f_op->read_iter)
936 ret = do_iter_readv_writev(file, iter, pos, READ, flags);
ee0b3e67 937 else
19c73586 938 ret = do_loop_readv_writev(file, iter, pos, READ, flags);
1da177e4 939out:
19c73586
CH
940 if (ret >= 0)
941 fsnotify_access(file);
1da177e4 942 return ret;
1da177e4
LT
943}
944
5dcdc43e
JX
945ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
946 struct iov_iter *iter)
947{
948 size_t tot_len;
949 ssize_t ret = 0;
950
951 if (!file->f_op->read_iter)
952 return -EINVAL;
953 if (!(file->f_mode & FMODE_READ))
954 return -EBADF;
955 if (!(file->f_mode & FMODE_CAN_READ))
956 return -EINVAL;
957
958 tot_len = iov_iter_count(iter);
959 if (!tot_len)
960 goto out;
961 ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
962 if (ret < 0)
963 return ret;
964
965 ret = call_read_iter(file, iocb, iter);
966out:
967 if (ret >= 0)
968 fsnotify_access(file);
969 return ret;
970}
971EXPORT_SYMBOL(vfs_iocb_iter_read);
972
18e9710e 973ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
ddef7ed2 974 rwf_t flags)
7687a7a4 975{
18e9710e
CH
976 if (!file->f_op->read_iter)
977 return -EINVAL;
978 return do_iter_read(file, iter, ppos, flags);
979}
980EXPORT_SYMBOL(vfs_iter_read);
7687a7a4 981
19c73586 982static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
ddef7ed2 983 loff_t *pos, rwf_t flags)
19c73586
CH
984{
985 size_t tot_len;
986 ssize_t ret = 0;
03d95eb2 987
edab5fe3
CH
988 if (!(file->f_mode & FMODE_WRITE))
989 return -EBADF;
990 if (!(file->f_mode & FMODE_CAN_WRITE))
991 return -EINVAL;
992
19c73586
CH
993 tot_len = iov_iter_count(iter);
994 if (!tot_len)
995 return 0;
996 ret = rw_verify_area(WRITE, file, pos, tot_len);
7687a7a4
MS
997 if (ret < 0)
998 return ret;
999
19c73586
CH
1000 if (file->f_op->write_iter)
1001 ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
1002 else
1003 ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
19c73586
CH
1004 if (ret > 0)
1005 fsnotify_modify(file);
7687a7a4
MS
1006 return ret;
1007}
1008
5dcdc43e
JX
1009ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
1010 struct iov_iter *iter)
1011{
1012 size_t tot_len;
1013 ssize_t ret = 0;
1014
1015 if (!file->f_op->write_iter)
1016 return -EINVAL;
1017 if (!(file->f_mode & FMODE_WRITE))
1018 return -EBADF;
1019 if (!(file->f_mode & FMODE_CAN_WRITE))
1020 return -EINVAL;
1021
1022 tot_len = iov_iter_count(iter);
1023 if (!tot_len)
1024 return 0;
1025 ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
1026 if (ret < 0)
1027 return ret;
1028
1029 ret = call_write_iter(file, iocb, iter);
1030 if (ret > 0)
1031 fsnotify_modify(file);
1032
1033 return ret;
1034}
1035EXPORT_SYMBOL(vfs_iocb_iter_write);
1036
abbb6589 1037ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
ddef7ed2 1038 rwf_t flags)
abbb6589
CH
1039{
1040 if (!file->f_op->write_iter)
1041 return -EINVAL;
1042 return do_iter_write(file, iter, ppos, flags);
1043}
1044EXPORT_SYMBOL(vfs_iter_write);
1045
1da177e4 1046ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
ddef7ed2 1047 unsigned long vlen, loff_t *pos, rwf_t flags)
1da177e4 1048{
7687a7a4
MS
1049 struct iovec iovstack[UIO_FASTIOV];
1050 struct iovec *iov = iovstack;
1051 struct iov_iter iter;
1052 ssize_t ret;
1da177e4 1053
251b42a1 1054 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
edab5fe3
CH
1055 if (ret >= 0) {
1056 ret = do_iter_read(file, &iter, pos, flags);
1057 kfree(iov);
1058 }
1da177e4 1059
251b42a1
CH
1060 return ret;
1061}
1da177e4 1062
9725d4ce 1063static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
ddef7ed2 1064 unsigned long vlen, loff_t *pos, rwf_t flags)
1da177e4 1065{
251b42a1
CH
1066 struct iovec iovstack[UIO_FASTIOV];
1067 struct iovec *iov = iovstack;
1068 struct iov_iter iter;
1069 ssize_t ret;
1da177e4 1070
251b42a1 1071 ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
edab5fe3 1072 if (ret >= 0) {
62473a2d 1073 file_start_write(file);
edab5fe3 1074 ret = do_iter_write(file, &iter, pos, flags);
62473a2d 1075 file_end_write(file);
edab5fe3
CH
1076 kfree(iov);
1077 }
251b42a1 1078 return ret;
1da177e4 1079}
1da177e4 1080
f17d8b35 1081static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 1082 unsigned long vlen, rwf_t flags)
1da177e4 1083{
9c225f26 1084 struct fd f = fdget_pos(fd);
1da177e4 1085 ssize_t ret = -EBADF;
1da177e4 1086
2903ff01 1087 if (f.file) {
438ab720
KS
1088 loff_t pos, *ppos = file_ppos(f.file);
1089 if (ppos) {
1090 pos = *ppos;
1091 ppos = &pos;
1092 }
1093 ret = vfs_readv(f.file, vec, vlen, ppos, flags);
1094 if (ret >= 0 && ppos)
1095 f.file->f_pos = pos;
9c225f26 1096 fdput_pos(f);
1da177e4
LT
1097 }
1098
1099 if (ret > 0)
4b98d11b
AD
1100 add_rchar(current, ret);
1101 inc_syscr(current);
1da177e4
LT
1102 return ret;
1103}
1104
f17d8b35 1105static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 1106 unsigned long vlen, rwf_t flags)
1da177e4 1107{
9c225f26 1108 struct fd f = fdget_pos(fd);
1da177e4 1109 ssize_t ret = -EBADF;
1da177e4 1110
2903ff01 1111 if (f.file) {
438ab720
KS
1112 loff_t pos, *ppos = file_ppos(f.file);
1113 if (ppos) {
1114 pos = *ppos;
1115 ppos = &pos;
1116 }
1117 ret = vfs_writev(f.file, vec, vlen, ppos, flags);
1118 if (ret >= 0 && ppos)
1119 f.file->f_pos = pos;
9c225f26 1120 fdput_pos(f);
1da177e4
LT
1121 }
1122
1123 if (ret > 0)
4b98d11b
AD
1124 add_wchar(current, ret);
1125 inc_syscw(current);
1da177e4
LT
1126 return ret;
1127}
1128
601cc11d
LT
1129static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
1130{
1131#define HALF_LONG_BITS (BITS_PER_LONG / 2)
1132 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
1133}
1134
f17d8b35 1135static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 1136 unsigned long vlen, loff_t pos, rwf_t flags)
f3554f4b 1137{
2903ff01 1138 struct fd f;
f3554f4b 1139 ssize_t ret = -EBADF;
f3554f4b
GH
1140
1141 if (pos < 0)
1142 return -EINVAL;
1143
2903ff01
AV
1144 f = fdget(fd);
1145 if (f.file) {
f3554f4b 1146 ret = -ESPIPE;
2903ff01 1147 if (f.file->f_mode & FMODE_PREAD)
f17d8b35 1148 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
2903ff01 1149 fdput(f);
f3554f4b
GH
1150 }
1151
1152 if (ret > 0)
1153 add_rchar(current, ret);
1154 inc_syscr(current);
1155 return ret;
1156}
1157
f17d8b35 1158static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 1159 unsigned long vlen, loff_t pos, rwf_t flags)
f3554f4b 1160{
2903ff01 1161 struct fd f;
f3554f4b 1162 ssize_t ret = -EBADF;
f3554f4b
GH
1163
1164 if (pos < 0)
1165 return -EINVAL;
1166
2903ff01
AV
1167 f = fdget(fd);
1168 if (f.file) {
f3554f4b 1169 ret = -ESPIPE;
2903ff01 1170 if (f.file->f_mode & FMODE_PWRITE)
f17d8b35 1171 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
2903ff01 1172 fdput(f);
f3554f4b
GH
1173 }
1174
1175 if (ret > 0)
1176 add_wchar(current, ret);
1177 inc_syscw(current);
1178 return ret;
1179}
1180
f17d8b35
MT
1181SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1182 unsigned long, vlen)
1183{
1184 return do_readv(fd, vec, vlen, 0);
1185}
1186
1187SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1188 unsigned long, vlen)
1189{
1190 return do_writev(fd, vec, vlen, 0);
1191}
1192
1193SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1194 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1195{
1196 loff_t pos = pos_from_hilo(pos_h, pos_l);
1197
1198 return do_preadv(fd, vec, vlen, pos, 0);
1199}
1200
1201SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1202 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
ddef7ed2 1203 rwf_t, flags)
f17d8b35
MT
1204{
1205 loff_t pos = pos_from_hilo(pos_h, pos_l);
1206
1207 if (pos == -1)
1208 return do_readv(fd, vec, vlen, flags);
1209
1210 return do_preadv(fd, vec, vlen, pos, flags);
1211}
1212
1213SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1214 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1215{
1216 loff_t pos = pos_from_hilo(pos_h, pos_l);
1217
1218 return do_pwritev(fd, vec, vlen, pos, 0);
1219}
1220
1221SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1222 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
ddef7ed2 1223 rwf_t, flags)
f17d8b35
MT
1224{
1225 loff_t pos = pos_from_hilo(pos_h, pos_l);
1226
1227 if (pos == -1)
1228 return do_writev(fd, vec, vlen, flags);
1229
1230 return do_pwritev(fd, vec, vlen, pos, flags);
1231}
1232
72ec3516 1233#ifdef CONFIG_COMPAT
72ec3516
AV
1234static size_t compat_readv(struct file *file,
1235 const struct compat_iovec __user *vec,
ddef7ed2 1236 unsigned long vlen, loff_t *pos, rwf_t flags)
72ec3516 1237{
72ec3516
AV
1238 struct iovec iovstack[UIO_FASTIOV];
1239 struct iovec *iov = iovstack;
ac15ac06 1240 struct iov_iter iter;
72ec3516 1241 ssize_t ret;
72ec3516 1242
26c87fb7 1243 ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
edab5fe3
CH
1244 if (ret >= 0) {
1245 ret = do_iter_read(file, &iter, pos, flags);
1246 kfree(iov);
1247 }
72ec3516
AV
1248 if (ret > 0)
1249 add_rchar(current, ret);
1250 inc_syscr(current);
1251 return ret;
1252}
1253
f17d8b35
MT
1254static size_t do_compat_readv(compat_ulong_t fd,
1255 const struct compat_iovec __user *vec,
ddef7ed2 1256 compat_ulong_t vlen, rwf_t flags)
72ec3516 1257{
9c225f26 1258 struct fd f = fdget_pos(fd);
72ec3516
AV
1259 ssize_t ret;
1260 loff_t pos;
1261
1262 if (!f.file)
1263 return -EBADF;
1264 pos = f.file->f_pos;
f17d8b35 1265 ret = compat_readv(f.file, vec, vlen, &pos, flags);
5faf153e
AV
1266 if (ret >= 0)
1267 f.file->f_pos = pos;
9c225f26 1268 fdput_pos(f);
72ec3516 1269 return ret;
f17d8b35 1270
72ec3516
AV
1271}
1272
f17d8b35
MT
1273COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
1274 const struct compat_iovec __user *,vec,
1275 compat_ulong_t, vlen)
1276{
1277 return do_compat_readv(fd, vec, vlen, 0);
1278}
1279
1280static long do_compat_preadv64(unsigned long fd,
378a10f3 1281 const struct compat_iovec __user *vec,
ddef7ed2 1282 unsigned long vlen, loff_t pos, rwf_t flags)
72ec3516
AV
1283{
1284 struct fd f;
1285 ssize_t ret;
1286
1287 if (pos < 0)
1288 return -EINVAL;
1289 f = fdget(fd);
1290 if (!f.file)
1291 return -EBADF;
1292 ret = -ESPIPE;
1293 if (f.file->f_mode & FMODE_PREAD)
f17d8b35 1294 ret = compat_readv(f.file, vec, vlen, &pos, flags);
72ec3516
AV
1295 fdput(f);
1296 return ret;
1297}
1298
378a10f3
HC
1299#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
1300COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
1301 const struct compat_iovec __user *,vec,
1302 unsigned long, vlen, loff_t, pos)
1303{
f17d8b35 1304 return do_compat_preadv64(fd, vec, vlen, pos, 0);
378a10f3
HC
1305}
1306#endif
1307
dfd948e3 1308COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
72ec3516 1309 const struct compat_iovec __user *,vec,
dfd948e3 1310 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
72ec3516
AV
1311{
1312 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
378a10f3 1313
f17d8b35
MT
1314 return do_compat_preadv64(fd, vec, vlen, pos, 0);
1315}
1316
3ebfd81f
L
1317#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
1318COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
1319 const struct compat_iovec __user *,vec,
ddef7ed2 1320 unsigned long, vlen, loff_t, pos, rwf_t, flags)
3ebfd81f 1321{
cc4b1242
AJ
1322 if (pos == -1)
1323 return do_compat_readv(fd, vec, vlen, flags);
1324
3ebfd81f
L
1325 return do_compat_preadv64(fd, vec, vlen, pos, flags);
1326}
1327#endif
1328
f17d8b35
MT
1329COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
1330 const struct compat_iovec __user *,vec,
1331 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
ddef7ed2 1332 rwf_t, flags)
f17d8b35
MT
1333{
1334 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1335
1336 if (pos == -1)
1337 return do_compat_readv(fd, vec, vlen, flags);
1338
1339 return do_compat_preadv64(fd, vec, vlen, pos, flags);
72ec3516
AV
1340}
1341
1342static size_t compat_writev(struct file *file,
1343 const struct compat_iovec __user *vec,
ddef7ed2 1344 unsigned long vlen, loff_t *pos, rwf_t flags)
72ec3516 1345{
26c87fb7
CH
1346 struct iovec iovstack[UIO_FASTIOV];
1347 struct iovec *iov = iovstack;
1348 struct iov_iter iter;
edab5fe3 1349 ssize_t ret;
72ec3516 1350
26c87fb7 1351 ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
edab5fe3 1352 if (ret >= 0) {
62473a2d 1353 file_start_write(file);
edab5fe3 1354 ret = do_iter_write(file, &iter, pos, flags);
62473a2d 1355 file_end_write(file);
edab5fe3
CH
1356 kfree(iov);
1357 }
72ec3516
AV
1358 if (ret > 0)
1359 add_wchar(current, ret);
1360 inc_syscw(current);
1361 return ret;
1362}
1363
f17d8b35
MT
1364static size_t do_compat_writev(compat_ulong_t fd,
1365 const struct compat_iovec __user* vec,
ddef7ed2 1366 compat_ulong_t vlen, rwf_t flags)
72ec3516 1367{
9c225f26 1368 struct fd f = fdget_pos(fd);
72ec3516
AV
1369 ssize_t ret;
1370 loff_t pos;
1371
1372 if (!f.file)
1373 return -EBADF;
1374 pos = f.file->f_pos;
f17d8b35 1375 ret = compat_writev(f.file, vec, vlen, &pos, flags);
5faf153e
AV
1376 if (ret >= 0)
1377 f.file->f_pos = pos;
9c225f26 1378 fdput_pos(f);
72ec3516
AV
1379 return ret;
1380}
1381
f17d8b35
MT
1382COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1383 const struct compat_iovec __user *, vec,
1384 compat_ulong_t, vlen)
1385{
1386 return do_compat_writev(fd, vec, vlen, 0);
1387}
1388
1389static long do_compat_pwritev64(unsigned long fd,
378a10f3 1390 const struct compat_iovec __user *vec,
ddef7ed2 1391 unsigned long vlen, loff_t pos, rwf_t flags)
72ec3516
AV
1392{
1393 struct fd f;
1394 ssize_t ret;
1395
1396 if (pos < 0)
1397 return -EINVAL;
1398 f = fdget(fd);
1399 if (!f.file)
1400 return -EBADF;
1401 ret = -ESPIPE;
1402 if (f.file->f_mode & FMODE_PWRITE)
f17d8b35 1403 ret = compat_writev(f.file, vec, vlen, &pos, flags);
72ec3516
AV
1404 fdput(f);
1405 return ret;
1406}
1407
378a10f3
HC
1408#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
1409COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1410 const struct compat_iovec __user *,vec,
1411 unsigned long, vlen, loff_t, pos)
1412{
f17d8b35 1413 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
378a10f3
HC
1414}
1415#endif
1416
dfd948e3 1417COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
72ec3516 1418 const struct compat_iovec __user *,vec,
dfd948e3 1419 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
72ec3516
AV
1420{
1421 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
378a10f3 1422
f17d8b35 1423 return do_compat_pwritev64(fd, vec, vlen, pos, 0);
72ec3516 1424}
f17d8b35 1425
3ebfd81f
L
1426#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
1427COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
1428 const struct compat_iovec __user *,vec,
ddef7ed2 1429 unsigned long, vlen, loff_t, pos, rwf_t, flags)
3ebfd81f 1430{
cc4b1242
AJ
1431 if (pos == -1)
1432 return do_compat_writev(fd, vec, vlen, flags);
1433
3ebfd81f
L
1434 return do_compat_pwritev64(fd, vec, vlen, pos, flags);
1435}
1436#endif
1437
f17d8b35
MT
1438COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
1439 const struct compat_iovec __user *,vec,
ddef7ed2 1440 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
f17d8b35
MT
1441{
1442 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1443
1444 if (pos == -1)
1445 return do_compat_writev(fd, vec, vlen, flags);
1446
1447 return do_compat_pwritev64(fd, vec, vlen, pos, flags);
72ec3516 1448}
f17d8b35 1449
72ec3516
AV
1450#endif
1451
19f4fc3a
AV
1452static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1453 size_t count, loff_t max)
1da177e4 1454{
2903ff01
AV
1455 struct fd in, out;
1456 struct inode *in_inode, *out_inode;
1da177e4 1457 loff_t pos;
7995bd28 1458 loff_t out_pos;
1da177e4 1459 ssize_t retval;
2903ff01 1460 int fl;
1da177e4
LT
1461
1462 /*
1463 * Get input file, and verify that it is ok..
1464 */
1465 retval = -EBADF;
2903ff01
AV
1466 in = fdget(in_fd);
1467 if (!in.file)
1da177e4 1468 goto out;
2903ff01 1469 if (!(in.file->f_mode & FMODE_READ))
1da177e4 1470 goto fput_in;
1da177e4 1471 retval = -ESPIPE;
7995bd28
AV
1472 if (!ppos) {
1473 pos = in.file->f_pos;
1474 } else {
1475 pos = *ppos;
2903ff01 1476 if (!(in.file->f_mode & FMODE_PREAD))
1da177e4 1477 goto fput_in;
7995bd28
AV
1478 }
1479 retval = rw_verify_area(READ, in.file, &pos, count);
e28cc715 1480 if (retval < 0)
1da177e4 1481 goto fput_in;
bc61384d
AV
1482 if (count > MAX_RW_COUNT)
1483 count = MAX_RW_COUNT;
1da177e4 1484
1da177e4
LT
1485 /*
1486 * Get output file, and verify that it is ok..
1487 */
1488 retval = -EBADF;
2903ff01
AV
1489 out = fdget(out_fd);
1490 if (!out.file)
1da177e4 1491 goto fput_in;
2903ff01 1492 if (!(out.file->f_mode & FMODE_WRITE))
1da177e4 1493 goto fput_out;
496ad9aa
AV
1494 in_inode = file_inode(in.file);
1495 out_inode = file_inode(out.file);
7995bd28
AV
1496 out_pos = out.file->f_pos;
1497 retval = rw_verify_area(WRITE, out.file, &out_pos, count);
e28cc715 1498 if (retval < 0)
1da177e4
LT
1499 goto fput_out;
1500
1da177e4
LT
1501 if (!max)
1502 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1503
1da177e4
LT
1504 if (unlikely(pos + count > max)) {
1505 retval = -EOVERFLOW;
1506 if (pos >= max)
1507 goto fput_out;
1508 count = max - pos;
1509 }
1510
d96e6e71 1511 fl = 0;
534f2aaa 1512#if 0
d96e6e71
JA
1513 /*
1514 * We need to debate whether we can enable this or not. The
1515 * man page documents EAGAIN return for the output at least,
1516 * and the application is arguably buggy if it doesn't expect
1517 * EAGAIN on a non-blocking file descriptor.
1518 */
2903ff01 1519 if (in.file->f_flags & O_NONBLOCK)
d96e6e71 1520 fl = SPLICE_F_NONBLOCK;
534f2aaa 1521#endif
50cd2c57 1522 file_start_write(out.file);
7995bd28 1523 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
50cd2c57 1524 file_end_write(out.file);
1da177e4
LT
1525
1526 if (retval > 0) {
4b98d11b
AD
1527 add_rchar(current, retval);
1528 add_wchar(current, retval);
a68c2f12
SW
1529 fsnotify_access(in.file);
1530 fsnotify_modify(out.file);
7995bd28
AV
1531 out.file->f_pos = out_pos;
1532 if (ppos)
1533 *ppos = pos;
1534 else
1535 in.file->f_pos = pos;
1da177e4 1536 }
1da177e4 1537
4b98d11b
AD
1538 inc_syscr(current);
1539 inc_syscw(current);
7995bd28 1540 if (pos > max)
1da177e4
LT
1541 retval = -EOVERFLOW;
1542
1543fput_out:
2903ff01 1544 fdput(out);
1da177e4 1545fput_in:
2903ff01 1546 fdput(in);
1da177e4
LT
1547out:
1548 return retval;
1549}
1550
002c8976 1551SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1da177e4
LT
1552{
1553 loff_t pos;
1554 off_t off;
1555 ssize_t ret;
1556
1557 if (offset) {
1558 if (unlikely(get_user(off, offset)))
1559 return -EFAULT;
1560 pos = off;
1561 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1562 if (unlikely(put_user(pos, offset)))
1563 return -EFAULT;
1564 return ret;
1565 }
1566
1567 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1568}
1569
002c8976 1570SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1da177e4
LT
1571{
1572 loff_t pos;
1573 ssize_t ret;
1574
1575 if (offset) {
1576 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1577 return -EFAULT;
1578 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1579 if (unlikely(put_user(pos, offset)))
1580 return -EFAULT;
1581 return ret;
1582 }
1583
1584 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1585}
19f4fc3a
AV
1586
1587#ifdef CONFIG_COMPAT
1588COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1589 compat_off_t __user *, offset, compat_size_t, count)
1590{
1591 loff_t pos;
1592 off_t off;
1593 ssize_t ret;
1594
1595 if (offset) {
1596 if (unlikely(get_user(off, offset)))
1597 return -EFAULT;
1598 pos = off;
1599 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1600 if (unlikely(put_user(pos, offset)))
1601 return -EFAULT;
1602 return ret;
1603 }
1604
1605 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1606}
1607
1608COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1609 compat_loff_t __user *, offset, compat_size_t, count)
1610{
1611 loff_t pos;
1612 ssize_t ret;
1613
1614 if (offset) {
1615 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1616 return -EFAULT;
1617 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1618 if (unlikely(put_user(pos, offset)))
1619 return -EFAULT;
1620 return ret;
1621 }
1622
1623 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1624}
1625#endif
29732938 1626
f16acc9d
DC
1627/**
1628 * generic_copy_file_range - copy data between two files
1629 * @file_in: file structure to read from
1630 * @pos_in: file offset to read from
1631 * @file_out: file structure to write data to
1632 * @pos_out: file offset to write data to
1633 * @len: amount of data to copy
1634 * @flags: copy flags
1635 *
1636 * This is a generic filesystem helper to copy data from one file to another.
1637 * It has no constraints on the source or destination file owners - the files
1638 * can belong to different superblocks and different filesystem types. Short
1639 * copies are allowed.
1640 *
1641 * This should be called from the @file_out filesystem, as per the
1642 * ->copy_file_range() method.
1643 *
1644 * Returns the number of bytes copied or a negative error indicating the
1645 * failure.
1646 */
1647
1648ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1649 struct file *file_out, loff_t pos_out,
1650 size_t len, unsigned int flags)
1651{
1652 return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1653 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1654}
1655EXPORT_SYMBOL(generic_copy_file_range);
1656
64bf5ff5
DC
1657static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1658 struct file *file_out, loff_t pos_out,
1659 size_t len, unsigned int flags)
1660{
5dae222a
AG
1661 /*
1662 * Although we now allow filesystems to handle cross sb copy, passing
1663 * a file of the wrong filesystem type to filesystem driver can result
1664 * in an attempt to dereference the wrong type of ->private_data, so
1665 * avoid doing that until we really have a good reason. NFS defines
1666 * several different file_system_type structures, but they all end up
1667 * using the same ->copy_file_range() function pointer.
1668 */
1669 if (file_out->f_op->copy_file_range &&
1670 file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
64bf5ff5
DC
1671 return file_out->f_op->copy_file_range(file_in, pos_in,
1672 file_out, pos_out,
1673 len, flags);
1674
1675 return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1676 flags);
1677}
1678
29732938
ZB
1679/*
1680 * copy_file_range() differs from regular file read and write in that it
1681 * specifically allows return partial success. When it does so is up to
1682 * the copy_file_range method.
1683 */
1684ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1685 struct file *file_out, loff_t pos_out,
1686 size_t len, unsigned int flags)
1687{
29732938
ZB
1688 ssize_t ret;
1689
1690 if (flags != 0)
1691 return -EINVAL;
1692
96e6e8f4
AG
1693 ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1694 flags);
a3171351
AG
1695 if (unlikely(ret))
1696 return ret;
11cbfb10 1697
29732938 1698 ret = rw_verify_area(READ, file_in, &pos_in, len);
bc61384d
AV
1699 if (unlikely(ret))
1700 return ret;
1701
1702 ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1703 if (unlikely(ret))
29732938
ZB
1704 return ret;
1705
29732938
ZB
1706 if (len == 0)
1707 return 0;
1708
bfe219d3 1709 file_start_write(file_out);
29732938 1710
a76b5b04
CH
1711 /*
1712 * Try cloning first, this is supported by more file systems, and
1713 * more efficient if both clone and copy are supported (e.g. NFS).
1714 */
5dae222a
AG
1715 if (file_in->f_op->remap_file_range &&
1716 file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
42ec3d4c
DW
1717 loff_t cloned;
1718
1719 cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1720 file_out, pos_out,
eca3654e
DW
1721 min_t(loff_t, MAX_RW_COUNT, len),
1722 REMAP_FILE_CAN_SHORTEN);
42ec3d4c
DW
1723 if (cloned > 0) {
1724 ret = cloned;
a76b5b04
CH
1725 goto done;
1726 }
1727 }
1728
64bf5ff5
DC
1729 ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1730 flags);
1731 WARN_ON_ONCE(ret == -EOPNOTSUPP);
a76b5b04 1732done:
29732938
ZB
1733 if (ret > 0) {
1734 fsnotify_access(file_in);
1735 add_rchar(current, ret);
1736 fsnotify_modify(file_out);
1737 add_wchar(current, ret);
1738 }
a76b5b04 1739
29732938
ZB
1740 inc_syscr(current);
1741 inc_syscw(current);
1742
bfe219d3 1743 file_end_write(file_out);
29732938
ZB
1744
1745 return ret;
1746}
1747EXPORT_SYMBOL(vfs_copy_file_range);
1748
1749SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1750 int, fd_out, loff_t __user *, off_out,
1751 size_t, len, unsigned int, flags)
1752{
1753 loff_t pos_in;
1754 loff_t pos_out;
1755 struct fd f_in;
1756 struct fd f_out;
1757 ssize_t ret = -EBADF;
1758
1759 f_in = fdget(fd_in);
1760 if (!f_in.file)
1761 goto out2;
1762
1763 f_out = fdget(fd_out);
1764 if (!f_out.file)
1765 goto out1;
1766
1767 ret = -EFAULT;
1768 if (off_in) {
1769 if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1770 goto out;
1771 } else {
1772 pos_in = f_in.file->f_pos;
1773 }
1774
1775 if (off_out) {
1776 if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1777 goto out;
1778 } else {
1779 pos_out = f_out.file->f_pos;
1780 }
1781
1782 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1783 flags);
1784 if (ret > 0) {
1785 pos_in += ret;
1786 pos_out += ret;
1787
1788 if (off_in) {
1789 if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1790 ret = -EFAULT;
1791 } else {
1792 f_in.file->f_pos = pos_in;
1793 }
1794
1795 if (off_out) {
1796 if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1797 ret = -EFAULT;
1798 } else {
1799 f_out.file->f_pos = pos_out;
1800 }
1801 }
1802
1803out:
1804 fdput(f_out);
1805out1:
1806 fdput(f_in);
1807out2:
1808 return ret;
1809}
04b38d60 1810
42ec3d4c
DW
1811static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
1812 bool write)
04b38d60
CH
1813{
1814 struct inode *inode = file_inode(file);
1815
42ec3d4c 1816 if (unlikely(pos < 0 || len < 0))
04b38d60
CH
1817 return -EINVAL;
1818
1819 if (unlikely((loff_t) (pos + len) < 0))
1820 return -EINVAL;
1821
1822 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1823 loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1824 int retval;
1825
1826 retval = locks_mandatory_area(inode, file, pos, end,
1827 write ? F_WRLCK : F_RDLCK);
1828 if (retval < 0)
1829 return retval;
1830 }
1831
1832 return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1833}
07d19dc9
DW
1834/*
1835 * Ensure that we don't remap a partial EOF block in the middle of something
1836 * else. Assume that the offsets have already been checked for block
1837 * alignment.
1838 *
a5e6ea18
FM
1839 * For clone we only link a partial EOF block above or at the destination file's
1840 * EOF. For deduplication we accept a partial EOF block only if it ends at the
1841 * destination file's EOF (can not link it into the middle of a file).
eca3654e
DW
1842 *
1843 * Shorten the request if possible.
07d19dc9
DW
1844 */
1845static int generic_remap_check_len(struct inode *inode_in,
1846 struct inode *inode_out,
1847 loff_t pos_out,
42ec3d4c 1848 loff_t *len,
a91ae49b 1849 unsigned int remap_flags)
07d19dc9
DW
1850{
1851 u64 blkmask = i_blocksize(inode_in) - 1;
eca3654e 1852 loff_t new_len = *len;
07d19dc9
DW
1853
1854 if ((*len & blkmask) == 0)
1855 return 0;
1856
a5e6ea18 1857 if (pos_out + *len < i_size_read(inode_out))
eca3654e 1858 new_len &= ~blkmask;
07d19dc9 1859
eca3654e
DW
1860 if (new_len == *len)
1861 return 0;
1862
1863 if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1864 *len = new_len;
1865 return 0;
1866 }
1867
1868 return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
07d19dc9 1869}
04b38d60 1870
edc58dd0 1871/* Read a page's worth of file data into the page cache. */
c32e5f39
DW
1872static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1873{
1874 struct page *page;
1875
1876 page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1877 if (IS_ERR(page))
1878 return page;
1879 if (!PageUptodate(page)) {
1880 put_page(page);
1881 return ERR_PTR(-EIO);
1882 }
c32e5f39
DW
1883 return page;
1884}
1885
edc58dd0
DW
1886/*
1887 * Lock two pages, ensuring that we lock in offset order if the pages are from
1888 * the same file.
1889 */
1890static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1891{
1892 /* Always lock in order of increasing index. */
1893 if (page1->index > page2->index)
1894 swap(page1, page2);
1895
1896 lock_page(page1);
1897 if (page1 != page2)
1898 lock_page(page2);
1899}
1900
/* Unlock two pages, being careful not to unlock the same page twice. */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page2 == page1)
		return;
	unlock_page(page2);
}
1908
c32e5f39
DW
1909/*
1910 * Compare extents of two files to see if they are the same.
1911 * Caller must have locked both inodes to prevent write races.
1912 */
1913static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1914 struct inode *dest, loff_t destoff,
1915 loff_t len, bool *is_same)
1916{
1917 loff_t src_poff;
1918 loff_t dest_poff;
1919 void *src_addr;
1920 void *dest_addr;
1921 struct page *src_page;
1922 struct page *dest_page;
1923 loff_t cmp_len;
1924 bool same;
1925 int error;
1926
1927 error = -EINVAL;
1928 same = true;
1929 while (len) {
1930 src_poff = srcoff & (PAGE_SIZE - 1);
1931 dest_poff = destoff & (PAGE_SIZE - 1);
1932 cmp_len = min(PAGE_SIZE - src_poff,
1933 PAGE_SIZE - dest_poff);
1934 cmp_len = min(cmp_len, len);
1935 if (cmp_len <= 0)
1936 goto out_error;
1937
1938 src_page = vfs_dedupe_get_page(src, srcoff);
1939 if (IS_ERR(src_page)) {
1940 error = PTR_ERR(src_page);
1941 goto out_error;
1942 }
1943 dest_page = vfs_dedupe_get_page(dest, destoff);
1944 if (IS_ERR(dest_page)) {
1945 error = PTR_ERR(dest_page);
c32e5f39
DW
1946 put_page(src_page);
1947 goto out_error;
1948 }
edc58dd0
DW
1949
1950 vfs_lock_two_pages(src_page, dest_page);
1951
1952 /*
1953 * Now that we've locked both pages, make sure they're still
1954 * mapped to the file data we're interested in. If not,
1955 * someone is invalidating pages on us and we lose.
1956 */
1957 if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
1958 src_page->mapping != src->i_mapping ||
1959 dest_page->mapping != dest->i_mapping) {
1960 same = false;
1961 goto unlock;
1962 }
1963
c32e5f39
DW
1964 src_addr = kmap_atomic(src_page);
1965 dest_addr = kmap_atomic(dest_page);
1966
1967 flush_dcache_page(src_page);
1968 flush_dcache_page(dest_page);
1969
1970 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1971 same = false;
1972
1973 kunmap_atomic(dest_addr);
1974 kunmap_atomic(src_addr);
edc58dd0
DW
1975unlock:
1976 vfs_unlock_two_pages(src_page, dest_page);
c32e5f39
DW
1977 put_page(dest_page);
1978 put_page(src_page);
1979
1980 if (!same)
1981 break;
1982
1983 srcoff += cmp_len;
1984 destoff += cmp_len;
1985 len -= cmp_len;
1986 }
1987
1988 *is_same = same;
1989 return 0;
1990
1991out_error:
1992 return error;
1993}
04b38d60 1994
876bec6f
DW
1995/*
1996 * Check that the two inodes are eligible for cloning, the ranges make
1997 * sense, and then flush all dirty data. Caller must ensure that the
1998 * inodes have been locked against any other modifications.
22725ce4 1999 *
8c5c836b
DW
2000 * If there's an error, then the usual negative error code is returned.
2001 * Otherwise returns 0 with *len set to the request length.
876bec6f 2002 */
a83ab01a
DW
2003int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
2004 struct file *file_out, loff_t pos_out,
42ec3d4c 2005 loff_t *len, unsigned int remap_flags)
876bec6f 2006{
1383a7ed
DW
2007 struct inode *inode_in = file_inode(file_in);
2008 struct inode *inode_out = file_inode(file_out);
876bec6f
DW
2009 bool same_inode = (inode_in == inode_out);
2010 int ret;
2011
2012 /* Don't touch certain kinds of inodes */
2013 if (IS_IMMUTABLE(inode_out))
2014 return -EPERM;
2015
2016 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
2017 return -ETXTBSY;
2018
2019 /* Don't reflink dirs, pipes, sockets... */
2020 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
2021 return -EISDIR;
2022 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
2023 return -EINVAL;
2024
876bec6f
DW
2025 /* Zero length dedupe exits immediately; reflink goes to EOF. */
2026 if (*len == 0) {
1383a7ed
DW
2027 loff_t isize = i_size_read(inode_in);
2028
a91ae49b 2029 if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
876bec6f 2030 return 0;
22725ce4
DW
2031 if (pos_in > isize)
2032 return -EINVAL;
876bec6f 2033 *len = isize - pos_in;
2c5773f1
DW
2034 if (*len == 0)
2035 return 0;
876bec6f
DW
2036 }
2037
1383a7ed
DW
2038 /* Check that we don't violate system file offset limits. */
2039 ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
3d28193e 2040 remap_flags);
1383a7ed
DW
2041 if (ret)
2042 return ret;
876bec6f
DW
2043
2044 /* Wait for the completion of any pending IOs on both files */
2045 inode_dio_wait(inode_in);
2046 if (!same_inode)
2047 inode_dio_wait(inode_out);
2048
2049 ret = filemap_write_and_wait_range(inode_in->i_mapping,
2050 pos_in, pos_in + *len - 1);
2051 if (ret)
2052 return ret;
2053
2054 ret = filemap_write_and_wait_range(inode_out->i_mapping,
2055 pos_out, pos_out + *len - 1);
2056 if (ret)
2057 return ret;
2058
2059 /*
2060 * Check that the extents are the same.
2061 */
a91ae49b 2062 if (remap_flags & REMAP_FILE_DEDUP) {
876bec6f
DW
2063 bool is_same = false;
2064
2065 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
2066 inode_out, pos_out, *len, &is_same);
2067 if (ret)
2068 return ret;
2069 if (!is_same)
2070 return -EBADE;
2071 }
2072
07d19dc9 2073 ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
a91ae49b 2074 remap_flags);
07d19dc9
DW
2075 if (ret)
2076 return ret;
2077
8dde90bc 2078 /* If can't alter the file contents, we're done. */
e38f7f53
AG
2079 if (!(remap_flags & REMAP_FILE_DEDUP))
2080 ret = file_modified(file_out);
8dde90bc 2081
e38f7f53 2082 return ret;
876bec6f 2083}
a83ab01a 2084EXPORT_SYMBOL(generic_remap_file_range_prep);
876bec6f 2085
42ec3d4c 2086loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
452ce659
DW
2087 struct file *file_out, loff_t pos_out,
2088 loff_t len, unsigned int remap_flags)
04b38d60 2089{
42ec3d4c 2090 loff_t ret;
04b38d60 2091
6744557b 2092 WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
04b38d60 2093
913b86e9
AG
2094 /*
2095 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
2096 * the same mount. Practically, they only need to be on the same file
2097 * system.
2098 */
a3171351 2099 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
04b38d60
CH
2100 return -EXDEV;
2101
a3171351
AG
2102 ret = generic_file_rw_checks(file_in, file_out);
2103 if (ret < 0)
2104 return ret;
04b38d60 2105
2e5dfc99 2106 if (!file_in->f_op->remap_file_range)
0fcbf996
CH
2107 return -EOPNOTSUPP;
2108
6095028b 2109 ret = remap_verify_area(file_in, pos_in, len, false);
04b38d60
CH
2110 if (ret)
2111 return ret;
2112
6095028b 2113 ret = remap_verify_area(file_out, pos_out, len, true);
04b38d60
CH
2114 if (ret)
2115 return ret;
2116
2e5dfc99 2117 ret = file_in->f_op->remap_file_range(file_in, pos_in,
452ce659 2118 file_out, pos_out, len, remap_flags);
42ec3d4c
DW
2119 if (ret < 0)
2120 return ret;
04b38d60 2121
42ec3d4c
DW
2122 fsnotify_access(file_in);
2123 fsnotify_modify(file_out);
04b38d60
CH
2124 return ret;
2125}
a725356b
AG
2126EXPORT_SYMBOL(do_clone_file_range);
2127
42ec3d4c 2128loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
452ce659
DW
2129 struct file *file_out, loff_t pos_out,
2130 loff_t len, unsigned int remap_flags)
a725356b 2131{
42ec3d4c 2132 loff_t ret;
a725356b
AG
2133
2134 file_start_write(file_out);
452ce659
DW
2135 ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
2136 remap_flags);
a725356b
AG
2137 file_end_write(file_out);
2138
2139 return ret;
2140}
04b38d60 2141EXPORT_SYMBOL(vfs_clone_file_range);
54dbc151 2142
5de4480a
MF
2143/* Check whether we are allowed to dedupe the destination file */
2144static bool allow_file_dedupe(struct file *file)
2145{
2146 if (capable(CAP_SYS_ADMIN))
2147 return true;
2148 if (file->f_mode & FMODE_WRITE)
2149 return true;
2150 if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
2151 return true;
2152 if (!inode_permission(file_inode(file), MAY_WRITE))
2153 return true;
2154 return false;
2155}
2156
42ec3d4c
DW
2157loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
2158 struct file *dst_file, loff_t dst_pos,
df365836 2159 loff_t len, unsigned int remap_flags)
1b4f42a1 2160{
42ec3d4c 2161 loff_t ret;
1b4f42a1 2162
eca3654e
DW
2163 WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
2164 REMAP_FILE_CAN_SHORTEN));
1b4f42a1
MS
2165
2166 ret = mnt_want_write_file(dst_file);
2167 if (ret)
2168 return ret;
2169
6095028b 2170 ret = remap_verify_area(dst_file, dst_pos, len, true);
1b4f42a1
MS
2171 if (ret < 0)
2172 goto out_drop_write;
2173
85c95f20 2174 ret = -EPERM;
5de4480a 2175 if (!allow_file_dedupe(dst_file))
1b4f42a1
MS
2176 goto out_drop_write;
2177
2178 ret = -EXDEV;
2179 if (src_file->f_path.mnt != dst_file->f_path.mnt)
2180 goto out_drop_write;
2181
2182 ret = -EISDIR;
2183 if (S_ISDIR(file_inode(dst_file)->i_mode))
2184 goto out_drop_write;
2185
2186 ret = -EINVAL;
2e5dfc99 2187 if (!dst_file->f_op->remap_file_range)
1b4f42a1
MS
2188 goto out_drop_write;
2189
9aae2050
DW
2190 if (len == 0) {
2191 ret = 0;
2192 goto out_drop_write;
2193 }
2194
2e5dfc99 2195 ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
df365836 2196 dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
1b4f42a1
MS
2197out_drop_write:
2198 mnt_drop_write_file(dst_file);
2199
2200 return ret;
2201}
f1825366 2202EXPORT_SYMBOL(vfs_dedupe_file_range_one);
1b4f42a1 2203
54dbc151
DW
2204int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
2205{
2206 struct file_dedupe_range_info *info;
2207 struct inode *src = file_inode(file);
2208 u64 off;
2209 u64 len;
2210 int i;
2211 int ret;
54dbc151 2212 u16 count = same->dest_count;
42ec3d4c 2213 loff_t deduped;
54dbc151
DW
2214
2215 if (!(file->f_mode & FMODE_READ))
2216 return -EINVAL;
2217
2218 if (same->reserved1 || same->reserved2)
2219 return -EINVAL;
2220
2221 off = same->src_offset;
2222 len = same->src_length;
2223
54dbc151 2224 if (S_ISDIR(src->i_mode))
494633fa 2225 return -EISDIR;
54dbc151 2226
54dbc151 2227 if (!S_ISREG(src->i_mode))
494633fa
DC
2228 return -EINVAL;
2229
2230 if (!file->f_op->remap_file_range)
2231 return -EOPNOTSUPP;
54dbc151 2232
6095028b 2233 ret = remap_verify_area(file, off, len, false);
54dbc151 2234 if (ret < 0)
494633fa 2235 return ret;
54dbc151
DW
2236 ret = 0;
2237
22725ce4
DW
2238 if (off + len > i_size_read(src))
2239 return -EINVAL;
2240
92b66d2c
MS
2241 /* Arbitrary 1G limit on a single dedupe request, can be raised. */
2242 len = min_t(u64, len, 1 << 30);
2243
54dbc151
DW
2244 /* pre-format output fields to sane values */
2245 for (i = 0; i < count; i++) {
2246 same->info[i].bytes_deduped = 0ULL;
2247 same->info[i].status = FILE_DEDUPE_RANGE_SAME;
2248 }
2249
2250 for (i = 0, info = same->info; i < count; i++, info++) {
54dbc151 2251 struct fd dst_fd = fdget(info->dest_fd);
1b4f42a1 2252 struct file *dst_file = dst_fd.file;
54dbc151 2253
54dbc151
DW
2254 if (!dst_file) {
2255 info->status = -EBADF;
2256 goto next_loop;
2257 }
54dbc151
DW
2258
2259 if (info->reserved) {
2260 info->status = -EINVAL;
1b4f42a1 2261 goto next_fdput;
54dbc151
DW
2262 }
2263
1b4f42a1 2264 deduped = vfs_dedupe_file_range_one(file, off, dst_file,
df365836 2265 info->dest_offset, len,
eca3654e 2266 REMAP_FILE_CAN_SHORTEN);
1b4f42a1
MS
2267 if (deduped == -EBADE)
2268 info->status = FILE_DEDUPE_RANGE_DIFFERS;
2269 else if (deduped < 0)
2270 info->status = deduped;
2271 else
2272 info->bytes_deduped = len;
2273
22762711 2274next_fdput:
54dbc151 2275 fdput(dst_fd);
22762711 2276next_loop:
e62e560f 2277 if (fatal_signal_pending(current))
494633fa 2278 break;
54dbc151 2279 }
54dbc151
DW
2280 return ret;
2281}
2282EXPORT_SYMBOL(vfs_dedupe_file_range);