nvme: use queuedata for nvme_req_qid
[linux-2.6-block.git] / fs / read_write.c
CommitLineData
b2441318 1// SPDX-License-Identifier: GPL-2.0
1da177e4
LT
2/*
3 * linux/fs/read_write.c
4 *
5 * Copyright (C) 1991, 1992 Linus Torvalds
6 */
7
b12fb7f4 8#include <linux/slab.h>
1da177e4 9#include <linux/stat.h>
b12fb7f4 10#include <linux/sched/xacct.h>
1da177e4
LT
11#include <linux/fcntl.h>
12#include <linux/file.h>
13#include <linux/uio.h>
0eeca283 14#include <linux/fsnotify.h>
1da177e4 15#include <linux/security.h>
630d9c47 16#include <linux/export.h>
1da177e4 17#include <linux/syscalls.h>
e28cc715 18#include <linux/pagemap.h>
d6b29d7c 19#include <linux/splice.h>
561c6731 20#include <linux/compat.h>
29732938 21#include <linux/mount.h>
2feb55f8 22#include <linux/fs.h>
06ae43f3 23#include "internal.h"
1da177e4 24
7c0f6ba6 25#include <linux/uaccess.h>
1da177e4
LT
26#include <asm/unistd.h>
27
4b6f5d20 28const struct file_operations generic_ro_fops = {
1da177e4 29 .llseek = generic_file_llseek,
aad4f8bb 30 .read_iter = generic_file_read_iter,
1da177e4 31 .mmap = generic_file_readonly_mmap,
534f2aaa 32 .splice_read = generic_file_splice_read,
1da177e4
LT
33};
34
35EXPORT_SYMBOL(generic_ro_fops);
36
ddef7ed2 37static inline bool unsigned_offsets(struct file *file)
4a3956c7 38{
cccb5a1e 39 return file->f_mode & FMODE_UNSIGNED_OFFSET;
4a3956c7
KH
40}
41
46a1c2c7
JL
42/**
43 * vfs_setpos - update the file offset for lseek
44 * @file: file structure in question
45 * @offset: file offset to seek to
46 * @maxsize: maximum file size
47 *
48 * This is a low-level filesystem helper for updating the file offset to
49 * the value specified by @offset if the given offset is valid and it is
50 * not equal to the current file offset.
51 *
52 * Return the specified offset on success and -EINVAL on invalid offset.
53 */
54loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
ef3d0fd2
AK
55{
56 if (offset < 0 && !unsigned_offsets(file))
57 return -EINVAL;
58 if (offset > maxsize)
59 return -EINVAL;
60
61 if (offset != file->f_pos) {
62 file->f_pos = offset;
63 file->f_version = 0;
64 }
65 return offset;
66}
46a1c2c7 67EXPORT_SYMBOL(vfs_setpos);
ef3d0fd2 68
3a8cff4f 69/**
5760495a 70 * generic_file_llseek_size - generic llseek implementation for regular files
3a8cff4f
CH
71 * @file: file structure to seek on
72 * @offset: file offset to seek to
965c8e59 73 * @whence: type of seek
e8b96eb5
ES
74 * @size: max size of this file in file system
75 * @eof: offset used for SEEK_END position
3a8cff4f 76 *
5760495a 77 * This is a variant of generic_file_llseek that allows passing in a custom
e8b96eb5 78 * maximum file size and a custom EOF position, for e.g. hashed directories
ef3d0fd2
AK
79 *
80 * Synchronization:
5760495a 81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
ef3d0fd2
AK
82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83 * read/writes behave like SEEK_SET against seeks.
3a8cff4f 84 */
9465efc9 85loff_t
965c8e59 86generic_file_llseek_size(struct file *file, loff_t offset, int whence,
e8b96eb5 87 loff_t maxsize, loff_t eof)
1da177e4 88{
965c8e59 89 switch (whence) {
3a8cff4f 90 case SEEK_END:
e8b96eb5 91 offset += eof;
3a8cff4f
CH
92 break;
93 case SEEK_CUR:
5b6f1eb9
AK
94 /*
95 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96 * position-querying operation. Avoid rewriting the "same"
97 * f_pos value back to the file because a concurrent read(),
98 * write() or lseek() might have altered it
99 */
100 if (offset == 0)
101 return file->f_pos;
ef3d0fd2
AK
102 /*
103 * f_lock protects against read/modify/write race with other
104 * SEEK_CURs. Note that parallel writes and reads behave
105 * like SEEK_SET.
106 */
107 spin_lock(&file->f_lock);
46a1c2c7 108 offset = vfs_setpos(file, file->f_pos + offset, maxsize);
ef3d0fd2
AK
109 spin_unlock(&file->f_lock);
110 return offset;
982d8165
JB
111 case SEEK_DATA:
112 /*
113 * In the generic case the entire file is data, so as long as
114 * offset isn't at the end of the file then the offset is data.
115 */
fc46820b 116 if ((unsigned long long)offset >= eof)
982d8165
JB
117 return -ENXIO;
118 break;
119 case SEEK_HOLE:
120 /*
121 * There is a virtual hole at the end of the file, so as long as
122 * offset isn't i_size or larger, return i_size.
123 */
fc46820b 124 if ((unsigned long long)offset >= eof)
982d8165 125 return -ENXIO;
e8b96eb5 126 offset = eof;
982d8165 127 break;
1da177e4 128 }
3a8cff4f 129
46a1c2c7 130 return vfs_setpos(file, offset, maxsize);
5760495a
AK
131}
132EXPORT_SYMBOL(generic_file_llseek_size);
133
134/**
135 * generic_file_llseek - generic llseek implementation for regular files
136 * @file: file structure to seek on
137 * @offset: file offset to seek to
965c8e59 138 * @whence: type of seek
5760495a
AK
139 *
140 * This is a generic implemenation of ->llseek useable for all normal local
141 * filesystems. It just updates the file offset to the value specified by
546ae2d2 142 * @offset and @whence.
5760495a 143 */
965c8e59 144loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
5760495a
AK
145{
146 struct inode *inode = file->f_mapping->host;
147
965c8e59 148 return generic_file_llseek_size(file, offset, whence,
e8b96eb5
ES
149 inode->i_sb->s_maxbytes,
150 i_size_read(inode));
1da177e4 151}
9465efc9 152EXPORT_SYMBOL(generic_file_llseek);
1da177e4 153
1bf9d14d
AV
154/**
155 * fixed_size_llseek - llseek implementation for fixed-sized devices
156 * @file: file structure to seek on
157 * @offset: file offset to seek to
158 * @whence: type of seek
159 * @size: size of the file
160 *
161 */
162loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163{
164 switch (whence) {
165 case SEEK_SET: case SEEK_CUR: case SEEK_END:
166 return generic_file_llseek_size(file, offset, whence,
167 size, size);
168 default:
169 return -EINVAL;
170 }
171}
172EXPORT_SYMBOL(fixed_size_llseek);
173
b25472f9
AV
174/**
175 * no_seek_end_llseek - llseek implementation for fixed-sized devices
176 * @file: file structure to seek on
177 * @offset: file offset to seek to
178 * @whence: type of seek
179 *
180 */
181loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182{
183 switch (whence) {
184 case SEEK_SET: case SEEK_CUR:
185 return generic_file_llseek_size(file, offset, whence,
2feb55f8 186 OFFSET_MAX, 0);
b25472f9
AV
187 default:
188 return -EINVAL;
189 }
190}
191EXPORT_SYMBOL(no_seek_end_llseek);
192
193/**
194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195 * @file: file structure to seek on
196 * @offset: file offset to seek to
197 * @whence: type of seek
198 * @size: maximal offset allowed
199 *
200 */
201loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202{
203 switch (whence) {
204 case SEEK_SET: case SEEK_CUR:
205 return generic_file_llseek_size(file, offset, whence,
206 size, 0);
207 default:
208 return -EINVAL;
209 }
210}
211EXPORT_SYMBOL(no_seek_end_llseek_size);
212
ae6afc3f
B
213/**
214 * noop_llseek - No Operation Performed llseek implementation
215 * @file: file structure to seek on
216 * @offset: file offset to seek to
965c8e59 217 * @whence: type of seek
ae6afc3f
B
218 *
219 * This is an implementation of ->llseek useable for the rare special case when
220 * userspace expects the seek to succeed but the (device) file is actually not
221 * able to perform the seek. In this case you use noop_llseek() instead of
222 * falling back to the default implementation of ->llseek.
223 */
965c8e59 224loff_t noop_llseek(struct file *file, loff_t offset, int whence)
ae6afc3f
B
225{
226 return file->f_pos;
227}
228EXPORT_SYMBOL(noop_llseek);
229
965c8e59 230loff_t no_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
231{
232 return -ESPIPE;
233}
234EXPORT_SYMBOL(no_llseek);
235
965c8e59 236loff_t default_llseek(struct file *file, loff_t offset, int whence)
1da177e4 237{
496ad9aa 238 struct inode *inode = file_inode(file);
16abef0e 239 loff_t retval;
1da177e4 240
5955102c 241 inode_lock(inode);
965c8e59 242 switch (whence) {
7b8e8924 243 case SEEK_END:
982d8165 244 offset += i_size_read(inode);
1da177e4 245 break;
7b8e8924 246 case SEEK_CUR:
5b6f1eb9
AK
247 if (offset == 0) {
248 retval = file->f_pos;
249 goto out;
250 }
1da177e4 251 offset += file->f_pos;
982d8165
JB
252 break;
253 case SEEK_DATA:
254 /*
255 * In the generic case the entire file is data, so as
256 * long as offset isn't at the end of the file then the
257 * offset is data.
258 */
bacb2d81
DC
259 if (offset >= inode->i_size) {
260 retval = -ENXIO;
261 goto out;
262 }
982d8165
JB
263 break;
264 case SEEK_HOLE:
265 /*
266 * There is a virtual hole at the end of the file, so
267 * as long as offset isn't i_size or larger, return
268 * i_size.
269 */
bacb2d81
DC
270 if (offset >= inode->i_size) {
271 retval = -ENXIO;
272 goto out;
273 }
982d8165
JB
274 offset = inode->i_size;
275 break;
1da177e4
LT
276 }
277 retval = -EINVAL;
cccb5a1e 278 if (offset >= 0 || unsigned_offsets(file)) {
1da177e4
LT
279 if (offset != file->f_pos) {
280 file->f_pos = offset;
281 file->f_version = 0;
282 }
283 retval = offset;
284 }
5b6f1eb9 285out:
5955102c 286 inode_unlock(inode);
1da177e4
LT
287 return retval;
288}
289EXPORT_SYMBOL(default_llseek);
290
965c8e59 291loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
292{
293 loff_t (*fn)(struct file *, loff_t, int);
294
295 fn = no_llseek;
296 if (file->f_mode & FMODE_LSEEK) {
72c2d531 297 if (file->f_op->llseek)
1da177e4
LT
298 fn = file->f_op->llseek;
299 }
965c8e59 300 return fn(file, offset, whence);
1da177e4
LT
301}
302EXPORT_SYMBOL(vfs_llseek);
303
bef17329 304static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
1da177e4
LT
305{
306 off_t retval;
9c225f26 307 struct fd f = fdget_pos(fd);
2903ff01
AV
308 if (!f.file)
309 return -EBADF;
1da177e4
LT
310
311 retval = -EINVAL;
965c8e59
AM
312 if (whence <= SEEK_MAX) {
313 loff_t res = vfs_llseek(f.file, offset, whence);
1da177e4
LT
314 retval = res;
315 if (res != (loff_t)retval)
316 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
317 }
9c225f26 318 fdput_pos(f);
1da177e4
LT
319 return retval;
320}
321
76847e43
DB
322SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
323{
324 return ksys_lseek(fd, offset, whence);
325}
326
561c6731
AV
#ifdef CONFIG_COMPAT
/* Compat lseek entry point: takes a compat_off_t and forwards to ksys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return ksys_lseek(fd, offset, whence);
}
#endif
333
9e62ccec
MS
334#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
335 defined(__ARCH_WANT_SYS_LLSEEK)
003d7ab4
HC
336SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
337 unsigned long, offset_low, loff_t __user *, result,
965c8e59 338 unsigned int, whence)
1da177e4
LT
339{
340 int retval;
d7a15f8d 341 struct fd f = fdget_pos(fd);
1da177e4 342 loff_t offset;
1da177e4 343
2903ff01
AV
344 if (!f.file)
345 return -EBADF;
1da177e4
LT
346
347 retval = -EINVAL;
965c8e59 348 if (whence > SEEK_MAX)
1da177e4
LT
349 goto out_putf;
350
2903ff01 351 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
965c8e59 352 whence);
1da177e4
LT
353
354 retval = (int)offset;
355 if (offset >= 0) {
356 retval = -EFAULT;
357 if (!copy_to_user(result, &offset, sizeof(offset)))
358 retval = 0;
359 }
360out_putf:
d7a15f8d 361 fdput_pos(f);
1da177e4
LT
362 return retval;
363}
364#endif
365
68d70d03 366int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
1da177e4
LT
367{
368 struct inode *inode;
c43e259c 369 int retval = -EINVAL;
1da177e4 370
496ad9aa 371 inode = file_inode(file);
e28cc715 372 if (unlikely((ssize_t) count < 0))
c43e259c 373 return retval;
1da177e4 374
438ab720
KS
375 /*
376 * ranged mandatory locking does not apply to streams - it makes sense
377 * only for files where position has a meaning.
378 */
379 if (ppos) {
380 loff_t pos = *ppos;
381
382 if (unlikely(pos < 0)) {
383 if (!unsigned_offsets(file))
384 return retval;
385 if (count >= -pos) /* both values are in 0..LLONG_MAX */
386 return -EOVERFLOW;
387 } else if (unlikely((loff_t) (pos + count) < 0)) {
388 if (!unsigned_offsets(file))
389 return retval;
390 }
391
392 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
393 retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
394 read_write == READ ? F_RDLCK : F_WRLCK);
395 if (retval < 0)
396 return retval;
397 }
e28cc715 398 }
438ab720 399
bc61384d 400 return security_file_permission(file,
c43e259c 401 read_write == READ ? MAY_READ : MAY_WRITE);
1da177e4
LT
402}
403
5d5d5689 404static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
293bc982
AV
405{
406 struct iovec iov = { .iov_base = buf, .iov_len = len };
407 struct kiocb kiocb;
408 struct iov_iter iter;
409 ssize_t ret;
410
411 init_sync_kiocb(&kiocb, filp);
438ab720 412 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982
AV
413 iov_iter_init(&iter, READ, &iov, 1, len);
414
bb7462b6 415 ret = call_read_iter(filp, &kiocb, &iter);
599bd19b 416 BUG_ON(ret == -EIOCBQUEUED);
438ab720
KS
417 if (ppos)
418 *ppos = kiocb.ki_pos;
293bc982
AV
419 return ret;
420}
421
61a707c5
CH
422ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
423{
424 mm_segment_t old_fs = get_fs();
425 ssize_t ret;
426
427 if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
428 return -EINVAL;
429 if (!(file->f_mode & FMODE_CAN_READ))
430 return -EINVAL;
431
432 if (count > MAX_RW_COUNT)
433 count = MAX_RW_COUNT;
434 set_fs(KERNEL_DS);
775802c0
CH
435 if (file->f_op->read)
436 ret = file->f_op->read(file, (void __user *)buf, count, pos);
437 else if (file->f_op->read_iter)
438 ret = new_sync_read(file, (void __user *)buf, count, pos);
439 else
440 ret = -EINVAL;
61a707c5
CH
441 set_fs(old_fs);
442 if (ret > 0) {
443 fsnotify_access(file);
444 add_rchar(current, ret);
445 }
446 inc_syscr(current);
447 return ret;
448}
449
bdd1d2d3 450ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
c41fbad0 451{
6209dd91 452 ssize_t ret;
c41fbad0 453
6209dd91
CH
454 ret = rw_verify_area(READ, file, pos, count);
455 if (ret)
456 return ret;
457 return __kernel_read(file, buf, count, pos);
c41fbad0
CH
458}
459EXPORT_SYMBOL(kernel_read);
6fb5032e 460
1da177e4
LT
461ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
462{
463 ssize_t ret;
464
465 if (!(file->f_mode & FMODE_READ))
466 return -EBADF;
7f7f25e8 467 if (!(file->f_mode & FMODE_CAN_READ))
1da177e4 468 return -EINVAL;
96d4f267 469 if (unlikely(!access_ok(buf, count)))
1da177e4
LT
470 return -EFAULT;
471
472 ret = rw_verify_area(READ, file, pos, count);
775802c0
CH
473 if (ret)
474 return ret;
475 if (count > MAX_RW_COUNT)
476 count = MAX_RW_COUNT;
1da177e4 477
775802c0
CH
478 if (file->f_op->read)
479 ret = file->f_op->read(file, buf, count, pos);
480 else if (file->f_op->read_iter)
481 ret = new_sync_read(file, buf, count, pos);
482 else
483 ret = -EINVAL;
484 if (ret > 0) {
485 fsnotify_access(file);
486 add_rchar(current, ret);
487 }
488 inc_syscr(current);
1da177e4
LT
489 return ret;
490}
491
5d5d5689 492static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
293bc982
AV
493{
494 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
495 struct kiocb kiocb;
496 struct iov_iter iter;
497 ssize_t ret;
498
499 init_sync_kiocb(&kiocb, filp);
438ab720 500 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982
AV
501 iov_iter_init(&iter, WRITE, &iov, 1, len);
502
bb7462b6 503 ret = call_write_iter(filp, &kiocb, &iter);
599bd19b 504 BUG_ON(ret == -EIOCBQUEUED);
438ab720 505 if (ret > 0 && ppos)
f765b134 506 *ppos = kiocb.ki_pos;
293bc982
AV
507 return ret;
508}
509
81238b2c 510/* caller is responsible for file_start_write/file_end_write */
73e18f7c 511ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
06ae43f3
AV
512{
513 mm_segment_t old_fs;
514 const char __user *p;
515 ssize_t ret;
516
a01ac27b
CH
517 if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
518 return -EBADF;
7f7f25e8 519 if (!(file->f_mode & FMODE_CAN_WRITE))
3e84f48e
AV
520 return -EINVAL;
521
06ae43f3 522 old_fs = get_fs();
736706be 523 set_fs(KERNEL_DS);
06ae43f3
AV
524 p = (__force const char __user *)buf;
525 if (count > MAX_RW_COUNT)
526 count = MAX_RW_COUNT;
53ad8626
CH
527 if (file->f_op->write)
528 ret = file->f_op->write(file, p, count, pos);
529 else if (file->f_op->write_iter)
530 ret = new_sync_write(file, p, count, pos);
531 else
532 ret = -EINVAL;
06ae43f3
AV
533 set_fs(old_fs);
534 if (ret > 0) {
535 fsnotify_modify(file);
536 add_wchar(current, ret);
537 }
538 inc_syscw(current);
539 return ret;
540}
90fb7027
LT
541/*
542 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
543 * but autofs is one of the few internal kernel users that actually
544 * wants this _and_ can be built as a module. So we need to export
545 * this symbol for autofs, even though it really isn't appropriate
546 * for any other kernel modules.
547 */
548EXPORT_SYMBOL_GPL(__kernel_write);
2ec3a12a 549
e13ec939
CH
550ssize_t kernel_write(struct file *file, const void *buf, size_t count,
551 loff_t *pos)
ac452aca 552{
81238b2c 553 ssize_t ret;
ac452aca 554
81238b2c
CH
555 ret = rw_verify_area(WRITE, file, pos, count);
556 if (ret)
557 return ret;
ac452aca 558
81238b2c
CH
559 file_start_write(file);
560 ret = __kernel_write(file, buf, count, pos);
561 file_end_write(file);
562 return ret;
ac452aca
CH
563}
564EXPORT_SYMBOL(kernel_write);
565
1da177e4
LT
566ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
567{
568 ssize_t ret;
569
570 if (!(file->f_mode & FMODE_WRITE))
571 return -EBADF;
7f7f25e8 572 if (!(file->f_mode & FMODE_CAN_WRITE))
1da177e4 573 return -EINVAL;
96d4f267 574 if (unlikely(!access_ok(buf, count)))
1da177e4
LT
575 return -EFAULT;
576
577 ret = rw_verify_area(WRITE, file, pos, count);
53ad8626
CH
578 if (ret)
579 return ret;
580 if (count > MAX_RW_COUNT)
581 count = MAX_RW_COUNT;
582 file_start_write(file);
583 if (file->f_op->write)
584 ret = file->f_op->write(file, buf, count, pos);
585 else if (file->f_op->write_iter)
586 ret = new_sync_write(file, buf, count, pos);
587 else
588 ret = -EINVAL;
589 if (ret > 0) {
590 fsnotify_modify(file);
591 add_wchar(current, ret);
1da177e4 592 }
53ad8626
CH
593 inc_syscw(current);
594 file_end_write(file);
1da177e4
LT
595 return ret;
596}
597
438ab720
KS
598/* file_ppos returns &file->f_pos or NULL if file is stream */
599static inline loff_t *file_ppos(struct file *file)
1da177e4 600{
438ab720 601 return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
1da177e4
LT
602}
603
3ce4a7bf 604ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
1da177e4 605{
9c225f26 606 struct fd f = fdget_pos(fd);
1da177e4 607 ssize_t ret = -EBADF;
1da177e4 608
2903ff01 609 if (f.file) {
438ab720
KS
610 loff_t pos, *ppos = file_ppos(f.file);
611 if (ppos) {
612 pos = *ppos;
613 ppos = &pos;
614 }
615 ret = vfs_read(f.file, buf, count, ppos);
616 if (ret >= 0 && ppos)
617 f.file->f_pos = pos;
9c225f26 618 fdput_pos(f);
1da177e4 619 }
1da177e4
LT
620 return ret;
621}
1da177e4 622
3ce4a7bf
DB
623SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
624{
625 return ksys_read(fd, buf, count);
626}
627
e7a3e8b2 628ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
1da177e4 629{
9c225f26 630 struct fd f = fdget_pos(fd);
1da177e4 631 ssize_t ret = -EBADF;
1da177e4 632
2903ff01 633 if (f.file) {
438ab720
KS
634 loff_t pos, *ppos = file_ppos(f.file);
635 if (ppos) {
636 pos = *ppos;
637 ppos = &pos;
638 }
639 ret = vfs_write(f.file, buf, count, ppos);
640 if (ret >= 0 && ppos)
641 f.file->f_pos = pos;
9c225f26 642 fdput_pos(f);
1da177e4
LT
643 }
644
645 return ret;
646}
647
e7a3e8b2
DB
648SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
649 size_t, count)
650{
651 return ksys_write(fd, buf, count);
652}
653
36028d5d
DB
654ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
655 loff_t pos)
1da177e4 656{
2903ff01 657 struct fd f;
1da177e4 658 ssize_t ret = -EBADF;
1da177e4
LT
659
660 if (pos < 0)
661 return -EINVAL;
662
2903ff01
AV
663 f = fdget(fd);
664 if (f.file) {
1da177e4 665 ret = -ESPIPE;
2903ff01
AV
666 if (f.file->f_mode & FMODE_PREAD)
667 ret = vfs_read(f.file, buf, count, &pos);
668 fdput(f);
1da177e4
LT
669 }
670
671 return ret;
672}
673
36028d5d
DB
674SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
675 size_t, count, loff_t, pos)
676{
677 return ksys_pread64(fd, buf, count, pos);
678}
679
680ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
681 size_t count, loff_t pos)
1da177e4 682{
2903ff01 683 struct fd f;
1da177e4 684 ssize_t ret = -EBADF;
1da177e4
LT
685
686 if (pos < 0)
687 return -EINVAL;
688
2903ff01
AV
689 f = fdget(fd);
690 if (f.file) {
1da177e4 691 ret = -ESPIPE;
2903ff01
AV
692 if (f.file->f_mode & FMODE_PWRITE)
693 ret = vfs_write(f.file, buf, count, &pos);
694 fdput(f);
1da177e4
LT
695 }
696
697 return ret;
698}
699
36028d5d
DB
700SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
701 size_t, count, loff_t, pos)
702{
703 return ksys_pwrite64(fd, buf, count, pos);
704}
705
ac15ac06 706static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
ddef7ed2 707 loff_t *ppos, int type, rwf_t flags)
293bc982
AV
708{
709 struct kiocb kiocb;
293bc982
AV
710 ssize_t ret;
711
712 init_sync_kiocb(&kiocb, filp);
fdd2f5b7
GR
713 ret = kiocb_set_rw_flags(&kiocb, flags);
714 if (ret)
715 return ret;
438ab720 716 kiocb.ki_pos = (ppos ? *ppos : 0);
293bc982 717
0f78d06a 718 if (type == READ)
bb7462b6 719 ret = call_read_iter(filp, &kiocb, iter);
0f78d06a 720 else
bb7462b6 721 ret = call_write_iter(filp, &kiocb, iter);
599bd19b 722 BUG_ON(ret == -EIOCBQUEUED);
438ab720
KS
723 if (ppos)
724 *ppos = kiocb.ki_pos;
293bc982
AV
725 return ret;
726}
727
ee0b3e67 728/* Do it by hand, with file-ops */
ac15ac06 729static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
ddef7ed2 730 loff_t *ppos, int type, rwf_t flags)
ee0b3e67 731{
ee0b3e67
BP
732 ssize_t ret = 0;
733
97be7ebe 734 if (flags & ~RWF_HIPRI)
793b80ef
CH
735 return -EOPNOTSUPP;
736
ac15ac06
AV
737 while (iov_iter_count(iter)) {
738 struct iovec iovec = iov_iter_iovec(iter);
ee0b3e67
BP
739 ssize_t nr;
740
0f78d06a
MS
741 if (type == READ) {
742 nr = filp->f_op->read(filp, iovec.iov_base,
743 iovec.iov_len, ppos);
744 } else {
745 nr = filp->f_op->write(filp, iovec.iov_base,
746 iovec.iov_len, ppos);
747 }
ee0b3e67
BP
748
749 if (nr < 0) {
750 if (!ret)
751 ret = nr;
752 break;
753 }
754 ret += nr;
ac15ac06 755 if (nr != iovec.iov_len)
ee0b3e67 756 break;
ac15ac06 757 iov_iter_advance(iter, nr);
ee0b3e67
BP
758 }
759
760 return ret;
761}
762
19c73586 763static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
ddef7ed2 764 loff_t *pos, rwf_t flags)
1da177e4 765{
1da177e4 766 size_t tot_len;
7687a7a4 767 ssize_t ret = 0;
1da177e4 768
edab5fe3
CH
769 if (!(file->f_mode & FMODE_READ))
770 return -EBADF;
771 if (!(file->f_mode & FMODE_CAN_READ))
772 return -EINVAL;
773
7687a7a4 774 tot_len = iov_iter_count(iter);
0504c074
AV
775 if (!tot_len)
776 goto out;
19c73586 777 ret = rw_verify_area(READ, file, pos, tot_len);
e28cc715 778 if (ret < 0)
19c73586 779 return ret;
1da177e4 780
19c73586
CH
781 if (file->f_op->read_iter)
782 ret = do_iter_readv_writev(file, iter, pos, READ, flags);
ee0b3e67 783 else
19c73586 784 ret = do_loop_readv_writev(file, iter, pos, READ, flags);
1da177e4 785out:
19c73586
CH
786 if (ret >= 0)
787 fsnotify_access(file);
1da177e4 788 return ret;
1da177e4
LT
789}
790
5dcdc43e
JX
791ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
792 struct iov_iter *iter)
793{
794 size_t tot_len;
795 ssize_t ret = 0;
796
797 if (!file->f_op->read_iter)
798 return -EINVAL;
799 if (!(file->f_mode & FMODE_READ))
800 return -EBADF;
801 if (!(file->f_mode & FMODE_CAN_READ))
802 return -EINVAL;
803
804 tot_len = iov_iter_count(iter);
805 if (!tot_len)
806 goto out;
807 ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
808 if (ret < 0)
809 return ret;
810
811 ret = call_read_iter(file, iocb, iter);
812out:
813 if (ret >= 0)
814 fsnotify_access(file);
815 return ret;
816}
817EXPORT_SYMBOL(vfs_iocb_iter_read);
818
18e9710e 819ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
ddef7ed2 820 rwf_t flags)
7687a7a4 821{
18e9710e
CH
822 if (!file->f_op->read_iter)
823 return -EINVAL;
824 return do_iter_read(file, iter, ppos, flags);
825}
826EXPORT_SYMBOL(vfs_iter_read);
7687a7a4 827
19c73586 828static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
ddef7ed2 829 loff_t *pos, rwf_t flags)
19c73586
CH
830{
831 size_t tot_len;
832 ssize_t ret = 0;
03d95eb2 833
edab5fe3
CH
834 if (!(file->f_mode & FMODE_WRITE))
835 return -EBADF;
836 if (!(file->f_mode & FMODE_CAN_WRITE))
837 return -EINVAL;
838
19c73586
CH
839 tot_len = iov_iter_count(iter);
840 if (!tot_len)
841 return 0;
842 ret = rw_verify_area(WRITE, file, pos, tot_len);
7687a7a4
MS
843 if (ret < 0)
844 return ret;
845
19c73586
CH
846 if (file->f_op->write_iter)
847 ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
848 else
849 ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
19c73586
CH
850 if (ret > 0)
851 fsnotify_modify(file);
7687a7a4
MS
852 return ret;
853}
854
5dcdc43e
JX
855ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
856 struct iov_iter *iter)
857{
858 size_t tot_len;
859 ssize_t ret = 0;
860
861 if (!file->f_op->write_iter)
862 return -EINVAL;
863 if (!(file->f_mode & FMODE_WRITE))
864 return -EBADF;
865 if (!(file->f_mode & FMODE_CAN_WRITE))
866 return -EINVAL;
867
868 tot_len = iov_iter_count(iter);
869 if (!tot_len)
870 return 0;
871 ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
872 if (ret < 0)
873 return ret;
874
875 ret = call_write_iter(file, iocb, iter);
876 if (ret > 0)
877 fsnotify_modify(file);
878
879 return ret;
880}
881EXPORT_SYMBOL(vfs_iocb_iter_write);
882
abbb6589 883ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
ddef7ed2 884 rwf_t flags)
abbb6589
CH
885{
886 if (!file->f_op->write_iter)
887 return -EINVAL;
888 return do_iter_write(file, iter, ppos, flags);
889}
890EXPORT_SYMBOL(vfs_iter_write);
891
1da177e4 892ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
ddef7ed2 893 unsigned long vlen, loff_t *pos, rwf_t flags)
1da177e4 894{
7687a7a4
MS
895 struct iovec iovstack[UIO_FASTIOV];
896 struct iovec *iov = iovstack;
897 struct iov_iter iter;
898 ssize_t ret;
1da177e4 899
251b42a1 900 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
edab5fe3
CH
901 if (ret >= 0) {
902 ret = do_iter_read(file, &iter, pos, flags);
903 kfree(iov);
904 }
1da177e4 905
251b42a1
CH
906 return ret;
907}
1da177e4 908
9725d4ce 909static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
ddef7ed2 910 unsigned long vlen, loff_t *pos, rwf_t flags)
1da177e4 911{
251b42a1
CH
912 struct iovec iovstack[UIO_FASTIOV];
913 struct iovec *iov = iovstack;
914 struct iov_iter iter;
915 ssize_t ret;
1da177e4 916
251b42a1 917 ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
edab5fe3 918 if (ret >= 0) {
62473a2d 919 file_start_write(file);
edab5fe3 920 ret = do_iter_write(file, &iter, pos, flags);
62473a2d 921 file_end_write(file);
edab5fe3
CH
922 kfree(iov);
923 }
251b42a1 924 return ret;
1da177e4 925}
1da177e4 926
f17d8b35 927static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 928 unsigned long vlen, rwf_t flags)
1da177e4 929{
9c225f26 930 struct fd f = fdget_pos(fd);
1da177e4 931 ssize_t ret = -EBADF;
1da177e4 932
2903ff01 933 if (f.file) {
438ab720
KS
934 loff_t pos, *ppos = file_ppos(f.file);
935 if (ppos) {
936 pos = *ppos;
937 ppos = &pos;
938 }
939 ret = vfs_readv(f.file, vec, vlen, ppos, flags);
940 if (ret >= 0 && ppos)
941 f.file->f_pos = pos;
9c225f26 942 fdput_pos(f);
1da177e4
LT
943 }
944
945 if (ret > 0)
4b98d11b
AD
946 add_rchar(current, ret);
947 inc_syscr(current);
1da177e4
LT
948 return ret;
949}
950
f17d8b35 951static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 952 unsigned long vlen, rwf_t flags)
1da177e4 953{
9c225f26 954 struct fd f = fdget_pos(fd);
1da177e4 955 ssize_t ret = -EBADF;
1da177e4 956
2903ff01 957 if (f.file) {
438ab720
KS
958 loff_t pos, *ppos = file_ppos(f.file);
959 if (ppos) {
960 pos = *ppos;
961 ppos = &pos;
962 }
963 ret = vfs_writev(f.file, vec, vlen, ppos, flags);
964 if (ret >= 0 && ppos)
965 f.file->f_pos = pos;
9c225f26 966 fdput_pos(f);
1da177e4
LT
967 }
968
969 if (ret > 0)
4b98d11b
AD
970 add_wchar(current, ret);
971 inc_syscw(current);
1da177e4
LT
972 return ret;
973}
974
601cc11d
LT
975static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
976{
977#define HALF_LONG_BITS (BITS_PER_LONG / 2)
978 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
979}
980
f17d8b35 981static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 982 unsigned long vlen, loff_t pos, rwf_t flags)
f3554f4b 983{
2903ff01 984 struct fd f;
f3554f4b 985 ssize_t ret = -EBADF;
f3554f4b
GH
986
987 if (pos < 0)
988 return -EINVAL;
989
2903ff01
AV
990 f = fdget(fd);
991 if (f.file) {
f3554f4b 992 ret = -ESPIPE;
2903ff01 993 if (f.file->f_mode & FMODE_PREAD)
f17d8b35 994 ret = vfs_readv(f.file, vec, vlen, &pos, flags);
2903ff01 995 fdput(f);
f3554f4b
GH
996 }
997
998 if (ret > 0)
999 add_rchar(current, ret);
1000 inc_syscr(current);
1001 return ret;
1002}
1003
f17d8b35 1004static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
ddef7ed2 1005 unsigned long vlen, loff_t pos, rwf_t flags)
f3554f4b 1006{
2903ff01 1007 struct fd f;
f3554f4b 1008 ssize_t ret = -EBADF;
f3554f4b
GH
1009
1010 if (pos < 0)
1011 return -EINVAL;
1012
2903ff01
AV
1013 f = fdget(fd);
1014 if (f.file) {
f3554f4b 1015 ret = -ESPIPE;
2903ff01 1016 if (f.file->f_mode & FMODE_PWRITE)
f17d8b35 1017 ret = vfs_writev(f.file, vec, vlen, &pos, flags);
2903ff01 1018 fdput(f);
f3554f4b
GH
1019 }
1020
1021 if (ret > 0)
1022 add_wchar(current, ret);
1023 inc_syscw(current);
1024 return ret;
1025}
1026
f17d8b35
MT
1027SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
1028 unsigned long, vlen)
1029{
1030 return do_readv(fd, vec, vlen, 0);
1031}
1032
1033SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
1034 unsigned long, vlen)
1035{
1036 return do_writev(fd, vec, vlen, 0);
1037}
1038
1039SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
1040 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1041{
1042 loff_t pos = pos_from_hilo(pos_h, pos_l);
1043
1044 return do_preadv(fd, vec, vlen, pos, 0);
1045}
1046
1047SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
1048 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
ddef7ed2 1049 rwf_t, flags)
f17d8b35
MT
1050{
1051 loff_t pos = pos_from_hilo(pos_h, pos_l);
1052
1053 if (pos == -1)
1054 return do_readv(fd, vec, vlen, flags);
1055
1056 return do_preadv(fd, vec, vlen, pos, flags);
1057}
1058
1059SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
1060 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
1061{
1062 loff_t pos = pos_from_hilo(pos_h, pos_l);
1063
1064 return do_pwritev(fd, vec, vlen, pos, 0);
1065}
1066
1067SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
1068 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
ddef7ed2 1069 rwf_t, flags)
f17d8b35
MT
1070{
1071 loff_t pos = pos_from_hilo(pos_h, pos_l);
1072
1073 if (pos == -1)
1074 return do_writev(fd, vec, vlen, flags);
1075
1076 return do_pwritev(fd, vec, vlen, pos, flags);
1077}
1078
3523a9d4
CH
1079/*
1080 * Various compat syscalls. Note that they all pretend to take a native
1081 * iovec - import_iovec will properly treat those as compat_iovecs based on
1082 * in_compat_syscall().
1083 */
72ec3516 1084#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: the offset is passed as a single loff_t; no flags. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_preadv(fd, vec, vlen, pos, no_flags);
}
#endif
1093
dfd948e3 1094COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
3523a9d4 1095 const struct iovec __user *, vec,
dfd948e3 1096 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
72ec3516
AV
1097{
1098 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
378a10f3 1099
3523a9d4 1100 return do_preadv(fd, vec, vlen, pos, 0);
f17d8b35
MT
1101}
1102
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: offset passed as a single loff_t, plus @flags.
 * An offset of -1 routes the request to do_readv().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_readv(fd, vec, vlen, flags)
			 : do_preadv(fd, vec, vlen, pos, flags);
}
#endif
1113
f17d8b35 1114COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
3523a9d4 1115 const struct iovec __user *, vec,
f17d8b35 1116 compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
ddef7ed2 1117 rwf_t, flags)
f17d8b35
MT
1118{
1119 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1120
1121 if (pos == -1)
3523a9d4
CH
1122 return do_readv(fd, vec, vlen, flags);
1123 return do_preadv(fd, vec, vlen, pos, flags);
72ec3516
AV
1124}
1125
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: the offset is passed as a single loff_t; no flags. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_pwritev(fd, vec, vlen, pos, no_flags);
}
#endif
1134
dfd948e3 1135COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
3523a9d4 1136 const struct iovec __user *,vec,
dfd948e3 1137 compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
72ec3516
AV
1138{
1139 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
378a10f3 1140
3523a9d4 1141 return do_pwritev(fd, vec, vlen, pos, 0);
72ec3516 1142}
f17d8b35 1143
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: offset passed as a single loff_t, plus @flags.
 * An offset of -1 routes the request to do_writev().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_writev(fd, vec, vlen, flags)
			 : do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
1154
f17d8b35 1155COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
3523a9d4 1156 const struct iovec __user *,vec,
ddef7ed2 1157 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
f17d8b35
MT
1158{
1159 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1160
1161 if (pos == -1)
3523a9d4
CH
1162 return do_writev(fd, vec, vlen, flags);
1163 return do_pwritev(fd, vec, vlen, pos, flags);
72ec3516 1164}
3523a9d4 1165#endif /* CONFIG_COMPAT */
72ec3516 1166
19f4fc3a
AV
1167static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1168 size_t count, loff_t max)
1da177e4 1169{
2903ff01
AV
1170 struct fd in, out;
1171 struct inode *in_inode, *out_inode;
1da177e4 1172 loff_t pos;
7995bd28 1173 loff_t out_pos;
1da177e4 1174 ssize_t retval;
2903ff01 1175 int fl;
1da177e4
LT
1176
1177 /*
1178 * Get input file, and verify that it is ok..
1179 */
1180 retval = -EBADF;
2903ff01
AV
1181 in = fdget(in_fd);
1182 if (!in.file)
1da177e4 1183 goto out;
2903ff01 1184 if (!(in.file->f_mode & FMODE_READ))
1da177e4 1185 goto fput_in;
1da177e4 1186 retval = -ESPIPE;
7995bd28
AV
1187 if (!ppos) {
1188 pos = in.file->f_pos;
1189 } else {
1190 pos = *ppos;
2903ff01 1191 if (!(in.file->f_mode & FMODE_PREAD))
1da177e4 1192 goto fput_in;
7995bd28
AV
1193 }
1194 retval = rw_verify_area(READ, in.file, &pos, count);
e28cc715 1195 if (retval < 0)
1da177e4 1196 goto fput_in;
bc61384d
AV
1197 if (count > MAX_RW_COUNT)
1198 count = MAX_RW_COUNT;
1da177e4 1199
1da177e4
LT
1200 /*
1201 * Get output file, and verify that it is ok..
1202 */
1203 retval = -EBADF;
2903ff01
AV
1204 out = fdget(out_fd);
1205 if (!out.file)
1da177e4 1206 goto fput_in;
2903ff01 1207 if (!(out.file->f_mode & FMODE_WRITE))
1da177e4 1208 goto fput_out;
496ad9aa
AV
1209 in_inode = file_inode(in.file);
1210 out_inode = file_inode(out.file);
7995bd28
AV
1211 out_pos = out.file->f_pos;
1212 retval = rw_verify_area(WRITE, out.file, &out_pos, count);
e28cc715 1213 if (retval < 0)
1da177e4
LT
1214 goto fput_out;
1215
1da177e4
LT
1216 if (!max)
1217 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1218
1da177e4
LT
1219 if (unlikely(pos + count > max)) {
1220 retval = -EOVERFLOW;
1221 if (pos >= max)
1222 goto fput_out;
1223 count = max - pos;
1224 }
1225
d96e6e71 1226 fl = 0;
534f2aaa 1227#if 0
d96e6e71
JA
1228 /*
1229 * We need to debate whether we can enable this or not. The
1230 * man page documents EAGAIN return for the output at least,
1231 * and the application is arguably buggy if it doesn't expect
1232 * EAGAIN on a non-blocking file descriptor.
1233 */
2903ff01 1234 if (in.file->f_flags & O_NONBLOCK)
d96e6e71 1235 fl = SPLICE_F_NONBLOCK;
534f2aaa 1236#endif
50cd2c57 1237 file_start_write(out.file);
7995bd28 1238 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
50cd2c57 1239 file_end_write(out.file);
1da177e4
LT
1240
1241 if (retval > 0) {
4b98d11b
AD
1242 add_rchar(current, retval);
1243 add_wchar(current, retval);
a68c2f12
SW
1244 fsnotify_access(in.file);
1245 fsnotify_modify(out.file);
7995bd28
AV
1246 out.file->f_pos = out_pos;
1247 if (ppos)
1248 *ppos = pos;
1249 else
1250 in.file->f_pos = pos;
1da177e4 1251 }
1da177e4 1252
4b98d11b
AD
1253 inc_syscr(current);
1254 inc_syscw(current);
7995bd28 1255 if (pos > max)
1da177e4
LT
1256 retval = -EOVERFLOW;
1257
1258fput_out:
2903ff01 1259 fdput(out);
1da177e4 1260fput_in:
2903ff01 1261 fdput(in);
1da177e4
LT
1262out:
1263 return retval;
1264}
1265
002c8976 1266SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1da177e4
LT
1267{
1268 loff_t pos;
1269 off_t off;
1270 ssize_t ret;
1271
1272 if (offset) {
1273 if (unlikely(get_user(off, offset)))
1274 return -EFAULT;
1275 pos = off;
1276 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1277 if (unlikely(put_user(pos, offset)))
1278 return -EFAULT;
1279 return ret;
1280 }
1281
1282 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1283}
1284
002c8976 1285SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1da177e4
LT
1286{
1287 loff_t pos;
1288 ssize_t ret;
1289
1290 if (offset) {
1291 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1292 return -EFAULT;
1293 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1294 if (unlikely(put_user(pos, offset)))
1295 return -EFAULT;
1296 return ret;
1297 }
1298
1299 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1300}
19f4fc3a
AV
1301
1302#ifdef CONFIG_COMPAT
1303COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1304 compat_off_t __user *, offset, compat_size_t, count)
1305{
1306 loff_t pos;
1307 off_t off;
1308 ssize_t ret;
1309
1310 if (offset) {
1311 if (unlikely(get_user(off, offset)))
1312 return -EFAULT;
1313 pos = off;
1314 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1315 if (unlikely(put_user(pos, offset)))
1316 return -EFAULT;
1317 return ret;
1318 }
1319
1320 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1321}
1322
1323COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1324 compat_loff_t __user *, offset, compat_size_t, count)
1325{
1326 loff_t pos;
1327 ssize_t ret;
1328
1329 if (offset) {
1330 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1331 return -EFAULT;
1332 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1333 if (unlikely(put_user(pos, offset)))
1334 return -EFAULT;
1335 return ret;
1336 }
1337
1338 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1339}
1340#endif
29732938 1341
f16acc9d
DC
1342/**
1343 * generic_copy_file_range - copy data between two files
1344 * @file_in: file structure to read from
1345 * @pos_in: file offset to read from
1346 * @file_out: file structure to write data to
1347 * @pos_out: file offset to write data to
1348 * @len: amount of data to copy
1349 * @flags: copy flags
1350 *
1351 * This is a generic filesystem helper to copy data from one file to another.
1352 * It has no constraints on the source or destination file owners - the files
1353 * can belong to different superblocks and different filesystem types. Short
1354 * copies are allowed.
1355 *
1356 * This should be called from the @file_out filesystem, as per the
1357 * ->copy_file_range() method.
1358 *
1359 * Returns the number of bytes copied or a negative error indicating the
1360 * failure.
1361 */
1362
1363ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1364 struct file *file_out, loff_t pos_out,
1365 size_t len, unsigned int flags)
1366{
1367 return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1368 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1369}
1370EXPORT_SYMBOL(generic_copy_file_range);
1371
64bf5ff5
DC
1372static ssize_t do_copy_file_range(struct file *file_in, loff_t pos_in,
1373 struct file *file_out, loff_t pos_out,
1374 size_t len, unsigned int flags)
1375{
5dae222a
AG
1376 /*
1377 * Although we now allow filesystems to handle cross sb copy, passing
1378 * a file of the wrong filesystem type to filesystem driver can result
1379 * in an attempt to dereference the wrong type of ->private_data, so
1380 * avoid doing that until we really have a good reason. NFS defines
1381 * several different file_system_type structures, but they all end up
1382 * using the same ->copy_file_range() function pointer.
1383 */
1384 if (file_out->f_op->copy_file_range &&
1385 file_out->f_op->copy_file_range == file_in->f_op->copy_file_range)
64bf5ff5
DC
1386 return file_out->f_op->copy_file_range(file_in, pos_in,
1387 file_out, pos_out,
1388 len, flags);
1389
1390 return generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1391 flags);
1392}
1393
29732938
ZB
1394/*
1395 * copy_file_range() differs from regular file read and write in that it
1396 * specifically allows return partial success. When it does so is up to
1397 * the copy_file_range method.
1398 */
1399ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1400 struct file *file_out, loff_t pos_out,
1401 size_t len, unsigned int flags)
1402{
29732938
ZB
1403 ssize_t ret;
1404
1405 if (flags != 0)
1406 return -EINVAL;
1407
96e6e8f4
AG
1408 ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1409 flags);
a3171351
AG
1410 if (unlikely(ret))
1411 return ret;
11cbfb10 1412
29732938 1413 ret = rw_verify_area(READ, file_in, &pos_in, len);
bc61384d
AV
1414 if (unlikely(ret))
1415 return ret;
1416
1417 ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1418 if (unlikely(ret))
29732938
ZB
1419 return ret;
1420
29732938
ZB
1421 if (len == 0)
1422 return 0;
1423
bfe219d3 1424 file_start_write(file_out);
29732938 1425
a76b5b04
CH
1426 /*
1427 * Try cloning first, this is supported by more file systems, and
1428 * more efficient if both clone and copy are supported (e.g. NFS).
1429 */
5dae222a
AG
1430 if (file_in->f_op->remap_file_range &&
1431 file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
42ec3d4c
DW
1432 loff_t cloned;
1433
1434 cloned = file_in->f_op->remap_file_range(file_in, pos_in,
1435 file_out, pos_out,
eca3654e
DW
1436 min_t(loff_t, MAX_RW_COUNT, len),
1437 REMAP_FILE_CAN_SHORTEN);
42ec3d4c
DW
1438 if (cloned > 0) {
1439 ret = cloned;
a76b5b04
CH
1440 goto done;
1441 }
1442 }
1443
64bf5ff5
DC
1444 ret = do_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1445 flags);
1446 WARN_ON_ONCE(ret == -EOPNOTSUPP);
a76b5b04 1447done:
29732938
ZB
1448 if (ret > 0) {
1449 fsnotify_access(file_in);
1450 add_rchar(current, ret);
1451 fsnotify_modify(file_out);
1452 add_wchar(current, ret);
1453 }
a76b5b04 1454
29732938
ZB
1455 inc_syscr(current);
1456 inc_syscw(current);
1457
bfe219d3 1458 file_end_write(file_out);
29732938
ZB
1459
1460 return ret;
1461}
1462EXPORT_SYMBOL(vfs_copy_file_range);
1463
1464SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
1465 int, fd_out, loff_t __user *, off_out,
1466 size_t, len, unsigned int, flags)
1467{
1468 loff_t pos_in;
1469 loff_t pos_out;
1470 struct fd f_in;
1471 struct fd f_out;
1472 ssize_t ret = -EBADF;
1473
1474 f_in = fdget(fd_in);
1475 if (!f_in.file)
1476 goto out2;
1477
1478 f_out = fdget(fd_out);
1479 if (!f_out.file)
1480 goto out1;
1481
1482 ret = -EFAULT;
1483 if (off_in) {
1484 if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
1485 goto out;
1486 } else {
1487 pos_in = f_in.file->f_pos;
1488 }
1489
1490 if (off_out) {
1491 if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
1492 goto out;
1493 } else {
1494 pos_out = f_out.file->f_pos;
1495 }
1496
1497 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
1498 flags);
1499 if (ret > 0) {
1500 pos_in += ret;
1501 pos_out += ret;
1502
1503 if (off_in) {
1504 if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
1505 ret = -EFAULT;
1506 } else {
1507 f_in.file->f_pos = pos_in;
1508 }
1509
1510 if (off_out) {
1511 if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
1512 ret = -EFAULT;
1513 } else {
1514 f_out.file->f_pos = pos_out;
1515 }
1516 }
1517
1518out:
1519 fdput(f_out);
1520out1:
1521 fdput(f_in);
1522out2:
1523 return ret;
1524}
04b38d60 1525
42ec3d4c
DW
1526static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
1527 bool write)
04b38d60
CH
1528{
1529 struct inode *inode = file_inode(file);
1530
42ec3d4c 1531 if (unlikely(pos < 0 || len < 0))
04b38d60
CH
1532 return -EINVAL;
1533
1534 if (unlikely((loff_t) (pos + len) < 0))
1535 return -EINVAL;
1536
1537 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
1538 loff_t end = len ? pos + len - 1 : OFFSET_MAX;
1539 int retval;
1540
1541 retval = locks_mandatory_area(inode, file, pos, end,
1542 write ? F_WRLCK : F_RDLCK);
1543 if (retval < 0)
1544 return retval;
1545 }
1546
1547 return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
1548}
07d19dc9
DW
1549/*
1550 * Ensure that we don't remap a partial EOF block in the middle of something
1551 * else. Assume that the offsets have already been checked for block
1552 * alignment.
1553 *
a5e6ea18
FM
1554 * For clone we only link a partial EOF block above or at the destination file's
1555 * EOF. For deduplication we accept a partial EOF block only if it ends at the
1556 * destination file's EOF (can not link it into the middle of a file).
eca3654e
DW
1557 *
1558 * Shorten the request if possible.
07d19dc9
DW
1559 */
1560static int generic_remap_check_len(struct inode *inode_in,
1561 struct inode *inode_out,
1562 loff_t pos_out,
42ec3d4c 1563 loff_t *len,
a91ae49b 1564 unsigned int remap_flags)
07d19dc9
DW
1565{
1566 u64 blkmask = i_blocksize(inode_in) - 1;
eca3654e 1567 loff_t new_len = *len;
07d19dc9
DW
1568
1569 if ((*len & blkmask) == 0)
1570 return 0;
1571
a5e6ea18 1572 if (pos_out + *len < i_size_read(inode_out))
eca3654e 1573 new_len &= ~blkmask;
07d19dc9 1574
eca3654e
DW
1575 if (new_len == *len)
1576 return 0;
1577
1578 if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
1579 *len = new_len;
1580 return 0;
1581 }
1582
1583 return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
07d19dc9 1584}
04b38d60 1585
edc58dd0 1586/* Read a page's worth of file data into the page cache. */
c32e5f39
DW
1587static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
1588{
1589 struct page *page;
1590
1591 page = read_mapping_page(inode->i_mapping, offset >> PAGE_SHIFT, NULL);
1592 if (IS_ERR(page))
1593 return page;
1594 if (!PageUptodate(page)) {
1595 put_page(page);
1596 return ERR_PTR(-EIO);
1597 }
c32e5f39
DW
1598 return page;
1599}
1600
edc58dd0
DW
1601/*
1602 * Lock two pages, ensuring that we lock in offset order if the pages are from
1603 * the same file.
1604 */
1605static void vfs_lock_two_pages(struct page *page1, struct page *page2)
1606{
1607 /* Always lock in order of increasing index. */
1608 if (page1->index > page2->index)
1609 swap(page1, page2);
1610
1611 lock_page(page1);
1612 if (page1 != page2)
1613 lock_page(page2);
1614}
1615
/* Unlock two pages; when both arguments are the same page, unlock it once. */
static void vfs_unlock_two_pages(struct page *page1, struct page *page2)
{
	unlock_page(page1);
	if (page2 == page1)
		return;
	unlock_page(page2);
}
1623
c32e5f39
DW
1624/*
1625 * Compare extents of two files to see if they are the same.
1626 * Caller must have locked both inodes to prevent write races.
1627 */
1628static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
1629 struct inode *dest, loff_t destoff,
1630 loff_t len, bool *is_same)
1631{
1632 loff_t src_poff;
1633 loff_t dest_poff;
1634 void *src_addr;
1635 void *dest_addr;
1636 struct page *src_page;
1637 struct page *dest_page;
1638 loff_t cmp_len;
1639 bool same;
1640 int error;
1641
1642 error = -EINVAL;
1643 same = true;
1644 while (len) {
1645 src_poff = srcoff & (PAGE_SIZE - 1);
1646 dest_poff = destoff & (PAGE_SIZE - 1);
1647 cmp_len = min(PAGE_SIZE - src_poff,
1648 PAGE_SIZE - dest_poff);
1649 cmp_len = min(cmp_len, len);
1650 if (cmp_len <= 0)
1651 goto out_error;
1652
1653 src_page = vfs_dedupe_get_page(src, srcoff);
1654 if (IS_ERR(src_page)) {
1655 error = PTR_ERR(src_page);
1656 goto out_error;
1657 }
1658 dest_page = vfs_dedupe_get_page(dest, destoff);
1659 if (IS_ERR(dest_page)) {
1660 error = PTR_ERR(dest_page);
c32e5f39
DW
1661 put_page(src_page);
1662 goto out_error;
1663 }
edc58dd0
DW
1664
1665 vfs_lock_two_pages(src_page, dest_page);
1666
1667 /*
1668 * Now that we've locked both pages, make sure they're still
1669 * mapped to the file data we're interested in. If not,
1670 * someone is invalidating pages on us and we lose.
1671 */
1672 if (!PageUptodate(src_page) || !PageUptodate(dest_page) ||
1673 src_page->mapping != src->i_mapping ||
1674 dest_page->mapping != dest->i_mapping) {
1675 same = false;
1676 goto unlock;
1677 }
1678
c32e5f39
DW
1679 src_addr = kmap_atomic(src_page);
1680 dest_addr = kmap_atomic(dest_page);
1681
1682 flush_dcache_page(src_page);
1683 flush_dcache_page(dest_page);
1684
1685 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
1686 same = false;
1687
1688 kunmap_atomic(dest_addr);
1689 kunmap_atomic(src_addr);
edc58dd0
DW
1690unlock:
1691 vfs_unlock_two_pages(src_page, dest_page);
c32e5f39
DW
1692 put_page(dest_page);
1693 put_page(src_page);
1694
1695 if (!same)
1696 break;
1697
1698 srcoff += cmp_len;
1699 destoff += cmp_len;
1700 len -= cmp_len;
1701 }
1702
1703 *is_same = same;
1704 return 0;
1705
1706out_error:
1707 return error;
1708}
04b38d60 1709
876bec6f
DW
1710/*
1711 * Check that the two inodes are eligible for cloning, the ranges make
1712 * sense, and then flush all dirty data. Caller must ensure that the
1713 * inodes have been locked against any other modifications.
22725ce4 1714 *
8c5c836b
DW
1715 * If there's an error, then the usual negative error code is returned.
1716 * Otherwise returns 0 with *len set to the request length.
876bec6f 1717 */
a83ab01a
DW
1718int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
1719 struct file *file_out, loff_t pos_out,
42ec3d4c 1720 loff_t *len, unsigned int remap_flags)
876bec6f 1721{
1383a7ed
DW
1722 struct inode *inode_in = file_inode(file_in);
1723 struct inode *inode_out = file_inode(file_out);
876bec6f
DW
1724 bool same_inode = (inode_in == inode_out);
1725 int ret;
1726
1727 /* Don't touch certain kinds of inodes */
1728 if (IS_IMMUTABLE(inode_out))
1729 return -EPERM;
1730
1731 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1732 return -ETXTBSY;
1733
1734 /* Don't reflink dirs, pipes, sockets... */
1735 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1736 return -EISDIR;
1737 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1738 return -EINVAL;
1739
876bec6f
DW
1740 /* Zero length dedupe exits immediately; reflink goes to EOF. */
1741 if (*len == 0) {
1383a7ed
DW
1742 loff_t isize = i_size_read(inode_in);
1743
a91ae49b 1744 if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
876bec6f 1745 return 0;
22725ce4
DW
1746 if (pos_in > isize)
1747 return -EINVAL;
876bec6f 1748 *len = isize - pos_in;
2c5773f1
DW
1749 if (*len == 0)
1750 return 0;
876bec6f
DW
1751 }
1752
1383a7ed
DW
1753 /* Check that we don't violate system file offset limits. */
1754 ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
3d28193e 1755 remap_flags);
1383a7ed
DW
1756 if (ret)
1757 return ret;
876bec6f
DW
1758
1759 /* Wait for the completion of any pending IOs on both files */
1760 inode_dio_wait(inode_in);
1761 if (!same_inode)
1762 inode_dio_wait(inode_out);
1763
1764 ret = filemap_write_and_wait_range(inode_in->i_mapping,
1765 pos_in, pos_in + *len - 1);
1766 if (ret)
1767 return ret;
1768
1769 ret = filemap_write_and_wait_range(inode_out->i_mapping,
1770 pos_out, pos_out + *len - 1);
1771 if (ret)
1772 return ret;
1773
1774 /*
1775 * Check that the extents are the same.
1776 */
a91ae49b 1777 if (remap_flags & REMAP_FILE_DEDUP) {
876bec6f
DW
1778 bool is_same = false;
1779
1780 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
1781 inode_out, pos_out, *len, &is_same);
1782 if (ret)
1783 return ret;
1784 if (!is_same)
1785 return -EBADE;
1786 }
1787
07d19dc9 1788 ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
a91ae49b 1789 remap_flags);
07d19dc9
DW
1790 if (ret)
1791 return ret;
1792
8dde90bc 1793 /* If can't alter the file contents, we're done. */
e38f7f53
AG
1794 if (!(remap_flags & REMAP_FILE_DEDUP))
1795 ret = file_modified(file_out);
8dde90bc 1796
e38f7f53 1797 return ret;
876bec6f 1798}
a83ab01a 1799EXPORT_SYMBOL(generic_remap_file_range_prep);
876bec6f 1800
42ec3d4c 1801loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
452ce659
DW
1802 struct file *file_out, loff_t pos_out,
1803 loff_t len, unsigned int remap_flags)
04b38d60 1804{
42ec3d4c 1805 loff_t ret;
04b38d60 1806
6744557b 1807 WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
04b38d60 1808
913b86e9
AG
1809 /*
1810 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
1811 * the same mount. Practically, they only need to be on the same file
1812 * system.
1813 */
a3171351 1814 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
04b38d60
CH
1815 return -EXDEV;
1816
a3171351
AG
1817 ret = generic_file_rw_checks(file_in, file_out);
1818 if (ret < 0)
1819 return ret;
04b38d60 1820
2e5dfc99 1821 if (!file_in->f_op->remap_file_range)
0fcbf996
CH
1822 return -EOPNOTSUPP;
1823
6095028b 1824 ret = remap_verify_area(file_in, pos_in, len, false);
04b38d60
CH
1825 if (ret)
1826 return ret;
1827
6095028b 1828 ret = remap_verify_area(file_out, pos_out, len, true);
04b38d60
CH
1829 if (ret)
1830 return ret;
1831
2e5dfc99 1832 ret = file_in->f_op->remap_file_range(file_in, pos_in,
452ce659 1833 file_out, pos_out, len, remap_flags);
42ec3d4c
DW
1834 if (ret < 0)
1835 return ret;
04b38d60 1836
42ec3d4c
DW
1837 fsnotify_access(file_in);
1838 fsnotify_modify(file_out);
04b38d60
CH
1839 return ret;
1840}
a725356b
AG
1841EXPORT_SYMBOL(do_clone_file_range);
1842
42ec3d4c 1843loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
452ce659
DW
1844 struct file *file_out, loff_t pos_out,
1845 loff_t len, unsigned int remap_flags)
a725356b 1846{
42ec3d4c 1847 loff_t ret;
a725356b
AG
1848
1849 file_start_write(file_out);
452ce659
DW
1850 ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
1851 remap_flags);
a725356b
AG
1852 file_end_write(file_out);
1853
1854 return ret;
1855}
04b38d60 1856EXPORT_SYMBOL(vfs_clone_file_range);
54dbc151 1857
5de4480a
MF
1858/* Check whether we are allowed to dedupe the destination file */
1859static bool allow_file_dedupe(struct file *file)
1860{
1861 if (capable(CAP_SYS_ADMIN))
1862 return true;
1863 if (file->f_mode & FMODE_WRITE)
1864 return true;
1865 if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
1866 return true;
1867 if (!inode_permission(file_inode(file), MAY_WRITE))
1868 return true;
1869 return false;
1870}
1871
42ec3d4c
DW
1872loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
1873 struct file *dst_file, loff_t dst_pos,
df365836 1874 loff_t len, unsigned int remap_flags)
1b4f42a1 1875{
42ec3d4c 1876 loff_t ret;
1b4f42a1 1877
eca3654e
DW
1878 WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
1879 REMAP_FILE_CAN_SHORTEN));
1b4f42a1
MS
1880
1881 ret = mnt_want_write_file(dst_file);
1882 if (ret)
1883 return ret;
1884
6095028b 1885 ret = remap_verify_area(dst_file, dst_pos, len, true);
1b4f42a1
MS
1886 if (ret < 0)
1887 goto out_drop_write;
1888
85c95f20 1889 ret = -EPERM;
5de4480a 1890 if (!allow_file_dedupe(dst_file))
1b4f42a1
MS
1891 goto out_drop_write;
1892
1893 ret = -EXDEV;
1894 if (src_file->f_path.mnt != dst_file->f_path.mnt)
1895 goto out_drop_write;
1896
1897 ret = -EISDIR;
1898 if (S_ISDIR(file_inode(dst_file)->i_mode))
1899 goto out_drop_write;
1900
1901 ret = -EINVAL;
2e5dfc99 1902 if (!dst_file->f_op->remap_file_range)
1b4f42a1
MS
1903 goto out_drop_write;
1904
9aae2050
DW
1905 if (len == 0) {
1906 ret = 0;
1907 goto out_drop_write;
1908 }
1909
2e5dfc99 1910 ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
df365836 1911 dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
1b4f42a1
MS
1912out_drop_write:
1913 mnt_drop_write_file(dst_file);
1914
1915 return ret;
1916}
f1825366 1917EXPORT_SYMBOL(vfs_dedupe_file_range_one);
1b4f42a1 1918
54dbc151
DW
1919int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
1920{
1921 struct file_dedupe_range_info *info;
1922 struct inode *src = file_inode(file);
1923 u64 off;
1924 u64 len;
1925 int i;
1926 int ret;
54dbc151 1927 u16 count = same->dest_count;
42ec3d4c 1928 loff_t deduped;
54dbc151
DW
1929
1930 if (!(file->f_mode & FMODE_READ))
1931 return -EINVAL;
1932
1933 if (same->reserved1 || same->reserved2)
1934 return -EINVAL;
1935
1936 off = same->src_offset;
1937 len = same->src_length;
1938
54dbc151 1939 if (S_ISDIR(src->i_mode))
494633fa 1940 return -EISDIR;
54dbc151 1941
54dbc151 1942 if (!S_ISREG(src->i_mode))
494633fa
DC
1943 return -EINVAL;
1944
1945 if (!file->f_op->remap_file_range)
1946 return -EOPNOTSUPP;
54dbc151 1947
6095028b 1948 ret = remap_verify_area(file, off, len, false);
54dbc151 1949 if (ret < 0)
494633fa 1950 return ret;
54dbc151
DW
1951 ret = 0;
1952
22725ce4
DW
1953 if (off + len > i_size_read(src))
1954 return -EINVAL;
1955
92b66d2c
MS
1956 /* Arbitrary 1G limit on a single dedupe request, can be raised. */
1957 len = min_t(u64, len, 1 << 30);
1958
54dbc151
DW
1959 /* pre-format output fields to sane values */
1960 for (i = 0; i < count; i++) {
1961 same->info[i].bytes_deduped = 0ULL;
1962 same->info[i].status = FILE_DEDUPE_RANGE_SAME;
1963 }
1964
1965 for (i = 0, info = same->info; i < count; i++, info++) {
54dbc151 1966 struct fd dst_fd = fdget(info->dest_fd);
1b4f42a1 1967 struct file *dst_file = dst_fd.file;
54dbc151 1968
54dbc151
DW
1969 if (!dst_file) {
1970 info->status = -EBADF;
1971 goto next_loop;
1972 }
54dbc151
DW
1973
1974 if (info->reserved) {
1975 info->status = -EINVAL;
1b4f42a1 1976 goto next_fdput;
54dbc151
DW
1977 }
1978
1b4f42a1 1979 deduped = vfs_dedupe_file_range_one(file, off, dst_file,
df365836 1980 info->dest_offset, len,
eca3654e 1981 REMAP_FILE_CAN_SHORTEN);
1b4f42a1
MS
1982 if (deduped == -EBADE)
1983 info->status = FILE_DEDUPE_RANGE_DIFFERS;
1984 else if (deduped < 0)
1985 info->status = deduped;
1986 else
1987 info->bytes_deduped = len;
1988
22762711 1989next_fdput:
54dbc151 1990 fdput(dst_fd);
22762711 1991next_loop:
e62e560f 1992 if (fatal_signal_pending(current))
494633fa 1993 break;
54dbc151 1994 }
54dbc151
DW
1995 return ret;
1996}
1997EXPORT_SYMBOL(vfs_dedupe_file_range);