lseek_execute() doesn't need an inode passed to it
[linux-2.6-block.git] / fs / read_write.c
CommitLineData
1da177e4
LT
1/*
2 * linux/fs/read_write.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7#include <linux/slab.h>
8#include <linux/stat.h>
9#include <linux/fcntl.h>
10#include <linux/file.h>
11#include <linux/uio.h>
a27bb332 12#include <linux/aio.h>
0eeca283 13#include <linux/fsnotify.h>
1da177e4 14#include <linux/security.h>
630d9c47 15#include <linux/export.h>
1da177e4 16#include <linux/syscalls.h>
e28cc715 17#include <linux/pagemap.h>
d6b29d7c 18#include <linux/splice.h>
561c6731 19#include <linux/compat.h>
06ae43f3 20#include "internal.h"
1da177e4
LT
21
22#include <asm/uaccess.h>
23#include <asm/unistd.h>
24
c0bd14af
AV
25typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
26typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
27 unsigned long, loff_t);
28
4b6f5d20 29const struct file_operations generic_ro_fops = {
1da177e4 30 .llseek = generic_file_llseek,
543ade1f
BP
31 .read = do_sync_read,
32 .aio_read = generic_file_aio_read,
1da177e4 33 .mmap = generic_file_readonly_mmap,
534f2aaa 34 .splice_read = generic_file_splice_read,
1da177e4
LT
35};
36
37EXPORT_SYMBOL(generic_ro_fops);
38
cccb5a1e 39static inline int unsigned_offsets(struct file *file)
4a3956c7 40{
cccb5a1e 41 return file->f_mode & FMODE_UNSIGNED_OFFSET;
4a3956c7
KH
42}
43
2142914e 44static loff_t lseek_execute(struct file *file, loff_t offset, loff_t maxsize)
ef3d0fd2
AK
45{
46 if (offset < 0 && !unsigned_offsets(file))
47 return -EINVAL;
48 if (offset > maxsize)
49 return -EINVAL;
50
51 if (offset != file->f_pos) {
52 file->f_pos = offset;
53 file->f_version = 0;
54 }
55 return offset;
56}
57
3a8cff4f 58/**
5760495a 59 * generic_file_llseek_size - generic llseek implementation for regular files
3a8cff4f
CH
60 * @file: file structure to seek on
61 * @offset: file offset to seek to
965c8e59 62 * @whence: type of seek
e8b96eb5
ES
63 * @size: max size of this file in file system
64 * @eof: offset used for SEEK_END position
3a8cff4f 65 *
5760495a 66 * This is a variant of generic_file_llseek that allows passing in a custom
e8b96eb5 67 * maximum file size and a custom EOF position, for e.g. hashed directories
ef3d0fd2
AK
68 *
69 * Synchronization:
5760495a 70 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
ef3d0fd2
AK
71 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
72 * read/writes behave like SEEK_SET against seeks.
3a8cff4f 73 */
9465efc9 74loff_t
965c8e59 75generic_file_llseek_size(struct file *file, loff_t offset, int whence,
e8b96eb5 76 loff_t maxsize, loff_t eof)
1da177e4 77{
965c8e59 78 switch (whence) {
3a8cff4f 79 case SEEK_END:
e8b96eb5 80 offset += eof;
3a8cff4f
CH
81 break;
82 case SEEK_CUR:
5b6f1eb9
AK
83 /*
84 * Here we special-case the lseek(fd, 0, SEEK_CUR)
85 * position-querying operation. Avoid rewriting the "same"
86 * f_pos value back to the file because a concurrent read(),
87 * write() or lseek() might have altered it
88 */
89 if (offset == 0)
90 return file->f_pos;
ef3d0fd2
AK
91 /*
92 * f_lock protects against read/modify/write race with other
93 * SEEK_CURs. Note that parallel writes and reads behave
94 * like SEEK_SET.
95 */
96 spin_lock(&file->f_lock);
2142914e 97 offset = lseek_execute(file, file->f_pos + offset, maxsize);
ef3d0fd2
AK
98 spin_unlock(&file->f_lock);
99 return offset;
982d8165
JB
100 case SEEK_DATA:
101 /*
102 * In the generic case the entire file is data, so as long as
103 * offset isn't at the end of the file then the offset is data.
104 */
e8b96eb5 105 if (offset >= eof)
982d8165
JB
106 return -ENXIO;
107 break;
108 case SEEK_HOLE:
109 /*
110 * There is a virtual hole at the end of the file, so as long as
111 * offset isn't i_size or larger, return i_size.
112 */
e8b96eb5 113 if (offset >= eof)
982d8165 114 return -ENXIO;
e8b96eb5 115 offset = eof;
982d8165 116 break;
1da177e4 117 }
3a8cff4f 118
2142914e 119 return lseek_execute(file, offset, maxsize);
5760495a
AK
120}
121EXPORT_SYMBOL(generic_file_llseek_size);
122
123/**
124 * generic_file_llseek - generic llseek implementation for regular files
125 * @file: file structure to seek on
126 * @offset: file offset to seek to
965c8e59 127 * @whence: type of seek
5760495a
AK
128 *
129 * This is a generic implemenation of ->llseek useable for all normal local
130 * filesystems. It just updates the file offset to the value specified by
546ae2d2 131 * @offset and @whence.
5760495a 132 */
965c8e59 133loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
5760495a
AK
134{
135 struct inode *inode = file->f_mapping->host;
136
965c8e59 137 return generic_file_llseek_size(file, offset, whence,
e8b96eb5
ES
138 inode->i_sb->s_maxbytes,
139 i_size_read(inode));
1da177e4 140}
9465efc9 141EXPORT_SYMBOL(generic_file_llseek);
1da177e4 142
1bf9d14d
AV
143/**
144 * fixed_size_llseek - llseek implementation for fixed-sized devices
145 * @file: file structure to seek on
146 * @offset: file offset to seek to
147 * @whence: type of seek
148 * @size: size of the file
149 *
150 */
151loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
152{
153 switch (whence) {
154 case SEEK_SET: case SEEK_CUR: case SEEK_END:
155 return generic_file_llseek_size(file, offset, whence,
156 size, size);
157 default:
158 return -EINVAL;
159 }
160}
161EXPORT_SYMBOL(fixed_size_llseek);
162
ae6afc3f
B
163/**
164 * noop_llseek - No Operation Performed llseek implementation
165 * @file: file structure to seek on
166 * @offset: file offset to seek to
965c8e59 167 * @whence: type of seek
ae6afc3f
B
168 *
169 * This is an implementation of ->llseek useable for the rare special case when
170 * userspace expects the seek to succeed but the (device) file is actually not
171 * able to perform the seek. In this case you use noop_llseek() instead of
172 * falling back to the default implementation of ->llseek.
173 */
965c8e59 174loff_t noop_llseek(struct file *file, loff_t offset, int whence)
ae6afc3f
B
175{
176 return file->f_pos;
177}
178EXPORT_SYMBOL(noop_llseek);
179
965c8e59 180loff_t no_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
181{
182 return -ESPIPE;
183}
184EXPORT_SYMBOL(no_llseek);
185
965c8e59 186loff_t default_llseek(struct file *file, loff_t offset, int whence)
1da177e4 187{
496ad9aa 188 struct inode *inode = file_inode(file);
16abef0e 189 loff_t retval;
1da177e4 190
982d8165 191 mutex_lock(&inode->i_mutex);
965c8e59 192 switch (whence) {
7b8e8924 193 case SEEK_END:
982d8165 194 offset += i_size_read(inode);
1da177e4 195 break;
7b8e8924 196 case SEEK_CUR:
5b6f1eb9
AK
197 if (offset == 0) {
198 retval = file->f_pos;
199 goto out;
200 }
1da177e4 201 offset += file->f_pos;
982d8165
JB
202 break;
203 case SEEK_DATA:
204 /*
205 * In the generic case the entire file is data, so as
206 * long as offset isn't at the end of the file then the
207 * offset is data.
208 */
bacb2d81
DC
209 if (offset >= inode->i_size) {
210 retval = -ENXIO;
211 goto out;
212 }
982d8165
JB
213 break;
214 case SEEK_HOLE:
215 /*
216 * There is a virtual hole at the end of the file, so
217 * as long as offset isn't i_size or larger, return
218 * i_size.
219 */
bacb2d81
DC
220 if (offset >= inode->i_size) {
221 retval = -ENXIO;
222 goto out;
223 }
982d8165
JB
224 offset = inode->i_size;
225 break;
1da177e4
LT
226 }
227 retval = -EINVAL;
cccb5a1e 228 if (offset >= 0 || unsigned_offsets(file)) {
1da177e4
LT
229 if (offset != file->f_pos) {
230 file->f_pos = offset;
231 file->f_version = 0;
232 }
233 retval = offset;
234 }
5b6f1eb9 235out:
982d8165 236 mutex_unlock(&inode->i_mutex);
1da177e4
LT
237 return retval;
238}
239EXPORT_SYMBOL(default_llseek);
240
965c8e59 241loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
1da177e4
LT
242{
243 loff_t (*fn)(struct file *, loff_t, int);
244
245 fn = no_llseek;
246 if (file->f_mode & FMODE_LSEEK) {
1da177e4
LT
247 if (file->f_op && file->f_op->llseek)
248 fn = file->f_op->llseek;
249 }
965c8e59 250 return fn(file, offset, whence);
1da177e4
LT
251}
252EXPORT_SYMBOL(vfs_llseek);
253
965c8e59 254SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
1da177e4
LT
255{
256 off_t retval;
2903ff01
AV
257 struct fd f = fdget(fd);
258 if (!f.file)
259 return -EBADF;
1da177e4
LT
260
261 retval = -EINVAL;
965c8e59
AM
262 if (whence <= SEEK_MAX) {
263 loff_t res = vfs_llseek(f.file, offset, whence);
1da177e4
LT
264 retval = res;
265 if (res != (loff_t)retval)
266 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
267 }
2903ff01 268 fdput(f);
1da177e4
LT
269 return retval;
270}
271
561c6731
AV
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for lseek: takes a compat_off_t offset and forwards
 * all three arguments to sys_lseek().
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
#endif
278
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek: seek using an offset supplied as two unsigned longs and store
 * the resulting loff_t position through @result.
 *
 * Returns 0 on success, -EBADF for an invalid descriptor, -EINVAL when
 * @whence is above SEEK_MAX, the (negative) error from vfs_llseek(), or
 * -EFAULT when the result cannot be copied to user space.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	struct fd f = fdget(fd);
	int retval = -EINVAL;

	if (!f.file)
		return -EBADF;

	if (whence <= SEEK_MAX) {
		loff_t target = ((loff_t) offset_high << 32) | offset_low;
		loff_t offset = vfs_llseek(f.file, target, whence);

		if (offset < 0)
			retval = (int)offset;
		else if (copy_to_user(result, &offset, sizeof(offset)))
			retval = -EFAULT;
		else
			retval = 0;
	}

	fdput(f);
	return retval;
}
#endif
309
e28cc715
LT
310/*
311 * rw_verify_area doesn't like huge counts. We limit
312 * them to something that fits in "int" so that others
313 * won't have to do range checks all the time.
314 */
68d70d03 315int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
1da177e4
LT
316{
317 struct inode *inode;
318 loff_t pos;
c43e259c 319 int retval = -EINVAL;
1da177e4 320
496ad9aa 321 inode = file_inode(file);
e28cc715 322 if (unlikely((ssize_t) count < 0))
c43e259c 323 return retval;
1da177e4 324 pos = *ppos;
cccb5a1e
AV
325 if (unlikely(pos < 0)) {
326 if (!unsigned_offsets(file))
327 return retval;
328 if (count >= -pos) /* both values are in 0..LLONG_MAX */
329 return -EOVERFLOW;
330 } else if (unlikely((loff_t) (pos + count) < 0)) {
331 if (!unsigned_offsets(file))
4a3956c7
KH
332 return retval;
333 }
1da177e4 334
a16877ca 335 if (unlikely(inode->i_flock && mandatory_lock(inode))) {
c43e259c 336 retval = locks_mandatory_area(
e28cc715
LT
337 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
338 inode, file, pos, count);
339 if (retval < 0)
340 return retval;
341 }
c43e259c
JM
342 retval = security_file_permission(file,
343 read_write == READ ? MAY_READ : MAY_WRITE);
344 if (retval)
345 return retval;
e28cc715 346 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
1da177e4
LT
347}
348
349ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
350{
027445c3 351 struct iovec iov = { .iov_base = buf, .iov_len = len };
1da177e4
LT
352 struct kiocb kiocb;
353 ssize_t ret;
354
355 init_sync_kiocb(&kiocb, filp);
356 kiocb.ki_pos = *ppos;
027445c3 357 kiocb.ki_left = len;
61964eba 358 kiocb.ki_nbytes = len;
027445c3 359
41003a7b 360 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1da177e4
LT
361 if (-EIOCBQUEUED == ret)
362 ret = wait_on_sync_kiocb(&kiocb);
363 *ppos = kiocb.ki_pos;
364 return ret;
365}
366
367EXPORT_SYMBOL(do_sync_read);
368
369ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
370{
371 ssize_t ret;
372
373 if (!(file->f_mode & FMODE_READ))
374 return -EBADF;
375 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
376 return -EINVAL;
377 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
378 return -EFAULT;
379
380 ret = rw_verify_area(READ, file, pos, count);
e28cc715
LT
381 if (ret >= 0) {
382 count = ret;
c43e259c
JM
383 if (file->f_op->read)
384 ret = file->f_op->read(file, buf, count, pos);
385 else
386 ret = do_sync_read(file, buf, count, pos);
387 if (ret > 0) {
2a12a9d7 388 fsnotify_access(file);
c43e259c 389 add_rchar(current, ret);
1da177e4 390 }
c43e259c 391 inc_syscr(current);
1da177e4
LT
392 }
393
394 return ret;
395}
396
397EXPORT_SYMBOL(vfs_read);
398
399ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
400{
027445c3 401 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
1da177e4
LT
402 struct kiocb kiocb;
403 ssize_t ret;
404
405 init_sync_kiocb(&kiocb, filp);
406 kiocb.ki_pos = *ppos;
027445c3 407 kiocb.ki_left = len;
61964eba 408 kiocb.ki_nbytes = len;
027445c3 409
41003a7b 410 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1da177e4
LT
411 if (-EIOCBQUEUED == ret)
412 ret = wait_on_sync_kiocb(&kiocb);
413 *ppos = kiocb.ki_pos;
414 return ret;
415}
416
417EXPORT_SYMBOL(do_sync_write);
418
06ae43f3
AV
419ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
420{
421 mm_segment_t old_fs;
422 const char __user *p;
423 ssize_t ret;
424
3e84f48e
AV
425 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
426 return -EINVAL;
427
06ae43f3
AV
428 old_fs = get_fs();
429 set_fs(get_ds());
430 p = (__force const char __user *)buf;
431 if (count > MAX_RW_COUNT)
432 count = MAX_RW_COUNT;
433 if (file->f_op->write)
434 ret = file->f_op->write(file, p, count, pos);
435 else
436 ret = do_sync_write(file, p, count, pos);
437 set_fs(old_fs);
438 if (ret > 0) {
439 fsnotify_modify(file);
440 add_wchar(current, ret);
441 }
442 inc_syscw(current);
443 return ret;
444}
445
1da177e4
LT
446ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
447{
448 ssize_t ret;
449
450 if (!(file->f_mode & FMODE_WRITE))
451 return -EBADF;
452 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
453 return -EINVAL;
454 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
455 return -EFAULT;
456
457 ret = rw_verify_area(WRITE, file, pos, count);
e28cc715
LT
458 if (ret >= 0) {
459 count = ret;
03d95eb2 460 file_start_write(file);
c43e259c
JM
461 if (file->f_op->write)
462 ret = file->f_op->write(file, buf, count, pos);
463 else
464 ret = do_sync_write(file, buf, count, pos);
465 if (ret > 0) {
2a12a9d7 466 fsnotify_modify(file);
c43e259c 467 add_wchar(current, ret);
1da177e4 468 }
c43e259c 469 inc_syscw(current);
03d95eb2 470 file_end_write(file);
1da177e4
LT
471 }
472
473 return ret;
474}
475
476EXPORT_SYMBOL(vfs_write);
477
478static inline loff_t file_pos_read(struct file *file)
479{
480 return file->f_pos;
481}
482
483static inline void file_pos_write(struct file *file, loff_t pos)
484{
485 file->f_pos = pos;
486}
487
3cdad428 488SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
1da177e4 489{
2903ff01 490 struct fd f = fdget(fd);
1da177e4 491 ssize_t ret = -EBADF;
1da177e4 492
2903ff01
AV
493 if (f.file) {
494 loff_t pos = file_pos_read(f.file);
495 ret = vfs_read(f.file, buf, count, &pos);
5faf153e
AV
496 if (ret >= 0)
497 file_pos_write(f.file, pos);
2903ff01 498 fdput(f);
1da177e4 499 }
1da177e4
LT
500 return ret;
501}
1da177e4 502
3cdad428
HC
503SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
504 size_t, count)
1da177e4 505{
2903ff01 506 struct fd f = fdget(fd);
1da177e4 507 ssize_t ret = -EBADF;
1da177e4 508
2903ff01
AV
509 if (f.file) {
510 loff_t pos = file_pos_read(f.file);
511 ret = vfs_write(f.file, buf, count, &pos);
5faf153e
AV
512 if (ret >= 0)
513 file_pos_write(f.file, pos);
2903ff01 514 fdput(f);
1da177e4
LT
515 }
516
517 return ret;
518}
519
4a0fd5bf
AV
520SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
521 size_t, count, loff_t, pos)
1da177e4 522{
2903ff01 523 struct fd f;
1da177e4 524 ssize_t ret = -EBADF;
1da177e4
LT
525
526 if (pos < 0)
527 return -EINVAL;
528
2903ff01
AV
529 f = fdget(fd);
530 if (f.file) {
1da177e4 531 ret = -ESPIPE;
2903ff01
AV
532 if (f.file->f_mode & FMODE_PREAD)
533 ret = vfs_read(f.file, buf, count, &pos);
534 fdput(f);
1da177e4
LT
535 }
536
537 return ret;
538}
539
4a0fd5bf
AV
540SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
541 size_t, count, loff_t, pos)
1da177e4 542{
2903ff01 543 struct fd f;
1da177e4 544 ssize_t ret = -EBADF;
1da177e4
LT
545
546 if (pos < 0)
547 return -EINVAL;
548
2903ff01
AV
549 f = fdget(fd);
550 if (f.file) {
1da177e4 551 ret = -ESPIPE;
2903ff01
AV
552 if (f.file->f_mode & FMODE_PWRITE)
553 ret = vfs_write(f.file, buf, count, &pos);
554 fdput(f);
1da177e4
LT
555 }
556
557 return ret;
558}
559
/*
 * Reduce an iovec's length in-place. Return the resulting number of segments
 *
 * Walks at most @nr_segs segments of @iov.  The first segment whose end
 * would reach or pass @to bytes is truncated so that the running total is
 * exactly @to, and the walk stops there.  If the segments together are
 * shorter than @to, nothing is modified and @nr_segs is returned.
 *
 * The comparison is written as "iov_len >= to - len" rather than
 * "len + iov_len >= to": len never exceeds @to inside the loop, so the
 * subtraction cannot wrap, whereas the addition could wrap size_t for a
 * very large segment and cause it to be skipped instead of truncated.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		if (iov->iov_len >= to - len) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
19295529 579EXPORT_SYMBOL(iov_shorten);
1da177e4 580
72ec3516 581static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
ee0b3e67
BP
582 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
583{
584 struct kiocb kiocb;
585 ssize_t ret;
586
587 init_sync_kiocb(&kiocb, filp);
588 kiocb.ki_pos = *ppos;
589 kiocb.ki_left = len;
590 kiocb.ki_nbytes = len;
591
41003a7b 592 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
ee0b3e67
BP
593 if (ret == -EIOCBQUEUED)
594 ret = wait_on_sync_kiocb(&kiocb);
595 *ppos = kiocb.ki_pos;
596 return ret;
597}
598
599/* Do it by hand, with file-ops */
72ec3516 600static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
ee0b3e67
BP
601 unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
602{
603 struct iovec *vector = iov;
604 ssize_t ret = 0;
605
606 while (nr_segs > 0) {
607 void __user *base;
608 size_t len;
609 ssize_t nr;
610
611 base = vector->iov_base;
612 len = vector->iov_len;
613 vector++;
614 nr_segs--;
615
616 nr = fn(filp, base, len, ppos);
617
618 if (nr < 0) {
619 if (!ret)
620 ret = nr;
621 break;
622 }
623 ret += nr;
624 if (nr != len)
625 break;
626 }
627
628 return ret;
629}
630
1da177e4
LT
/*
 * A write operation does a read from user space and vice versa, so the
 * access_ok() direction is the opposite of the I/O type.
 */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
633
eed4e51f
BP
634ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
635 unsigned long nr_segs, unsigned long fast_segs,
636 struct iovec *fast_pointer,
ac34ebb3 637 struct iovec **ret_pointer)
435f49a5 638{
eed4e51f 639 unsigned long seg;
435f49a5 640 ssize_t ret;
eed4e51f
BP
641 struct iovec *iov = fast_pointer;
642
435f49a5
LT
643 /*
644 * SuS says "The readv() function *may* fail if the iovcnt argument
645 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
646 * traditionally returned zero for zero segments, so...
647 */
eed4e51f
BP
648 if (nr_segs == 0) {
649 ret = 0;
435f49a5 650 goto out;
eed4e51f
BP
651 }
652
435f49a5
LT
653 /*
654 * First get the "struct iovec" from user memory and
655 * verify all the pointers
656 */
eed4e51f
BP
657 if (nr_segs > UIO_MAXIOV) {
658 ret = -EINVAL;
435f49a5 659 goto out;
eed4e51f
BP
660 }
661 if (nr_segs > fast_segs) {
435f49a5 662 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
eed4e51f
BP
663 if (iov == NULL) {
664 ret = -ENOMEM;
435f49a5 665 goto out;
eed4e51f 666 }
435f49a5 667 }
eed4e51f
BP
668 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
669 ret = -EFAULT;
435f49a5 670 goto out;
eed4e51f
BP
671 }
672
435f49a5 673 /*
eed4e51f
BP
674 * According to the Single Unix Specification we should return EINVAL
675 * if an element length is < 0 when cast to ssize_t or if the
676 * total length would overflow the ssize_t return value of the
677 * system call.
435f49a5
LT
678 *
679 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
680 * overflow case.
681 */
eed4e51f 682 ret = 0;
435f49a5
LT
683 for (seg = 0; seg < nr_segs; seg++) {
684 void __user *buf = iov[seg].iov_base;
685 ssize_t len = (ssize_t)iov[seg].iov_len;
eed4e51f
BP
686
687 /* see if we we're about to use an invalid len or if
688 * it's about to overflow ssize_t */
435f49a5 689 if (len < 0) {
eed4e51f 690 ret = -EINVAL;
435f49a5 691 goto out;
eed4e51f 692 }
ac34ebb3 693 if (type >= 0
fcf63409 694 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
eed4e51f 695 ret = -EFAULT;
435f49a5
LT
696 goto out;
697 }
698 if (len > MAX_RW_COUNT - ret) {
699 len = MAX_RW_COUNT - ret;
700 iov[seg].iov_len = len;
eed4e51f 701 }
eed4e51f 702 ret += len;
435f49a5 703 }
eed4e51f
BP
704out:
705 *ret_pointer = iov;
706 return ret;
707}
708
1da177e4
LT
709static ssize_t do_readv_writev(int type, struct file *file,
710 const struct iovec __user * uvector,
711 unsigned long nr_segs, loff_t *pos)
712{
1da177e4
LT
713 size_t tot_len;
714 struct iovec iovstack[UIO_FASTIOV];
ee0b3e67 715 struct iovec *iov = iovstack;
1da177e4 716 ssize_t ret;
1da177e4
LT
717 io_fn_t fn;
718 iov_fn_t fnv;
719
eed4e51f
BP
720 if (!file->f_op) {
721 ret = -EINVAL;
1da177e4 722 goto out;
1da177e4 723 }
1da177e4 724
eed4e51f 725 ret = rw_copy_check_uvector(type, uvector, nr_segs,
ac34ebb3 726 ARRAY_SIZE(iovstack), iovstack, &iov);
eed4e51f 727 if (ret <= 0)
1da177e4 728 goto out;
1da177e4 729
eed4e51f 730 tot_len = ret;
1da177e4 731 ret = rw_verify_area(type, file, pos, tot_len);
e28cc715 732 if (ret < 0)
411b67b4 733 goto out;
1da177e4
LT
734
735 fnv = NULL;
736 if (type == READ) {
737 fn = file->f_op->read;
ee0b3e67 738 fnv = file->f_op->aio_read;
1da177e4
LT
739 } else {
740 fn = (io_fn_t)file->f_op->write;
ee0b3e67 741 fnv = file->f_op->aio_write;
03d95eb2 742 file_start_write(file);
1da177e4
LT
743 }
744
ee0b3e67
BP
745 if (fnv)
746 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
747 pos, fnv);
748 else
749 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
1da177e4 750
03d95eb2
AV
751 if (type != READ)
752 file_end_write(file);
753
1da177e4
LT
754out:
755 if (iov != iovstack)
756 kfree(iov);
0eeca283
RL
757 if ((ret + (type == READ)) > 0) {
758 if (type == READ)
2a12a9d7 759 fsnotify_access(file);
0eeca283 760 else
2a12a9d7 761 fsnotify_modify(file);
0eeca283 762 }
1da177e4 763 return ret;
1da177e4
LT
764}
765
766ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
767 unsigned long vlen, loff_t *pos)
768{
769 if (!(file->f_mode & FMODE_READ))
770 return -EBADF;
ee0b3e67 771 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
1da177e4
LT
772 return -EINVAL;
773
774 return do_readv_writev(READ, file, vec, vlen, pos);
775}
776
777EXPORT_SYMBOL(vfs_readv);
778
779ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
780 unsigned long vlen, loff_t *pos)
781{
782 if (!(file->f_mode & FMODE_WRITE))
783 return -EBADF;
ee0b3e67 784 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1da177e4
LT
785 return -EINVAL;
786
787 return do_readv_writev(WRITE, file, vec, vlen, pos);
788}
789
790EXPORT_SYMBOL(vfs_writev);
791
3cdad428
HC
792SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
793 unsigned long, vlen)
1da177e4 794{
2903ff01 795 struct fd f = fdget(fd);
1da177e4 796 ssize_t ret = -EBADF;
1da177e4 797
2903ff01
AV
798 if (f.file) {
799 loff_t pos = file_pos_read(f.file);
800 ret = vfs_readv(f.file, vec, vlen, &pos);
5faf153e
AV
801 if (ret >= 0)
802 file_pos_write(f.file, pos);
2903ff01 803 fdput(f);
1da177e4
LT
804 }
805
806 if (ret > 0)
4b98d11b
AD
807 add_rchar(current, ret);
808 inc_syscr(current);
1da177e4
LT
809 return ret;
810}
811
3cdad428
HC
812SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
813 unsigned long, vlen)
1da177e4 814{
2903ff01 815 struct fd f = fdget(fd);
1da177e4 816 ssize_t ret = -EBADF;
1da177e4 817
2903ff01
AV
818 if (f.file) {
819 loff_t pos = file_pos_read(f.file);
820 ret = vfs_writev(f.file, vec, vlen, &pos);
5faf153e
AV
821 if (ret >= 0)
822 file_pos_write(f.file, pos);
2903ff01 823 fdput(f);
1da177e4
LT
824 }
825
826 if (ret > 0)
4b98d11b
AD
827 add_wchar(current, ret);
828 inc_syscw(current);
1da177e4
LT
829 return ret;
830}
831
601cc11d
LT
832static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
833{
834#define HALF_LONG_BITS (BITS_PER_LONG / 2)
835 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
836}
837
f3554f4b 838SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
601cc11d 839 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
f3554f4b 840{
601cc11d 841 loff_t pos = pos_from_hilo(pos_h, pos_l);
2903ff01 842 struct fd f;
f3554f4b 843 ssize_t ret = -EBADF;
f3554f4b
GH
844
845 if (pos < 0)
846 return -EINVAL;
847
2903ff01
AV
848 f = fdget(fd);
849 if (f.file) {
f3554f4b 850 ret = -ESPIPE;
2903ff01
AV
851 if (f.file->f_mode & FMODE_PREAD)
852 ret = vfs_readv(f.file, vec, vlen, &pos);
853 fdput(f);
f3554f4b
GH
854 }
855
856 if (ret > 0)
857 add_rchar(current, ret);
858 inc_syscr(current);
859 return ret;
860}
861
862SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
601cc11d 863 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
f3554f4b 864{
601cc11d 865 loff_t pos = pos_from_hilo(pos_h, pos_l);
2903ff01 866 struct fd f;
f3554f4b 867 ssize_t ret = -EBADF;
f3554f4b
GH
868
869 if (pos < 0)
870 return -EINVAL;
871
2903ff01
AV
872 f = fdget(fd);
873 if (f.file) {
f3554f4b 874 ret = -ESPIPE;
2903ff01
AV
875 if (f.file->f_mode & FMODE_PWRITE)
876 ret = vfs_writev(f.file, vec, vlen, &pos);
877 fdput(f);
f3554f4b
GH
878 }
879
880 if (ret > 0)
881 add_wchar(current, ret);
882 inc_syscw(current);
883 return ret;
884}
885
72ec3516
AV
886#ifdef CONFIG_COMPAT
887
888static ssize_t compat_do_readv_writev(int type, struct file *file,
889 const struct compat_iovec __user *uvector,
890 unsigned long nr_segs, loff_t *pos)
891{
892 compat_ssize_t tot_len;
893 struct iovec iovstack[UIO_FASTIOV];
894 struct iovec *iov = iovstack;
895 ssize_t ret;
896 io_fn_t fn;
897 iov_fn_t fnv;
898
899 ret = -EINVAL;
900 if (!file->f_op)
901 goto out;
902
903 ret = -EFAULT;
904 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
905 goto out;
906
907 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
908 UIO_FASTIOV, iovstack, &iov);
909 if (ret <= 0)
910 goto out;
911
912 tot_len = ret;
913 ret = rw_verify_area(type, file, pos, tot_len);
914 if (ret < 0)
915 goto out;
916
917 fnv = NULL;
918 if (type == READ) {
919 fn = file->f_op->read;
920 fnv = file->f_op->aio_read;
921 } else {
922 fn = (io_fn_t)file->f_op->write;
923 fnv = file->f_op->aio_write;
03d95eb2 924 file_start_write(file);
72ec3516
AV
925 }
926
03d95eb2 927 if (fnv)
72ec3516
AV
928 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
929 pos, fnv);
03d95eb2 930 else
72ec3516
AV
931 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
932
03d95eb2
AV
933 if (type != READ)
934 file_end_write(file);
935
72ec3516
AV
936out:
937 if (iov != iovstack)
938 kfree(iov);
939 if ((ret + (type == READ)) > 0) {
940 if (type == READ)
941 fsnotify_access(file);
942 else
943 fsnotify_modify(file);
944 }
945 return ret;
946}
947
948static size_t compat_readv(struct file *file,
949 const struct compat_iovec __user *vec,
950 unsigned long vlen, loff_t *pos)
951{
952 ssize_t ret = -EBADF;
953
954 if (!(file->f_mode & FMODE_READ))
955 goto out;
956
957 ret = -EINVAL;
958 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
959 goto out;
960
961 ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
962
963out:
964 if (ret > 0)
965 add_rchar(current, ret);
966 inc_syscr(current);
967 return ret;
968}
969
970COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
971 const struct compat_iovec __user *,vec,
972 unsigned long, vlen)
973{
974 struct fd f = fdget(fd);
975 ssize_t ret;
976 loff_t pos;
977
978 if (!f.file)
979 return -EBADF;
980 pos = f.file->f_pos;
981 ret = compat_readv(f.file, vec, vlen, &pos);
5faf153e
AV
982 if (ret >= 0)
983 f.file->f_pos = pos;
72ec3516
AV
984 fdput(f);
985 return ret;
986}
987
988COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
989 const struct compat_iovec __user *,vec,
990 unsigned long, vlen, loff_t, pos)
991{
992 struct fd f;
993 ssize_t ret;
994
995 if (pos < 0)
996 return -EINVAL;
997 f = fdget(fd);
998 if (!f.file)
999 return -EBADF;
1000 ret = -ESPIPE;
1001 if (f.file->f_mode & FMODE_PREAD)
1002 ret = compat_readv(f.file, vec, vlen, &pos);
1003 fdput(f);
1004 return ret;
1005}
1006
1007COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
1008 const struct compat_iovec __user *,vec,
1009 unsigned long, vlen, u32, pos_low, u32, pos_high)
1010{
1011 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1012 return compat_sys_preadv64(fd, vec, vlen, pos);
1013}
1014
1015static size_t compat_writev(struct file *file,
1016 const struct compat_iovec __user *vec,
1017 unsigned long vlen, loff_t *pos)
1018{
1019 ssize_t ret = -EBADF;
1020
1021 if (!(file->f_mode & FMODE_WRITE))
1022 goto out;
1023
1024 ret = -EINVAL;
1025 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
1026 goto out;
1027
1028 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1029
1030out:
1031 if (ret > 0)
1032 add_wchar(current, ret);
1033 inc_syscw(current);
1034 return ret;
1035}
1036
1037COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
1038 const struct compat_iovec __user *, vec,
1039 unsigned long, vlen)
1040{
1041 struct fd f = fdget(fd);
1042 ssize_t ret;
1043 loff_t pos;
1044
1045 if (!f.file)
1046 return -EBADF;
1047 pos = f.file->f_pos;
1048 ret = compat_writev(f.file, vec, vlen, &pos);
5faf153e
AV
1049 if (ret >= 0)
1050 f.file->f_pos = pos;
72ec3516
AV
1051 fdput(f);
1052 return ret;
1053}
1054
1055COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
1056 const struct compat_iovec __user *,vec,
1057 unsigned long, vlen, loff_t, pos)
1058{
1059 struct fd f;
1060 ssize_t ret;
1061
1062 if (pos < 0)
1063 return -EINVAL;
1064 f = fdget(fd);
1065 if (!f.file)
1066 return -EBADF;
1067 ret = -ESPIPE;
1068 if (f.file->f_mode & FMODE_PWRITE)
1069 ret = compat_writev(f.file, vec, vlen, &pos);
1070 fdput(f);
1071 return ret;
1072}
1073
1074COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
1075 const struct compat_iovec __user *,vec,
1076 unsigned long, vlen, u32, pos_low, u32, pos_high)
1077{
1078 loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1079 return compat_sys_pwritev64(fd, vec, vlen, pos);
1080}
1081#endif
1082
19f4fc3a
AV
1083static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1084 size_t count, loff_t max)
1da177e4 1085{
2903ff01
AV
1086 struct fd in, out;
1087 struct inode *in_inode, *out_inode;
1da177e4 1088 loff_t pos;
7995bd28 1089 loff_t out_pos;
1da177e4 1090 ssize_t retval;
2903ff01 1091 int fl;
1da177e4
LT
1092
1093 /*
1094 * Get input file, and verify that it is ok..
1095 */
1096 retval = -EBADF;
2903ff01
AV
1097 in = fdget(in_fd);
1098 if (!in.file)
1da177e4 1099 goto out;
2903ff01 1100 if (!(in.file->f_mode & FMODE_READ))
1da177e4 1101 goto fput_in;
1da177e4 1102 retval = -ESPIPE;
7995bd28
AV
1103 if (!ppos) {
1104 pos = in.file->f_pos;
1105 } else {
1106 pos = *ppos;
2903ff01 1107 if (!(in.file->f_mode & FMODE_PREAD))
1da177e4 1108 goto fput_in;
7995bd28
AV
1109 }
1110 retval = rw_verify_area(READ, in.file, &pos, count);
e28cc715 1111 if (retval < 0)
1da177e4 1112 goto fput_in;
e28cc715 1113 count = retval;
1da177e4 1114
1da177e4
LT
1115 /*
1116 * Get output file, and verify that it is ok..
1117 */
1118 retval = -EBADF;
2903ff01
AV
1119 out = fdget(out_fd);
1120 if (!out.file)
1da177e4 1121 goto fput_in;
2903ff01 1122 if (!(out.file->f_mode & FMODE_WRITE))
1da177e4
LT
1123 goto fput_out;
1124 retval = -EINVAL;
496ad9aa
AV
1125 in_inode = file_inode(in.file);
1126 out_inode = file_inode(out.file);
7995bd28
AV
1127 out_pos = out.file->f_pos;
1128 retval = rw_verify_area(WRITE, out.file, &out_pos, count);
e28cc715 1129 if (retval < 0)
1da177e4 1130 goto fput_out;
e28cc715 1131 count = retval;
1da177e4 1132
1da177e4
LT
1133 if (!max)
1134 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1135
1da177e4
LT
1136 if (unlikely(pos + count > max)) {
1137 retval = -EOVERFLOW;
1138 if (pos >= max)
1139 goto fput_out;
1140 count = max - pos;
1141 }
1142
d96e6e71 1143 fl = 0;
534f2aaa 1144#if 0
d96e6e71
JA
1145 /*
1146 * We need to debate whether we can enable this or not. The
1147 * man page documents EAGAIN return for the output at least,
1148 * and the application is arguably buggy if it doesn't expect
1149 * EAGAIN on a non-blocking file descriptor.
1150 */
2903ff01 1151 if (in.file->f_flags & O_NONBLOCK)
d96e6e71 1152 fl = SPLICE_F_NONBLOCK;
534f2aaa 1153#endif
50cd2c57 1154 file_start_write(out.file);
7995bd28 1155 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
50cd2c57 1156 file_end_write(out.file);
1da177e4
LT
1157
1158 if (retval > 0) {
4b98d11b
AD
1159 add_rchar(current, retval);
1160 add_wchar(current, retval);
a68c2f12
SW
1161 fsnotify_access(in.file);
1162 fsnotify_modify(out.file);
7995bd28
AV
1163 out.file->f_pos = out_pos;
1164 if (ppos)
1165 *ppos = pos;
1166 else
1167 in.file->f_pos = pos;
1da177e4 1168 }
1da177e4 1169
4b98d11b
AD
1170 inc_syscr(current);
1171 inc_syscw(current);
7995bd28 1172 if (pos > max)
1da177e4
LT
1173 retval = -EOVERFLOW;
1174
1175fput_out:
2903ff01 1176 fdput(out);
1da177e4 1177fput_in:
2903ff01 1178 fdput(in);
1da177e4
LT
1179out:
1180 return retval;
1181}
1182
002c8976 1183SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1da177e4
LT
1184{
1185 loff_t pos;
1186 off_t off;
1187 ssize_t ret;
1188
1189 if (offset) {
1190 if (unlikely(get_user(off, offset)))
1191 return -EFAULT;
1192 pos = off;
1193 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1194 if (unlikely(put_user(pos, offset)))
1195 return -EFAULT;
1196 return ret;
1197 }
1198
1199 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1200}
1201
002c8976 1202SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1da177e4
LT
1203{
1204 loff_t pos;
1205 ssize_t ret;
1206
1207 if (offset) {
1208 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1209 return -EFAULT;
1210 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1211 if (unlikely(put_user(pos, offset)))
1212 return -EFAULT;
1213 return ret;
1214 }
1215
1216 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1217}
19f4fc3a
AV
1218
1219#ifdef CONFIG_COMPAT
1220COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1221 compat_off_t __user *, offset, compat_size_t, count)
1222{
1223 loff_t pos;
1224 off_t off;
1225 ssize_t ret;
1226
1227 if (offset) {
1228 if (unlikely(get_user(off, offset)))
1229 return -EFAULT;
1230 pos = off;
1231 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1232 if (unlikely(put_user(pos, offset)))
1233 return -EFAULT;
1234 return ret;
1235 }
1236
1237 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1238}
1239
1240COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1241 compat_loff_t __user *, offset, compat_size_t, count)
1242{
1243 loff_t pos;
1244 ssize_t ret;
1245
1246 if (offset) {
1247 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1248 return -EFAULT;
1249 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1250 if (unlikely(put_user(pos, offset)))
1251 return -EFAULT;
1252 return ret;
1253 }
1254
1255 return do_sendfile(out_fd, in_fd, NULL, count, 0);
1256}
1257#endif