/*
 * fs/orangefs/file.c — at commit "orangefs: make precopy_buffers() take
 * iov_iter" (from the linux-block.git tree).
 */
1/*
2 * (C) 2001 Clemson University and The University of Chicago
3 *
4 * See COPYING in top-level directory.
5 */
6
7/*
8 * Linux VFS file operations.
9 */
10
11#include "protocol.h"
12#include "pvfs2-kernel.h"
13#include "pvfs2-bufmap.h"
14#include <linux/fs.h>
15#include <linux/pagemap.h>
16
/*
 * Mark @op's I/O as complete and wake any waiter sleeping on
 * op->io_completion_waitq.  io_completed is flipped under op->lock so the
 * waiter's check-then-sleep cannot race with this wakeup.  NOTE(review):
 * callers treat the op as daemon-owned after this — see the comment at the
 * wake_up_daemon_for_return() call site in wait_for_direct_io().
 */
#define wake_up_daemon_for_return(op) \
do { \
	spin_lock(&op->lock); \
	op->io_completed = 1; \
	spin_unlock(&op->lock); \
	wake_up_interruptible(&op->io_completion_waitq);\
} while (0)
24
25/*
26 * Copy to client-core's address space from the buffers specified
27 * by the iovec upto total_size bytes.
28 * NOTE: the iovector can either contain addresses which
29 * can futher be kernel-space or user-space addresses.
30 * or it can pointers to struct page's
31 */
32static int precopy_buffers(struct pvfs2_bufmap *bufmap,
33 int buffer_index,
a5c126a5 34 struct iov_iter *iter,
4d1c4404 35 size_t total_size)
5db11c21
MM
36{
37 int ret = 0;
5db11c21
MM
38 /*
39 * copy data from application/kernel by pulling it out
40 * of the iovec.
41 */
4d1c4404
MM
42
43
44 if (total_size) {
4d1c4404 45 ret = pvfs_bufmap_copy_from_iovec(bufmap,
a5c126a5 46 iter,
4d1c4404
MM
47 buffer_index,
48 total_size);
49 if (ret < 0)
50 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
51 __func__,
52 (long)ret);
4d1c4404
MM
53 }
54
5db11c21
MM
55 if (ret < 0)
56 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
57 __func__,
58 (long)ret);
59 return ret;
60}
61
62/*
63 * Copy from client-core's address space to the buffers specified
64 * by the iovec upto total_size bytes.
65 * NOTE: the iovector can either contain addresses which
66 * can futher be kernel-space or user-space addresses.
67 * or it can pointers to struct page's
68 */
69static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
70 int buffer_index,
5f0e3c95 71 struct iov_iter *iter,
4d1c4404 72 size_t total_size)
5db11c21
MM
73{
74 int ret = 0;
5db11c21
MM
75 /*
76 * copy data to application/kernel by pushing it out to
77 * the iovec. NOTE; target buffers can be addresses or
78 * struct page pointers.
79 */
80 if (total_size) {
4d1c4404 81 ret = pvfs_bufmap_copy_to_iovec(bufmap,
5f0e3c95 82 iter,
5c278228
AV
83 buffer_index,
84 total_size);
5db11c21 85 if (ret < 0)
4d1c4404 86 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
5db11c21
MM
87 __func__,
88 (long)ret);
89 }
90 return ret;
91}
92
93/*
94 * Post and wait for the I/O upcall to finish
95 */
96static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
97 loff_t *offset, struct iovec *vec, unsigned long nr_segs,
4d1c4404 98 size_t total_size, loff_t readahead_size)
5db11c21
MM
99{
100 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
101 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
102 struct pvfs2_bufmap *bufmap = NULL;
103 struct pvfs2_kernel_op_s *new_op = NULL;
104 int buffer_index = -1;
105 ssize_t ret;
106
107 new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
108 if (!new_op) {
109 ret = -ENOMEM;
110 goto out;
111 }
112 /* synchronous I/O */
113 new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
114 new_op->upcall.req.io.readahead_size = readahead_size;
115 new_op->upcall.req.io.io_type = type;
116 new_op->upcall.req.io.refn = pvfs2_inode->refn;
117
118populate_shared_memory:
119 /* get a shared buffer index */
120 ret = pvfs_bufmap_get(&bufmap, &buffer_index);
121 if (ret < 0) {
122 gossip_debug(GOSSIP_FILE_DEBUG,
123 "%s: pvfs_bufmap_get failure (%ld)\n",
124 __func__, (long)ret);
125 goto out;
126 }
127 gossip_debug(GOSSIP_FILE_DEBUG,
128 "%s(%pU): GET op %p -> buffer_index %d\n",
129 __func__,
130 handle,
131 new_op,
132 buffer_index);
133
134 new_op->uses_shared_memory = 1;
135 new_op->upcall.req.io.buf_index = buffer_index;
136 new_op->upcall.req.io.count = total_size;
137 new_op->upcall.req.io.offset = *offset;
138
139 gossip_debug(GOSSIP_FILE_DEBUG,
4d1c4404 140 "%s(%pU): nr_segs %lu, offset: %llu total_size: %zd\n",
5db11c21
MM
141 __func__,
142 handle,
5db11c21
MM
143 nr_segs,
144 llu(*offset),
145 total_size);
146 /*
147 * Stage 1: copy the buffers into client-core's address space
148 * precopy_buffers only pertains to writes.
149 */
150 if (type == PVFS_IO_WRITE) {
a5c126a5
AV
151 struct iov_iter iter;
152 iov_iter_init(&iter, WRITE, vec, nr_segs, total_size);
5db11c21
MM
153 ret = precopy_buffers(bufmap,
154 buffer_index,
a5c126a5 155 &iter,
4d1c4404 156 total_size);
5db11c21
MM
157 if (ret < 0)
158 goto out;
159 }
160
161 gossip_debug(GOSSIP_FILE_DEBUG,
162 "%s(%pU): Calling post_io_request with tag (%llu)\n",
163 __func__,
164 handle,
165 llu(new_op->tag));
166
167 /* Stage 2: Service the I/O operation */
168 ret = service_operation(new_op,
169 type == PVFS_IO_WRITE ?
170 "file_write" :
171 "file_read",
172 get_interruptible_flag(inode));
173
174 /*
175 * If service_operation() returns -EAGAIN #and# the operation was
176 * purged from pvfs2_request_list or htable_ops_in_progress, then
177 * we know that the client was restarted, causing the shared memory
178 * area to be wiped clean. To restart a write operation in this
179 * case, we must re-copy the data from the user's iovec to a NEW
180 * shared memory location. To restart a read operation, we must get
181 * a new shared memory location.
182 */
183 if (ret == -EAGAIN && op_state_purged(new_op)) {
184 pvfs_bufmap_put(bufmap, buffer_index);
185 gossip_debug(GOSSIP_FILE_DEBUG,
186 "%s:going to repopulate_shared_memory.\n",
187 __func__);
188 goto populate_shared_memory;
189 }
190
191 if (ret < 0) {
192 handle_io_error(); /* defined in pvfs2-kernel.h */
193 /*
54804949
MM
194 * don't write an error to syslog on signaled operation
195 * termination unless we've got debugging turned on, as
196 * this can happen regularly (i.e. ctrl-c)
5db11c21
MM
197 */
198 if (ret == -EINTR)
199 gossip_debug(GOSSIP_FILE_DEBUG,
200 "%s: returning error %ld\n", __func__,
201 (long)ret);
202 else
203 gossip_err("%s: error in %s handle %pU, returning %zd\n",
204 __func__,
205 type == PVFS_IO_READ ?
206 "read from" : "write to",
207 handle, ret);
208 goto out;
209 }
210
211 /*
212 * Stage 3: Post copy buffers from client-core's address space
213 * postcopy_buffers only pertains to reads.
214 */
215 if (type == PVFS_IO_READ) {
5f0e3c95
AV
216 struct iov_iter iter;
217 iov_iter_init(&iter, READ, vec, nr_segs, new_op->downcall.resp.io.amt_complete);
5db11c21
MM
218 ret = postcopy_buffers(bufmap,
219 buffer_index,
5f0e3c95 220 &iter,
4d1c4404 221 new_op->downcall.resp.io.amt_complete);
5db11c21
MM
222 if (ret < 0) {
223 /*
224 * put error codes in downcall so that handle_io_error()
225 * preserves it properly
226 */
227 new_op->downcall.status = ret;
228 handle_io_error();
229 goto out;
230 }
231 }
232 gossip_debug(GOSSIP_FILE_DEBUG,
233 "%s(%pU): Amount written as returned by the sys-io call:%d\n",
234 __func__,
235 handle,
236 (int)new_op->downcall.resp.io.amt_complete);
237
238 ret = new_op->downcall.resp.io.amt_complete;
239
240 /*
54804949
MM
241 * tell the device file owner waiting on I/O that this read has
242 * completed and it can return now. in this exact case, on
243 * wakeup the daemon will free the op, so we *cannot* touch it
244 * after this.
5db11c21
MM
245 */
246 wake_up_daemon_for_return(new_op);
247 new_op = NULL;
248
249out:
250 if (buffer_index >= 0) {
251 pvfs_bufmap_put(bufmap, buffer_index);
252 gossip_debug(GOSSIP_FILE_DEBUG,
253 "%s(%pU): PUT buffer_index %d\n",
254 __func__, handle, buffer_index);
255 buffer_index = -1;
256 }
257 if (new_op) {
258 op_release(new_op);
259 new_op = NULL;
260 }
261 return ret;
262}
263
264/*
265 * The reason we need to do this is to be able to support readv and writev
266 * that are larger than (pvfs_bufmap_size_query()) Default is
267 * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
268 * create a new io vec descriptor for those memory addresses that
269 * go beyond the limit. Return value for this routine is negative in case
270 * of errors and 0 in case of success.
271 *
272 * Further, the new_nr_segs pointer is updated to hold the new value
273 * of number of iovecs, the new_vec pointer is updated to hold the pointer
274 * to the new split iovec, and the size array is an array of integers holding
275 * the number of iovecs that straddle pvfs_bufmap_size_query().
276 * The max_new_nr_segs value is computed by the caller and returned.
277 * (It will be (count of all iov_len/ block_size) + 1).
278 */
279static int split_iovecs(unsigned long max_new_nr_segs, /* IN */
280 unsigned long nr_segs, /* IN */
281 const struct iovec *original_iovec, /* IN */
282 unsigned long *new_nr_segs, /* OUT */
283 struct iovec **new_vec, /* OUT */
284 unsigned long *seg_count, /* OUT */
285 unsigned long **seg_array) /* OUT */
286{
287 unsigned long seg;
288 unsigned long count = 0;
289 unsigned long begin_seg;
290 unsigned long tmpnew_nr_segs = 0;
291 struct iovec *new_iovec = NULL;
292 struct iovec *orig_iovec;
293 unsigned long *sizes = NULL;
294 unsigned long sizes_count = 0;
295
296 if (nr_segs <= 0 ||
297 original_iovec == NULL ||
298 new_nr_segs == NULL ||
299 new_vec == NULL ||
300 seg_count == NULL ||
301 seg_array == NULL ||
302 max_new_nr_segs <= 0) {
303 gossip_err("Invalid parameters to split_iovecs\n");
304 return -EINVAL;
305 }
306 *new_nr_segs = 0;
307 *new_vec = NULL;
308 *seg_count = 0;
309 *seg_array = NULL;
310 /* copy the passed in iovec descriptor to a temp structure */
311 orig_iovec = kmalloc_array(nr_segs,
312 sizeof(*orig_iovec),
313 PVFS2_BUFMAP_GFP_FLAGS);
314 if (orig_iovec == NULL) {
315 gossip_err(
316 "split_iovecs: Could not allocate memory for %lu bytes!\n",
317 (unsigned long)(nr_segs * sizeof(*orig_iovec)));
318 return -ENOMEM;
319 }
320 new_iovec = kcalloc(max_new_nr_segs,
321 sizeof(*new_iovec),
322 PVFS2_BUFMAP_GFP_FLAGS);
323 if (new_iovec == NULL) {
324 kfree(orig_iovec);
325 gossip_err(
326 "split_iovecs: Could not allocate memory for %lu bytes!\n",
327 (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
328 return -ENOMEM;
329 }
330 sizes = kcalloc(max_new_nr_segs,
331 sizeof(*sizes),
332 PVFS2_BUFMAP_GFP_FLAGS);
333 if (sizes == NULL) {
334 kfree(new_iovec);
335 kfree(orig_iovec);
336 gossip_err(
337 "split_iovecs: Could not allocate memory for %lu bytes!\n",
338 (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
339 return -ENOMEM;
340 }
341 /* copy the passed in iovec to a temp structure */
342 memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
343 begin_seg = 0;
344repeat:
345 for (seg = begin_seg; seg < nr_segs; seg++) {
346 if (tmpnew_nr_segs >= max_new_nr_segs ||
347 sizes_count >= max_new_nr_segs) {
348 kfree(sizes);
349 kfree(orig_iovec);
350 kfree(new_iovec);
351 gossip_err
352 ("split_iovecs: exceeded the index limit (%lu)\n",
353 tmpnew_nr_segs);
354 return -EINVAL;
355 }
356 if (count + orig_iovec[seg].iov_len <
357 pvfs_bufmap_size_query()) {
358 count += orig_iovec[seg].iov_len;
359 memcpy(&new_iovec[tmpnew_nr_segs],
360 &orig_iovec[seg],
361 sizeof(*new_iovec));
362 tmpnew_nr_segs++;
363 sizes[sizes_count]++;
364 } else {
365 new_iovec[tmpnew_nr_segs].iov_base =
366 orig_iovec[seg].iov_base;
367 new_iovec[tmpnew_nr_segs].iov_len =
368 (pvfs_bufmap_size_query() - count);
369 tmpnew_nr_segs++;
370 sizes[sizes_count]++;
371 sizes_count++;
372 begin_seg = seg;
373 orig_iovec[seg].iov_base +=
374 (pvfs_bufmap_size_query() - count);
375 orig_iovec[seg].iov_len -=
376 (pvfs_bufmap_size_query() - count);
377 count = 0;
378 break;
379 }
380 }
381 if (seg != nr_segs)
382 goto repeat;
383 else
384 sizes_count++;
385
386 *new_nr_segs = tmpnew_nr_segs;
387 /* new_iovec is freed by the caller */
388 *new_vec = new_iovec;
389 *seg_count = sizes_count;
390 /* seg_array is also freed by the caller */
391 *seg_array = sizes;
392 kfree(orig_iovec);
393 return 0;
394}
395
396static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
397 ssize_t *total_count)
398{
399 unsigned long i;
400 long max_nr_iovecs;
401 ssize_t total;
402 ssize_t count;
403
404 total = 0;
405 count = 0;
406 max_nr_iovecs = 0;
407 for (i = 0; i < nr_segs; i++) {
408 const struct iovec *iv = &curr[i];
409
410 count += iv->iov_len;
411 if (unlikely((ssize_t) (count | iv->iov_len) < 0))
412 return -EINVAL;
413 if (total + iv->iov_len < pvfs_bufmap_size_query()) {
414 total += iv->iov_len;
415 max_nr_iovecs++;
416 } else {
417 total =
418 (total + iv->iov_len - pvfs_bufmap_size_query());
419 max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
420 }
421 }
422 *total_count = count;
423 return max_nr_iovecs;
424}
425
426/*
427 * Common entry point for read/write/readv/writev
428 * This function will dispatch it to either the direct I/O
429 * or buffered I/O path depending on the mount options and/or
430 * augmented/extended metadata attached to the file.
431 * Note: File extended attributes override any mount options.
432 */
433static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
434 loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
435{
436 struct inode *inode = file->f_mapping->host;
437 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
438 struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
439 ssize_t ret;
440 ssize_t total_count;
441 unsigned int to_free;
442 size_t count;
443 unsigned long seg;
eeaa3d44
MM
444 unsigned long new_nr_segs;
445 unsigned long max_new_nr_segs;
446 unsigned long seg_count;
447 unsigned long *seg_array;
448 struct iovec *iovecptr;
449 struct iovec *ptr;
5db11c21
MM
450
451 total_count = 0;
452 ret = -EINVAL;
453 count = 0;
454 to_free = 0;
455
456 /* Compute total and max number of segments after split */
457 max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
5db11c21
MM
458
459 gossip_debug(GOSSIP_FILE_DEBUG,
460 "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
461 __func__,
462 handle,
463 (int)count);
464
465 if (type == PVFS_IO_WRITE) {
466 gossip_debug(GOSSIP_FILE_DEBUG,
467 "%s(%pU): proceeding with offset : %llu, "
468 "size %d\n",
469 __func__,
470 handle,
471 llu(*offset),
472 (int)count);
473 }
474
475 if (count == 0) {
476 ret = 0;
477 goto out;
478 }
479
480 /*
481 * if the total size of data transfer requested is greater than
482 * the kernel-set blocksize of PVFS2, then we split the iovecs
483 * such that no iovec description straddles a block size limit
484 */
485
486 gossip_debug(GOSSIP_FILE_DEBUG,
487 "%s: pvfs_bufmap_size:%d\n",
488 __func__,
489 pvfs_bufmap_size_query());
490
491 if (count > pvfs_bufmap_size_query()) {
492 /*
493 * Split up the given iovec description such that
494 * no iovec descriptor straddles over the block-size limitation.
495 * This makes us our job easier to stage the I/O.
496 * In addition, this function will also compute an array
497 * with seg_count entries that will store the number of
498 * segments that straddle the block-size boundaries.
499 */
500 ret = split_iovecs(max_new_nr_segs, /* IN */
501 nr_segs, /* IN */
502 iov, /* IN */
503 &new_nr_segs, /* OUT */
504 &iovecptr, /* OUT */
505 &seg_count, /* OUT */
506 &seg_array); /* OUT */
507 if (ret < 0) {
508 gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
509 __func__,
510 ret);
511 goto out;
512 }
513 gossip_debug(GOSSIP_FILE_DEBUG,
514 "%s: Splitting iovecs from %lu to %lu"
515 " [max_new %lu]\n",
516 __func__,
517 nr_segs,
518 new_nr_segs,
519 max_new_nr_segs);
520 /* We must free seg_array and iovecptr */
521 to_free = 1;
522 } else {
523 new_nr_segs = nr_segs;
524 /* use the given iovec description */
525 iovecptr = (struct iovec *)iov;
526 /* There is only 1 element in the seg_array */
527 seg_count = 1;
528 /* and its value is the number of segments passed in */
529 seg_array = &nr_segs;
530 /* We dont have to free up anything */
531 to_free = 0;
532 }
533 ptr = iovecptr;
534
535 gossip_debug(GOSSIP_FILE_DEBUG,
536 "%s(%pU) %zd@%llu\n",
537 __func__,
538 handle,
539 count,
540 llu(*offset));
541 gossip_debug(GOSSIP_FILE_DEBUG,
542 "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
543 __func__,
544 handle,
545 new_nr_segs, seg_count);
546
547/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
548#ifdef PVFS2_KERNEL_DEBUG
549 for (seg = 0; seg < new_nr_segs; seg++)
550 gossip_debug(GOSSIP_FILE_DEBUG,
551 "%s: %d) %p to %p [%d bytes]\n",
552 __func__,
553 (int)seg + 1,
554 iovecptr[seg].iov_base,
555 iovecptr[seg].iov_base + iovecptr[seg].iov_len,
556 (int)iovecptr[seg].iov_len);
557 for (seg = 0; seg < seg_count; seg++)
558 gossip_debug(GOSSIP_FILE_DEBUG,
559 "%s: %zd) %lu\n",
560 __func__,
561 seg + 1,
562 seg_array[seg]);
563#endif
564 seg = 0;
565 while (total_count < count) {
566 size_t each_count;
567 size_t amt_complete;
568
569 /* how much to transfer in this loop iteration */
570 each_count =
571 (((count - total_count) > pvfs_bufmap_size_query()) ?
572 pvfs_bufmap_size_query() :
573 (count - total_count));
574
575 gossip_debug(GOSSIP_FILE_DEBUG,
576 "%s(%pU): size of each_count(%d)\n",
577 __func__,
578 handle,
579 (int)each_count);
580 gossip_debug(GOSSIP_FILE_DEBUG,
581 "%s(%pU): BEFORE wait_for_io: offset is %d\n",
582 __func__,
583 handle,
584 (int)*offset);
585
586 ret = wait_for_direct_io(type, inode, offset, ptr,
4d1c4404 587 seg_array[seg], each_count, 0);
5db11c21
MM
588 gossip_debug(GOSSIP_FILE_DEBUG,
589 "%s(%pU): return from wait_for_io:%d\n",
590 __func__,
591 handle,
592 (int)ret);
593
594 if (ret < 0)
595 goto out;
596
597 /* advance the iovec pointer */
598 ptr += seg_array[seg];
599 seg++;
600 *offset += ret;
601 total_count += ret;
602 amt_complete = ret;
603
604 gossip_debug(GOSSIP_FILE_DEBUG,
605 "%s(%pU): AFTER wait_for_io: offset is %d\n",
606 __func__,
607 handle,
608 (int)*offset);
609
610 /*
611 * if we got a short I/O operations,
612 * fall out and return what we got so far
613 */
614 if (amt_complete < each_count)
615 break;
616 } /*end while */
617
618 if (total_count > 0)
619 ret = total_count;
620out:
621 if (to_free) {
622 kfree(iovecptr);
623 kfree(seg_array);
624 }
625 if (ret > 0) {
626 if (type == PVFS_IO_READ) {
627 file_accessed(file);
628 } else {
629 SetMtimeFlag(pvfs2_inode);
630 inode->i_mtime = CURRENT_TIME;
631 mark_inode_dirty_sync(inode);
632 }
633 }
634
635 gossip_debug(GOSSIP_FILE_DEBUG,
636 "%s(%pU): Value(%d) returned.\n",
637 __func__,
638 handle,
639 (int)ret);
640
641 return ret;
642}
643
644/*
645 * Read data from a specified offset in a file (referenced by inode).
646 * Data may be placed either in a user or kernel buffer.
647 */
648ssize_t pvfs2_inode_read(struct inode *inode,
649 char __user *buf,
650 size_t count,
651 loff_t *offset,
652 loff_t readahead_size)
653{
654 struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
655 size_t bufmap_size;
656 struct iovec vec;
657 ssize_t ret = -EINVAL;
658
659 g_pvfs2_stats.reads++;
660
661 vec.iov_base = buf;
662 vec.iov_len = count;
663
664 bufmap_size = pvfs_bufmap_size_query();
665 if (count > bufmap_size) {
666 gossip_debug(GOSSIP_FILE_DEBUG,
667 "%s: count is too large (%zd/%zd)!\n",
668 __func__, count, bufmap_size);
669 return -EINVAL;
670 }
671
672 gossip_debug(GOSSIP_FILE_DEBUG,
673 "%s(%pU) %zd@%llu\n",
674 __func__,
675 &pvfs2_inode->refn.khandle,
676 count,
677 llu(*offset));
678
679 ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
4d1c4404 680 count, readahead_size);
5db11c21
MM
681 if (ret > 0)
682 *offset += ret;
683
684 gossip_debug(GOSSIP_FILE_DEBUG,
685 "%s(%pU): Value(%zd) returned.\n",
686 __func__,
687 &pvfs2_inode->refn.khandle,
688 ret);
689
690 return ret;
691}
692
693static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
694{
695 struct file *file = iocb->ki_filp;
696 loff_t pos = *(&iocb->ki_pos);
697 ssize_t rc = 0;
698 unsigned long nr_segs = iter->nr_segs;
699
700 BUG_ON(iocb->private);
701
702 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
703
704 g_pvfs2_stats.reads++;
705
706 rc = do_readv_writev(PVFS_IO_READ,
707 file,
708 &pos,
709 iter->iov,
710 nr_segs);
711 iocb->ki_pos = pos;
712
713 return rc;
714}
715
716static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
717{
718 struct file *file = iocb->ki_filp;
719 loff_t pos = *(&iocb->ki_pos);
720 unsigned long nr_segs = iter->nr_segs;
721 ssize_t rc;
722
723 BUG_ON(iocb->private);
724
725 gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
726
727 mutex_lock(&file->f_mapping->host->i_mutex);
728
729 /* Make sure generic_write_checks sees an up to date inode size. */
730 if (file->f_flags & O_APPEND) {
731 rc = pvfs2_inode_getattr(file->f_mapping->host,
732 PVFS_ATTR_SYS_SIZE);
733 if (rc) {
734 gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
735 __func__, rc);
736 goto out;
737 }
738 }
739
740 if (file->f_pos > i_size_read(file->f_mapping->host))
741 pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
742
743 rc = generic_write_checks(iocb, iter);
744
745 if (rc <= 0) {
746 gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
747 __func__, rc);
748 goto out;
749 }
750
751 rc = do_readv_writev(PVFS_IO_WRITE,
752 file,
753 &pos,
754 iter->iov,
755 nr_segs);
756 if (rc < 0) {
757 gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
758 __func__, rc);
759 goto out;
760 }
761
762 iocb->ki_pos = pos;
763 g_pvfs2_stats.writes++;
764
765out:
766
767 mutex_unlock(&file->f_mapping->host->i_mutex);
768 return rc;
769}
770
771/*
772 * Perform a miscellaneous operation on a file.
773 */
84d02150 774static long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5db11c21
MM
775{
776 int ret = -ENOTTY;
777 __u64 val = 0;
778 unsigned long uval;
779
780 gossip_debug(GOSSIP_FILE_DEBUG,
781 "pvfs2_ioctl: called with cmd %d\n",
782 cmd);
783
784 /*
785 * we understand some general ioctls on files, such as the immutable
786 * and append flags
787 */
788 if (cmd == FS_IOC_GETFLAGS) {
789 val = 0;
790 ret = pvfs2_xattr_get_default(file->f_path.dentry,
791 "user.pvfs2.meta_hint",
792 &val,
793 sizeof(val),
794 0);
795 if (ret < 0 && ret != -ENODATA)
796 return ret;
797 else if (ret == -ENODATA)
798 val = 0;
799 uval = val;
800 gossip_debug(GOSSIP_FILE_DEBUG,
801 "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
802 (unsigned long long)uval);
803 return put_user(uval, (int __user *)arg);
804 } else if (cmd == FS_IOC_SETFLAGS) {
805 ret = 0;
806 if (get_user(uval, (int __user *)arg))
807 return -EFAULT;
808 /*
809 * PVFS_MIRROR_FL is set internally when the mirroring mode
810 * is turned on for a file. The user is not allowed to turn
811 * on this bit, but the bit is present if the user first gets
812 * the flags and then updates the flags with some new
813 * settings. So, we ignore it in the following edit. bligon.
814 */
815 if ((uval & ~PVFS_MIRROR_FL) &
816 (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
817 gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
818 return -EINVAL;
819 }
820 val = uval;
821 gossip_debug(GOSSIP_FILE_DEBUG,
822 "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
823 (unsigned long long)val);
824 ret = pvfs2_xattr_set_default(file->f_path.dentry,
825 "user.pvfs2.meta_hint",
826 &val,
827 sizeof(val),
828 0,
829 0);
830 }
831
832 return ret;
833}
834
835/*
836 * Memory map a region of a file.
837 */
838static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
839{
840 gossip_debug(GOSSIP_FILE_DEBUG,
841 "pvfs2_file_mmap: called on %s\n",
842 (file ?
843 (char *)file->f_path.dentry->d_name.name :
844 (char *)"Unknown"));
845
846 /* set the sequential readahead hint */
847 vma->vm_flags |= VM_SEQ_READ;
848 vma->vm_flags &= ~VM_RAND_READ;
35390803
MB
849
850 /* Use readonly mmap since we cannot support writable maps. */
851 return generic_file_readonly_mmap(file, vma);
5db11c21
MM
852}
853
/* number of pages currently held in an address_space's page cache */
#define mapping_nrpages(idata) ((idata)->nrpages)
855
856/*
857 * Called to notify the module that there are no more references to
858 * this file (i.e. no processes have it open).
859 *
860 * \note Not called when each file is closed.
861 */
84d02150 862static int pvfs2_file_release(struct inode *inode, struct file *file)
5db11c21
MM
863{
864 gossip_debug(GOSSIP_FILE_DEBUG,
865 "pvfs2_file_release: called on %s\n",
866 file->f_path.dentry->d_name.name);
867
868 pvfs2_flush_inode(inode);
869
870 /*
54804949
MM
871 * remove all associated inode pages from the page cache and mmap
872 * readahead cache (if any); this forces an expensive refresh of
873 * data for the next caller of mmap (or 'get_block' accesses)
5db11c21
MM
874 */
875 if (file->f_path.dentry->d_inode &&
876 file->f_path.dentry->d_inode->i_mapping &&
877 mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
878 truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
879 0);
880 return 0;
881}
882
883/*
884 * Push all data for a specific file onto permanent storage.
885 */
84d02150
MM
886static int pvfs2_fsync(struct file *file,
887 loff_t start,
888 loff_t end,
889 int datasync)
5db11c21
MM
890{
891 int ret = -EINVAL;
892 struct pvfs2_inode_s *pvfs2_inode =
893 PVFS2_I(file->f_path.dentry->d_inode);
894 struct pvfs2_kernel_op_s *new_op = NULL;
895
896 /* required call */
897 filemap_write_and_wait_range(file->f_mapping, start, end);
898
899 new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
900 if (!new_op)
901 return -ENOMEM;
902 new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
903
904 ret = service_operation(new_op,
905 "pvfs2_fsync",
906 get_interruptible_flag(file->f_path.dentry->d_inode));
907
908 gossip_debug(GOSSIP_FILE_DEBUG,
909 "pvfs2_fsync got return value of %d\n",
910 ret);
911
912 op_release(new_op);
913
914 pvfs2_flush_inode(file->f_path.dentry->d_inode);
915 return ret;
916}
917
918/*
919 * Change the file pointer position for an instance of an open file.
920 *
921 * \note If .llseek is overriden, we must acquire lock as described in
922 * Documentation/filesystems/Locking.
923 *
924 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
925 * require much changes to the FS
926 */
84d02150 927static loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
5db11c21
MM
928{
929 int ret = -EINVAL;
930 struct inode *inode = file->f_path.dentry->d_inode;
931
932 if (!inode) {
933 gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
934 return ret;
935 }
936
937 if (origin == PVFS2_SEEK_END) {
938 /*
939 * revalidate the inode's file size.
940 * NOTE: We are only interested in file size here,
941 * so we set mask accordingly.
942 */
943 ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
944 if (ret) {
945 gossip_debug(GOSSIP_FILE_DEBUG,
946 "%s:%s:%d calling make bad inode\n",
947 __FILE__,
948 __func__,
949 __LINE__);
950 pvfs2_make_bad_inode(inode);
951 return ret;
952 }
953 }
954
955 gossip_debug(GOSSIP_FILE_DEBUG,
54804949
MM
956 "pvfs2_file_llseek: offset is %ld | origin is %d"
957 " | inode size is %lu\n",
5db11c21
MM
958 (long)offset,
959 origin,
960 (unsigned long)file->f_path.dentry->d_inode->i_size);
961
962 return generic_file_llseek(file, offset, origin);
963}
964
965/*
966 * Support local locks (locks that only this kernel knows about)
967 * if Orangefs was mounted -o local_lock.
968 */
84d02150 969static int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
5db11c21 970{
f957ae2d 971 int rc = -EINVAL;
5db11c21
MM
972
973 if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
974 if (cmd == F_GETLK) {
975 rc = 0;
976 posix_test_lock(filp, fl);
977 } else {
978 rc = posix_lock_file(filp, fl, NULL);
979 }
980 }
981
982 return rc;
983}
984
/** PVFS2 implementation of VFS file operations */
/* No .read/.write entries: all I/O goes through the *_iter methods above. */
const struct file_operations pvfs2_file_operations = {
	.llseek = pvfs2_file_llseek,
	.read_iter = pvfs2_file_read_iter,
	.write_iter = pvfs2_file_write_iter,
	.lock = pvfs2_lock,
	.unlocked_ioctl = pvfs2_ioctl,
	.mmap = pvfs2_file_mmap,
	.open = generic_file_open,
	.release = pvfs2_file_release,
	.fsync = pvfs2_fsync,
};