fuse: on 64-bit store time in d_fsdata directly
[linux-2.6-block.git] / fs / fuse / file.c
/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>

static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
				      struct fuse_page_desc **desc)
{
	struct page **pages;

	pages = kzalloc(npages * (sizeof(struct page *) +
				  sizeof(struct fuse_page_desc)), flags);
	*desc = (void *) (pages + npages);

	return pages;
}
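/*
 * Note (editorial, not in the upstream file): fuse_pages_alloc() makes a
 * single allocation holding both the page-pointer array and the per-page
 * descriptor array, laid out back to back:
 *
 *	pages[0] .. pages[npages-1] | desc[0] .. desc[npages-1]
 *
 * That is why "*desc = (void *) (pages + npages)" points just past the
 * page pointers, and why one kfree() of the pages array frees both.
 */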
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
			  int opcode, struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	if (!fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;
	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fc, &args);
}

struct fuse_release_args {
	struct fuse_args args;
	struct fuse_release_in inarg;
	struct inode *inode;
};

struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
{
	struct fuse_file *ff;

	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
	if (unlikely(!ff))
		return NULL;

	ff->fc = fc;
	ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL);
	if (!ff->release_args) {
		kfree(ff);
		return NULL;
	}

	INIT_LIST_HEAD(&ff->write_entry);
	mutex_init(&ff->readdir.lock);
	refcount_set(&ff->count, 1);
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

	ff->kh = atomic64_inc_return(&fc->khctr);

	return ff;
}

void fuse_file_free(struct fuse_file *ff)
{
	kfree(ff->release_args);
	mutex_destroy(&ff->readdir.lock);
	kfree(ff);
}

static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
			     int error)
{
	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);

	iput(ra->inode);
	kfree(ra);
}

static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
	if (refcount_dec_and_test(&ff->count)) {
		struct fuse_args *args = &ff->release_args->args;

		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
			/* Do nothing when client does not implement 'open' */
			fuse_release_end(ff->fc, args, 0);
		} else if (sync) {
			fuse_simple_request(ff->fc, args);
			fuse_release_end(ff->fc, args, 0);
		} else {
			args->end = fuse_release_end;
			if (fuse_simple_background(ff->fc, args,
						   GFP_KERNEL | __GFP_NOFAIL))
				fuse_release_end(ff->fc, args, -ENOTCONN);
		}
		kfree(ff);
	}
}
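/*
 * Note (editorial): on the final reference drop, fuse_file_put() picks one
 * of three RELEASE strategies: skip the request entirely when the server
 * never implemented open (no_open/no_opendir), send it synchronously when
 * the caller demands it, or queue it in the background, where
 * __GFP_NOFAIL keeps the request from being lost under memory pressure.
 */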
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;

	ff = fuse_file_alloc(fc);
	if (!ff)
		return -ENOMEM;

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (isdir ? !fc->no_opendir : !fc->no_open) {
		struct fuse_open_out outarg;
		int err;

		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
		if (!err) {
			ff->fh = outarg.fh;
			ff->open_flags = outarg.open_flags;
		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return err;
		} else {
			if (isdir)
				fc->no_opendir = 1;
			else
				fc->no_open = 1;
		}
	}

	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;
	file->private_data = ff;

	return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);
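/*
 * Note (editorial): -ENOSYS from FUSE_OPEN/FUSE_OPENDIR is cached in
 * fc->no_open/fc->no_opendir, so later opens on this connection skip the
 * round trip and use a local "no-open" fuse_file (fh == 0) instead.
 * Illustrative sequence (paths and values hypothetical):
 *
 *	open("/mnt/a") -> FUSE_OPEN -> -ENOSYS	(fc->no_open = 1)
 *	open("/mnt/b") -> no request sent, ff->fh stays 0
 */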
static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * file may be written through mmap, so chain it onto the
	 * inode's write_files list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

void fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
		invalidate_inode_pages2(inode->i_mapping);
	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);
	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		spin_lock(&fi->lock);
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, 0);
		spin_unlock(&fi->lock);
		fuse_invalidate_attr(inode);
		if (fc->writeback_cache)
			file_update_time(file);
	}
	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);
}
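/*
 * Note (editorial): with fc->atomic_o_trunc the server performs the
 * truncation as part of FUSE_OPEN itself, so the O_TRUNC case above only
 * mirrors the result locally: bump attr_version and zero i_size under
 * fi->lock instead of issuing a separate setattr request.
 */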
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;
	bool lock_inode = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc &&
			  fc->writeback_cache;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (lock_inode)
		inode_lock(inode);

	err = fuse_do_open(fc, get_node_id(inode), file, isdir);

	if (!err)
		fuse_finish_open(inode, file);

	if (lock_inode)
		inode_unlock(inode);

	return err;
}
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 int flags, int opcode)
{
	struct fuse_conn *fc = ff->fc;
	struct fuse_release_args *ra = ff->release_args;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;
}

void fuse_release_common(struct file *file, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
	struct fuse_file *ff = file->private_data;
	struct fuse_release_args *ra = ff->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, file->f_flags, opcode);

	if (ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
							  (fl_owner_t) file);
	}
	/* Hold inode until release is finished */
	ra->inode = igrab(file_inode(file));

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount:
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fc->destroy, isdir);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}
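/*
 * Note (editorial): the loop above is a plain 32-round XTEA encryption
 * (delta 0x9E3779B9) of the 64-bit pointer value, keyed by the random
 * per-connection fc->scramble_key.  Since encryption is a bijection of
 * the 64-bit space, distinct lock owners can never collide in the
 * scrambled ID space, unlike with a hash.
 */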
struct fuse_writepage_args {
	struct fuse_io_args ia;
	struct list_head writepages_entry;
	struct list_head queue_entry;
	struct fuse_writepage_args *next;
	struct inode *inode;
};

static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
					pgoff_t idx_from, pgoff_t idx_to)
{
	struct fuse_writepage_args *wpa;

	list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
		pgoff_t curr_index;

		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
		if (idx_from < curr_index + wpa->ia.ap.num_pages &&
		    curr_index <= idx_to) {
			return wpa;
		}
	}
	return NULL;
}

/*
 * Check if any page in a range is under writeback
 *
 * This is currently done by walking the list of writepage requests
 * for the inode, which can be pretty inefficient.
 */
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
				    pgoff_t idx_to)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool found;

	spin_lock(&fi->lock);
	found = fuse_find_writeback(fi, idx_from, idx_to);
	spin_unlock(&fi->lock);

	return found;
}

static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}

/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}

/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}
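/*
 * Note (editorial): fuse_set_nowrite() drives fi->writectr negative,
 * which blocks fuse_flush_writepages() from queueing further WRITE
 * requests, and waits for the in-flight ones to complete;
 * fuse_release_nowrite() then restores writectr and re-flushes.  The
 * back-to-back pair therefore acts as a barrier over pending writepages.
 */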
static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	if (fc->no_flush)
		return 0;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_flush = 1;
		err = 0;
	}
	return err;
}

int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fc, &args);
}

static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to the implementation of fuse writeback,
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}
void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;

	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(ia->read.in);
	args->in_args[0].value = &ia->read.in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

static void fuse_release_user_pages(struct fuse_args_pages *ap,
				    bool should_dirty)
{
	unsigned int i;

	for (i = 0; i < ap->num_pages; i++) {
		if (should_dirty)
			set_page_dirty_lock(ap->pages[i]);
		put_page(ap->pages[i]);
	}
}

static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->bytes >= 0 && io->write)
		return -EIO;

	return io->bytes < 0 ? io->size : io->bytes;
}

/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	kref_put(&io->refcnt, fuse_io_release);
}
static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
					  unsigned int npages)
{
	struct fuse_io_args *ia;

	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
	if (ia) {
		ia->io = io;
		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
						&ia->ap.descs);
		if (!ia->ap.pages) {
			kfree(ia);
			ia = NULL;
		}
	}
	return ia;
}

static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.pages);
	kfree(ia);
}

static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
				  int err)
{
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_io_priv *io = ia->io;
	ssize_t pos = -1;

	fuse_release_user_pages(&ia->ap, io->should_dirty);

	if (err) {
		/* Nothing */
	} else if (io->write) {
		if (ia->write.out.size > ia->write.in.size) {
			err = -EIO;
		} else if (ia->write.in.size != ia->write.out.size) {
			pos = ia->write.in.offset - io->offset +
				ia->write.out.size;
		}
	} else {
		u32 outsize = args->out_args[0].size;

		if (ia->read.in.size != outsize)
			pos = ia->read.in.offset - io->offset + outsize;
	}

	fuse_aio_complete(io, err, pos);
	fuse_io_free(ia);
}

static ssize_t fuse_async_req_send(struct fuse_conn *fc,
				   struct fuse_io_args *ia, size_t num_bytes)
{
	ssize_t err;
	struct fuse_io_priv *io = ia->io;

	spin_lock(&io->lock);
	kref_get(&io->refcnt);
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	ia->ap.args.end = fuse_aio_complete_req;
	err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);

	return err ?: num_bytes;
}

static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
			      fl_owner_t owner)
{
	struct file *file = ia->io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
		ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fc, ia, count);

	return fuse_simple_request(fc, &ia->ap.args);
}

static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}
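/*
 * Note (editorial): the attr_version check above is what makes shrinking
 * i_size after a short read safe: if any attribute update (for example a
 * concurrent extending write reply) happened since this read was issued,
 * fi->attr_version has already moved on and the stale size is ignored.
 */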
static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (fc->writeback_cache) {
		/*
		 * A hole in a file. Some data after the hole are in page cache,
		 * but have not reached the client fs yet. So, the hole is not
		 * present there.
		 */
		int i;
		int start_idx = num_read >> PAGE_SHIFT;
		size_t off = num_read & (PAGE_SIZE - 1);

		for (i = start_idx; i < ap->num_pages; i++) {
			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
			off = 0;
		}
	} else {
		loff_t pos = page_offset(ap->pages[0]) + num_read;
		fuse_read_update_size(inode, pos, attr_ver);
	}
}

static int fuse_do_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	loff_t pos = page_offset(page);
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	/*
	 * Page writeback can extend beyond the lifetime of the
	 * page-cache page, so make sure we read a properly synced
	 * page.
	 */
	fuse_wait_on_page_writeback(inode, page->index);

	attr_ver = fuse_get_attr_version(fc);

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fc, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF. If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, &ia.ap);

	SetPageUptodate(page);

	return 0;
}

static int fuse_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	err = fuse_do_readpage(file, page);
	fuse_invalidate_atime(inode);
 out:
	unlock_page(page);
	return err;
}

static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
			       int err)
{
	int i;
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_args_pages *ap = &ia->ap;
	size_t count = ia->read.in.size;
	size_t num_read = args->out_args[0].size;
	struct address_space *mapping = NULL;

	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
		mapping = ap->pages[i]->mapping;

	if (mapping) {
		struct inode *inode = mapping->host;

		/*
		 * Short read means EOF. If file size is larger, truncate it
		 */
		if (!err && num_read < count)
			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);

		fuse_invalidate_atime(inode);
	}

	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
		put_page(page);
	}
	if (ia->ff)
		fuse_file_put(ia->ff, false, false);

	fuse_io_free(ia);
}

static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_args_pages *ap = &ia->ap;
	loff_t pos = page_offset(ap->pages[0]);
	size_t count = ap->num_pages << PAGE_SHIFT;
	int err;

	ap->args.out_pages = true;
	ap->args.page_zeroing = true;
	ap->args.page_replace = true;
	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fc);
	if (fc->async_read) {
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
		if (!err)
			return;
	} else {
		err = fuse_simple_request(fc, &ap->args);
	}
	fuse_readpages_end(fc, &ap->args, err);
}
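/*
 * Note (editorial): readahead is batched into one FUSE_READ per run of
 * contiguous pages, capped by fc->max_pages and fc->max_read.  With
 * fc->async_read the reply is handled by fuse_readpages_end() via the
 * args->end callback; otherwise the request is sent synchronously and
 * the same completion helper is invoked inline.
 */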
struct fuse_fill_data {
	struct fuse_io_args *ia;
	struct file *file;
	struct inode *inode;
	unsigned int nr_pages;
	unsigned int max_pages;
};

static int fuse_readpages_fill(void *_data, struct page *page)
{
	struct fuse_fill_data *data = _data;
	struct fuse_io_args *ia = data->ia;
	struct fuse_args_pages *ap = &ia->ap;
	struct inode *inode = data->inode;
	struct fuse_conn *fc = get_fuse_conn(inode);

	fuse_wait_on_page_writeback(inode, page->index);

	if (ap->num_pages &&
	    (ap->num_pages == fc->max_pages ||
	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
	     ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
		data->max_pages = min_t(unsigned int, data->nr_pages,
					fc->max_pages);
		fuse_send_readpages(ia, data->file);
		data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
		if (!ia) {
			unlock_page(page);
			return -ENOMEM;
		}
		ap = &ia->ap;
	}

	if (WARN_ON(ap->num_pages >= data->max_pages)) {
		unlock_page(page);
		fuse_io_free(ia);
		return -EIO;
	}

	get_page(page);
	ap->pages[ap->num_pages] = page;
	ap->descs[ap->num_pages].length = PAGE_SIZE;
	ap->num_pages++;
	data->nr_pages--;
	return 0;
}

static int fuse_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *pages, unsigned nr_pages)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_data data;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.file = file;
	data.inode = inode;
	data.nr_pages = nr_pages;
	data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
	data.ia = fuse_io_alloc(NULL, data.max_pages);
	err = -ENOMEM;
	if (!data.ia)
		goto out;

	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
	if (!err) {
		if (data.ia->ap.num_pages)
			fuse_send_readpages(data.ia, file);
		else
			fuse_io_free(data.ia);
	}
out:
	return err;
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;
		err = fuse_update_attributes(inode, iocb->ki_filp);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}
static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int flags = iocb->ki_filp->f_flags;

	if (iocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
	if (iocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;

	return flags;
}

static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
			       size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = ia->io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_write_in *inarg = &ia->write.in;
	ssize_t err;

	fuse_write_args_fill(ia, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fc, ia, count);

	err = fuse_simple_request(fc, &ia->ap.args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	return err ?: ia->write.out.size;
}

bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	return ret;
}
static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	unsigned int offset, i;
	int err;

	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);

	err = fuse_simple_request(fc, &ap->args);

	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err && !offset && count >= PAGE_SIZE)
			SetPageUptodate(page);

		if (count > PAGE_SIZE - offset)
			count -= PAGE_SIZE - offset;
		else
			count = 0;
		offset = 0;

		unlock_page(page);
		put_page(page);
	}

	return err;
}

static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
		flush_dcache_page(page);

		iov_iter_advance(ii, tmp);
		if (!tmp) {
			unlock_page(page);
			put_page(page);
			bytes = min(bytes, iov_iter_single_seg_count(ii));
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}

static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
					 unsigned int max_pages)
{
	return min_t(unsigned int,
		     ((pos + len - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT) + 1,
		     max_pages);
}
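/*
 * Note (editorial): fuse_wr_pages() counts the page-cache pages touched
 * by a write of [pos, pos + len).  Worked example with 4K pages:
 * pos = 1000, len = 5000 covers bytes 1000..5999, i.e. pages 0 and 1,
 * and indeed ((5999 >> 12) - (1000 >> 12)) + 1 == 2.  The result is
 * clamped to max_pages.
 */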
static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}
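/*
 * Note (editorial): FUSE_I_SIZE_UNSTABLE is held across an extending
 * buffered write so that a racing attribute refresh cannot shrink i_size
 * while pages beyond the old EOF are still being sent; the loop issues
 * one FUSE_WRITE per batch of copied pages and turns the first short
 * write into -EIO to stop further batches.
 */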
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	ssize_t written_buffered = 0;
	struct inode *inode = mapping->host;
	ssize_t err;
	loff_t endbyte = 0;

	if (get_fuse_conn(inode)->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file);
		if (err)
			return err;

		return generic_file_write_iter(iocb, from);
	}

	inode_lock(inode);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos = iocb->ki_pos;
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;

		pos += written;

		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
		if (written >= 0)
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	return written ? written : err;
}
static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
					       unsigned int index,
					       unsigned int nr_pages)
{
	int i;

	for (i = index; i < index + nr_pages; i++)
		descs[i].length = PAGE_SIZE - descs[i].offset;
}

static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
}

static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	return min(iov_iter_single_seg_count(ii), max_size);
}

static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			ap->args.in_args[1].value = (void *) user_addr;
		else
			ap->args.out_args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
					 *nbytesp - nbytes,
					 max_pages - ap->num_pages,
					 &start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		ret += start;
		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

		ap->descs[ap->num_pages].offset = start;
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		ap->args.in_pages = 1;
	else
		ap->args.out_pages = 1;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	ia->io = io;
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes)
			break;
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);
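/*
 * Note (editorial): fuse_direct_io() slices the iterator into requests of
 * at most max_read/max_write bytes.  In async mode each fuse_io_args is
 * handed to fuse_async_req_send() and later freed by its completion
 * callback, which is why "ia" is reset to NULL after a successful send:
 * ownership has passed to the in-flight request.
 */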
static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t res;

	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
}

static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t res;

	/* Don't allow parallel writes to the same file */
	inode_lock(inode);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
		}
	}
	fuse_invalidate_attr(inode);
	if (res > 0)
		fuse_write_update_size(inode, iocb->ki_pos);
	inode_unlock(inode);

	return res;
}

static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_read_iter(iocb, to);
	else
		return fuse_direct_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_write_iter(iocb, from);
	else
		return fuse_direct_write_iter(iocb, from);
}
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	int i;

	for (i = 0; i < ap->num_pages; i++)
		__free_page(ap->pages[i]);

	if (wpa->ia.ff)
		fuse_file_put(wpa->ia.ff, false, false);

	kfree(ap->pages);
	kfree(wpa);
}

static void fuse_writepage_finish(struct fuse_conn *fc,
				  struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	int i;

	list_del(&wpa->writepages_entry);
	for (i = 0; i < ap->num_pages; i++) {
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
	}
	wake_up(&fi->page_waitq);
}

/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc,
				struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_writepage_args *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &wpa->ia.ap.args;
	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
	int err;

	fi->writectr++;
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

	err = fuse_simple_background(fc, args, GFP_ATOMIC);
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
		err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&fi->lock);
	}

	/* Fails on broken connection only */
	if (unlikely(err))
		goto out_free;

	return;

 out_free:
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	spin_unlock(&fi->lock);

	/* After fuse_writepage_finish() aux request list is private */
	for (aux = wpa->next; aux; aux = next) {
		next = aux->next;
		aux->next = NULL;
		fuse_writepage_free(aux);
	}

	fuse_writepage_free(wpa);
	spin_lock(&fi->lock);
}

/*
 * If fi->writectr is positive (no truncate or fsync going on) send
 * all queued writepage requests.
 *
 * Called with fi->lock held
 */
void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t crop = i_size_read(inode);
	struct fuse_writepage_args *wpa;

	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
		wpa = list_entry(fi->queued_writes.next,
				 struct fuse_writepage_args, queue_entry);
		list_del_init(&wpa->queue_entry);
		fuse_send_writepage(fc, wpa, crop);
	}
}
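/*
 * Note (editorial): "crop" carries the current i_size into
 * fuse_send_writepage(), which trims every queued WRITE against it:
 * requests entirely beyond EOF are dropped and straddling ones are
 * shortened, so delayed writepages cannot resurrect data past a truncate.
 */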
static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
			       int error)
{
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);

	mapping_set_error(inode->i_mapping, error);
	spin_lock(&fi->lock);
	while (wpa->next) {
		struct fuse_conn *fc = get_fuse_conn(inode);
		struct fuse_write_in *inarg = &wpa->ia.write.in;
		struct fuse_writepage_args *next = wpa->next;

		wpa->next = next->next;
		next->next = NULL;
		next->ia.ff = fuse_file_get(wpa->ia.ff);
		list_add(&next->writepages_entry, &fi->writepages);

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
	}
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
}

static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
					       struct fuse_inode *fi)
{
	struct fuse_file *ff = NULL;

	spin_lock(&fi->lock);
	if (!list_empty(&fi->write_files)) {
		ff = list_entry(fi->write_files.next, struct fuse_file,
				write_entry);
		fuse_file_get(ff);
	}
	spin_unlock(&fi->lock);

	return ff;
}

static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
					     struct fuse_inode *fi)
{
	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
	WARN_ON(!ff);
	return ff;
}

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff;
	int err;

	ff = __fuse_write_file_get(fc, fi);
	err = fuse_flush_times(inode, ff);
	if (ff)
		fuse_file_put(ff, false, false);

	return err;
}
33826ebb
MS
1769static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
1770{
1771 struct fuse_writepage_args *wpa;
1772 struct fuse_args_pages *ap;
1773
1774 wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
1775 if (wpa) {
1776 ap = &wpa->ia.ap;
1777 ap->num_pages = 0;
1778 ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
1779 if (!ap->pages) {
1780 kfree(wpa);
1781 wpa = NULL;
1782 }
1783 }
1784 return wpa;
1785
1786}
1787
3be5a52b
MS
1788static int fuse_writepage_locked(struct page *page)
1789{
1790 struct address_space *mapping = page->mapping;
1791 struct inode *inode = mapping->host;
1792 struct fuse_conn *fc = get_fuse_conn(inode);
1793 struct fuse_inode *fi = get_fuse_inode(inode);
33826ebb
MS
1794 struct fuse_writepage_args *wpa;
1795 struct fuse_args_pages *ap;
3be5a52b 1796 struct page *tmp_page;
72523425 1797 int error = -ENOMEM;
3be5a52b
MS
1798
1799 set_page_writeback(page);
1800
33826ebb
MS
1801 wpa = fuse_writepage_args_alloc();
1802 if (!wpa)
3be5a52b 1803 goto err;
33826ebb 1804 ap = &wpa->ia.ap;
3be5a52b
MS
1805
1806 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1807 if (!tmp_page)
1808 goto err_free;
1809
72523425 1810 error = -EIO;
33826ebb
MS
1811 wpa->ia.ff = fuse_write_file_get(fc, fi);
1812 if (!wpa->ia.ff)
27f1b363 1813 goto err_nofile;
72523425 1814
33826ebb 1815 fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
3be5a52b
MS
1816
1817 copy_highpage(tmp_page, page);
33826ebb
MS
1818 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
1819 wpa->next = NULL;
1820 ap->args.in_pages = true;
1821 ap->num_pages = 1;
1822 ap->pages[0] = tmp_page;
1823 ap->descs[0].offset = 0;
1824 ap->descs[0].length = PAGE_SIZE;
1825 ap->args.end = fuse_writepage_end;
1826 wpa->inode = inode;
3be5a52b 1827
93f78d88 1828 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
11fb9989 1829 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
3be5a52b 1830
f15ecfef 1831 spin_lock(&fi->lock);
33826ebb
MS
1832 list_add(&wpa->writepages_entry, &fi->writepages);
1833 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
3be5a52b 1834 fuse_flush_writepages(inode);
f15ecfef 1835 spin_unlock(&fi->lock);
3be5a52b 1836
4a4ac4eb
MP
1837 end_page_writeback(page);
1838
3be5a52b
MS
1839 return 0;
1840
27f1b363
MP
1841err_nofile:
1842 __free_page(tmp_page);
3be5a52b 1843err_free:
33826ebb 1844 kfree(wpa);
3be5a52b 1845err:
9183976e 1846 mapping_set_error(page->mapping, error);
3be5a52b 1847 end_page_writeback(page);
72523425 1848 return error;
3be5a52b
MS
1849}
1850
1851static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1852{
1853 int err;
1854
ff17be08
MS
1855 if (fuse_page_is_writeback(page->mapping->host, page->index)) {
1856 /*
1857 * ->writepages() should be called for sync() and friends. We
1858 * should only get here on direct reclaim and then we are
1859 * allowed to skip a page which is already in flight
1860 */
1861 WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
1862
1863 redirty_page_for_writepage(wbc, page);
d5880c7a 1864 unlock_page(page);
ff17be08
MS
1865 return 0;
1866 }
1867
3be5a52b
MS
1868 err = fuse_writepage_locked(page);
1869 unlock_page(page);
1870
1871 return err;
1872}
1873
26d614df 1874struct fuse_fill_wb_data {
33826ebb 1875 struct fuse_writepage_args *wpa;
26d614df
PE
1876 struct fuse_file *ff;
1877 struct inode *inode;
2d033eaa 1878 struct page **orig_pages;
33826ebb 1879 unsigned int max_pages;
26d614df
PE
1880};
1881
33826ebb
MS
1882static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
1883{
1884 struct fuse_args_pages *ap = &data->wpa->ia.ap;
1885 struct fuse_conn *fc = get_fuse_conn(data->inode);
1886 struct page **pages;
1887 struct fuse_page_desc *descs;
1888 unsigned int npages = min_t(unsigned int,
1889 max_t(unsigned int, data->max_pages * 2,
1890 FUSE_DEFAULT_MAX_PAGES_PER_REQ),
1891 fc->max_pages);
1892 WARN_ON(npages <= data->max_pages);
1893
1894 pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
1895 if (!pages)
1896 return false;
1897
1898 memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
1899 memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
1900 kfree(ap->pages);
1901 ap->pages = pages;
1902 ap->descs = descs;
1903 data->max_pages = npages;
1904
1905 return true;
1906}
1907
26d614df
PE
1908static void fuse_writepages_send(struct fuse_fill_wb_data *data)
1909{
33826ebb 1910 struct fuse_writepage_args *wpa = data->wpa;
26d614df 1911 struct inode *inode = data->inode;
26d614df 1912 struct fuse_inode *fi = get_fuse_inode(inode);
33826ebb 1913 int num_pages = wpa->ia.ap.num_pages;
2d033eaa 1914 int i;
26d614df 1915
33826ebb 1916 wpa->ia.ff = fuse_file_get(data->ff);
f15ecfef 1917 spin_lock(&fi->lock);
33826ebb 1918 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
26d614df 1919 fuse_flush_writepages(inode);
f15ecfef 1920 spin_unlock(&fi->lock);
2d033eaa
MP
1921
1922 for (i = 0; i < num_pages; i++)
1923 end_page_writeback(data->orig_pages[i]);
26d614df
PE
1924}
1925
7f305ca1
MS
1926/*
1927 * First recheck under fi->lock if the offending offset is still under
419234d5
MS
1928 * writeback. If so, iterate over the auxiliary write requests to see if one
1929 * has already been added for a page at this offset. If there's none, insert
1930 * this new request onto the auxiliary list; otherwise reuse the existing one by
7f305ca1
MS
1931 * copying the new page contents over to the old temporary page.
1932 */
33826ebb 1933static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
8b284dc4
MS
1934 struct page *page)
1935{
33826ebb
MS
1936 struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
1937 struct fuse_writepage_args *tmp;
1938 struct fuse_writepage_args *old_wpa;
1939 struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
8b284dc4 1940
33826ebb 1941 WARN_ON(new_ap->num_pages != 0);
8b284dc4 1942
f15ecfef 1943 spin_lock(&fi->lock);
33826ebb
MS
1944 list_del(&new_wpa->writepages_entry);
1945 old_wpa = fuse_find_writeback(fi, page->index, page->index);
1946 if (!old_wpa) {
1947 list_add(&new_wpa->writepages_entry, &fi->writepages);
f15ecfef 1948 spin_unlock(&fi->lock);
2fe93bd4 1949 return false;
f6011081 1950 }
8b284dc4 1951
33826ebb
MS
1952 new_ap->num_pages = 1;
1953 for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
7f305ca1
MS
1954 pgoff_t curr_index;
1955
33826ebb
MS
1956 WARN_ON(tmp->inode != new_wpa->inode);
1957 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
419234d5 1958 if (curr_index == page->index) {
33826ebb
MS
1959 WARN_ON(tmp->ia.ap.num_pages != 1);
1960 swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
7f305ca1 1961 break;
8b284dc4
MS
1962 }
1963 }
1964
7f305ca1 1965 if (!tmp) {
33826ebb
MS
1966 new_wpa->next = old_wpa->next;
1967 old_wpa->next = new_wpa;
7f305ca1 1968 }
41b6e41f 1969
f15ecfef 1970 spin_unlock(&fi->lock);
7f305ca1
MS
1971
1972 if (tmp) {
33826ebb 1973 struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
8b284dc4 1974
93f78d88 1975 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
33826ebb 1976 dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
93f78d88 1977 wb_writeout_inc(&bdi->wb);
33826ebb 1978 fuse_writepage_free(new_wpa);
8b284dc4 1979 }
7f305ca1 1980
2fe93bd4 1981 return true;
8b284dc4
MS
1982}
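/*
 * Shape of the in-flight chain handled above (illustrative sketch):
 *
 *   old_wpa (in flight) --next--> aux1 --next--> aux2 --next--> NULL
 *
 * A page redirtied while old_wpa is in flight either replaces the
 * temporary page of a matching aux entry (the swap + free path above) or
 * is linked in as a new aux entry; fuse_writepage_end() then sends the
 * aux requests one by one once the primary completes.
 */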
1983
26d614df
PE
1984static int fuse_writepages_fill(struct page *page,
1985 struct writeback_control *wbc, void *_data)
1986{
1987 struct fuse_fill_wb_data *data = _data;
33826ebb
MS
1988 struct fuse_writepage_args *wpa = data->wpa;
1989 struct fuse_args_pages *ap = &wpa->ia.ap;
26d614df 1990 struct inode *inode = data->inode;
f15ecfef 1991 struct fuse_inode *fi = get_fuse_inode(inode);
26d614df
PE
1992 struct fuse_conn *fc = get_fuse_conn(inode);
1993 struct page *tmp_page;
8b284dc4 1994 bool is_writeback;
26d614df
PE
1995 int err;
1996
1997 if (!data->ff) {
1998 err = -EIO;
1999 data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
2000 if (!data->ff)
2001 goto out_unlock;
2002 }
2003
8b284dc4
MS
2004 /*
2005 * Being under writeback is unlikely but possible. For example direct
2006 * read to an mmaped fuse file will set the page dirty twice; once when
2007	 * the pages are faulted with get_user_pages(), and then again after
2008	 * the read has completed.
2009 */
2010 is_writeback = fuse_page_is_writeback(inode, page->index);
2011
33826ebb
MS
2012 if (wpa && ap->num_pages &&
2013 (is_writeback || ap->num_pages == fc->max_pages ||
2014 (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
2015 data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
8b284dc4 2016 fuse_writepages_send(data);
33826ebb
MS
2017 data->wpa = NULL;
2018 } else if (wpa && ap->num_pages == data->max_pages) {
2019 if (!fuse_pages_realloc(data)) {
e52a8250 2020 fuse_writepages_send(data);
33826ebb 2021 data->wpa = NULL;
e52a8250 2022 }
26d614df 2023 }
e52a8250 2024
26d614df
PE
2025 err = -ENOMEM;
2026 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2027 if (!tmp_page)
2028 goto out_unlock;
2029
2030 /*
2031 * The page must not be redirtied until the writeout is completed
2032 * (i.e. userspace has sent a reply to the write request). Otherwise
2033 * there could be more than one temporary page instance for each real
2034 * page.
2035 *
2036 * This is ensured by holding the page lock in page_mkwrite() while
2037 * checking fuse_page_is_writeback(). We already hold the page lock
2038 * since clear_page_dirty_for_io() and keep it held until we add the
33826ebb 2039 * request to the fi->writepages list and increment ap->num_pages.
26d614df
PE
2040 * After this fuse_page_is_writeback() will indicate that the page is
2041 * under writeback, so we can release the page lock.
2042 */
33826ebb 2043 if (data->wpa == NULL) {
26d614df
PE
2044 struct fuse_inode *fi = get_fuse_inode(inode);
2045
2046 err = -ENOMEM;
33826ebb
MS
2047 wpa = fuse_writepage_args_alloc();
2048 if (!wpa) {
26d614df
PE
2049 __free_page(tmp_page);
2050 goto out_unlock;
2051 }
33826ebb 2052 data->max_pages = 1;
26d614df 2053
33826ebb
MS
2054 ap = &wpa->ia.ap;
2055 fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
2056 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
2057 wpa->next = NULL;
2058 ap->args.in_pages = true;
2059 ap->args.end = fuse_writepage_end;
2060 ap->num_pages = 0;
2061 wpa->inode = inode;
26d614df 2062
f15ecfef 2063 spin_lock(&fi->lock);
33826ebb 2064 list_add(&wpa->writepages_entry, &fi->writepages);
f15ecfef 2065 spin_unlock(&fi->lock);
26d614df 2066
33826ebb 2067 data->wpa = wpa;
26d614df
PE
2068 }
2069 set_page_writeback(page);
2070
2071 copy_highpage(tmp_page, page);
33826ebb
MS
2072 ap->pages[ap->num_pages] = tmp_page;
2073 ap->descs[ap->num_pages].offset = 0;
2074 ap->descs[ap->num_pages].length = PAGE_SIZE;
26d614df 2075
93f78d88 2076 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
11fb9989 2077 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
8b284dc4
MS
2078
2079 err = 0;
33826ebb 2080 if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
8b284dc4 2081 end_page_writeback(page);
33826ebb 2082 data->wpa = NULL;
8b284dc4
MS
2083 goto out_unlock;
2084 }
33826ebb 2085 data->orig_pages[ap->num_pages] = page;
26d614df
PE
2086
2087 /*
f15ecfef 2088 * Protected by fi->lock against concurrent access by
26d614df
PE
2089 * fuse_page_is_writeback().
2090 */
f15ecfef 2091 spin_lock(&fi->lock);
33826ebb 2092 ap->num_pages++;
f15ecfef 2093 spin_unlock(&fi->lock);
26d614df 2094
26d614df
PE
2095out_unlock:
2096 unlock_page(page);
2097
2098 return err;
2099}
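/*
 * Batching sketch (assuming 4K pages, fc->max_write == 128K and
 * fc->max_pages == 32): fuse_writepages_fill() keeps appending pages to
 * data->wpa as long as their indices are contiguous, so dirty pages
 * 10..41 become one 32-page WRITE, while page 50 then starts a new
 * request. A batch is also flushed early when the next page is already
 * under writeback or when one more page would exceed fc->max_write.
 */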
2100
2101static int fuse_writepages(struct address_space *mapping,
2102 struct writeback_control *wbc)
2103{
2104 struct inode *inode = mapping->host;
5da784cc 2105 struct fuse_conn *fc = get_fuse_conn(inode);
26d614df
PE
2106 struct fuse_fill_wb_data data;
2107 int err;
2108
2109 err = -EIO;
2110 if (is_bad_inode(inode))
2111 goto out;
2112
2113 data.inode = inode;
33826ebb 2114 data.wpa = NULL;
26d614df
PE
2115 data.ff = NULL;
2116
2d033eaa 2117 err = -ENOMEM;
5da784cc 2118 data.orig_pages = kcalloc(fc->max_pages,
f2b3455e 2119 sizeof(struct page *),
2d033eaa
MP
2120 GFP_NOFS);
2121 if (!data.orig_pages)
2122 goto out;
2123
26d614df 2124 err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
33826ebb 2125 if (data.wpa) {
26d614df 2126 /* Ignore errors if we can write at least one page */
33826ebb 2127 WARN_ON(!data.wpa->ia.ap.num_pages);
26d614df
PE
2128 fuse_writepages_send(&data);
2129 err = 0;
2130 }
2131 if (data.ff)
2e64ff15 2132 fuse_file_put(data.ff, false, false);
2d033eaa
MP
2133
2134 kfree(data.orig_pages);
26d614df
PE
2135out:
2136 return err;
2137}
2138
6b12c1b3
PE
2139/*
2140 * It would be worthwhile to make sure that space is reserved on disk for the
2141 * write, but how to implement that without killing performance needs more thought.
2142 */
2143static int fuse_write_begin(struct file *file, struct address_space *mapping,
2144 loff_t pos, unsigned len, unsigned flags,
2145 struct page **pagep, void **fsdata)
2146{
09cbfeaf 2147 pgoff_t index = pos >> PAGE_SHIFT;
a455589f 2148 struct fuse_conn *fc = get_fuse_conn(file_inode(file));
6b12c1b3
PE
2149 struct page *page;
2150 loff_t fsize;
2151 int err = -ENOMEM;
2152
2153 WARN_ON(!fc->writeback_cache);
2154
2155 page = grab_cache_page_write_begin(mapping, index, flags);
2156 if (!page)
2157 goto error;
2158
2159 fuse_wait_on_page_writeback(mapping->host, page->index);
2160
09cbfeaf 2161 if (PageUptodate(page) || len == PAGE_SIZE)
6b12c1b3
PE
2162 goto success;
2163 /*
2164	 * Check if the start of this page comes after the end of file, in which
2165 * case the readpage can be optimized away.
2166 */
2167 fsize = i_size_read(mapping->host);
09cbfeaf
KS
2168 if (fsize <= (pos & PAGE_MASK)) {
2169 size_t off = pos & ~PAGE_MASK;
6b12c1b3
PE
2170 if (off)
2171 zero_user_segment(page, 0, off);
2172 goto success;
2173 }
2174 err = fuse_do_readpage(file, page);
2175 if (err)
2176 goto cleanup;
2177success:
2178 *pagep = page;
2179 return 0;
2180
2181cleanup:
2182 unlock_page(page);
09cbfeaf 2183 put_page(page);
6b12c1b3
PE
2184error:
2185 return err;
2186}
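/*
 * Sketch of the readpage short cut above (assuming 4K pages): with
 * i_size == 5000, a 100-byte write at pos == 8300 targets the page
 * covering offsets 8192..12287, which starts beyond EOF; bytes 0..107 of
 * the fresh page are zeroed and the fuse_do_readpage() round trip is
 * skipped.
 */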
2187
2188static int fuse_write_end(struct file *file, struct address_space *mapping,
2189 loff_t pos, unsigned len, unsigned copied,
2190 struct page *page, void *fsdata)
2191{
2192 struct inode *inode = page->mapping->host;
2193
59c3b76c
MS
2194 /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
2195 if (!copied)
2196 goto unlock;
2197
6b12c1b3
PE
2198 if (!PageUptodate(page)) {
2199 /* Zero any unwritten bytes at the end of the page */
09cbfeaf 2200 size_t endoff = (pos + copied) & ~PAGE_MASK;
6b12c1b3 2201 if (endoff)
09cbfeaf 2202 zero_user_segment(page, endoff, PAGE_SIZE);
6b12c1b3
PE
2203 SetPageUptodate(page);
2204 }
2205
2206 fuse_write_update_size(inode, pos + copied);
2207 set_page_dirty(page);
59c3b76c
MS
2208
2209unlock:
6b12c1b3 2210 unlock_page(page);
09cbfeaf 2211 put_page(page);
6b12c1b3
PE
2212
2213 return copied;
2214}
2215
3be5a52b
MS
2216static int fuse_launder_page(struct page *page)
2217{
2218 int err = 0;
2219 if (clear_page_dirty_for_io(page)) {
2220 struct inode *inode = page->mapping->host;
2221 err = fuse_writepage_locked(page);
2222 if (!err)
2223 fuse_wait_on_page_writeback(inode, page->index);
2224 }
2225 return err;
2226}
2227
2228/*
2229 * Write back dirty pages now, because there may not be any suitable
2230 * open files later
2231 */
2232static void fuse_vma_close(struct vm_area_struct *vma)
2233{
2234 filemap_write_and_wait(vma->vm_file->f_mapping);
2235}
2236
2237/*
2238 * Wait for writeback against this page to complete before allowing it
2239 * to be marked dirty again, and hence written back again, possibly
2240 * before the previous writepage completed.
2241 *
2242 * Block here, instead of in ->writepage(), so that the userspace fs
2243 * can only block processes actually operating on the filesystem.
2244 *
2245 * Otherwise an unprivileged userspace fs would be able to block
2246 * unrelated activities:
2247 *
2248 * - page migration
2249 * - sync(2)
2250 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
2251 */
46fb504a 2252static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
3be5a52b 2253{
c2ec175c 2254 struct page *page = vmf->page;
11bac800 2255 struct inode *inode = file_inode(vmf->vma->vm_file);
cca24370 2256
11bac800 2257 file_update_time(vmf->vma->vm_file);
cca24370
MS
2258 lock_page(page);
2259 if (page->mapping != inode->i_mapping) {
2260 unlock_page(page);
2261 return VM_FAULT_NOPAGE;
2262 }
3be5a52b
MS
2263
2264 fuse_wait_on_page_writeback(inode, page->index);
cca24370 2265 return VM_FAULT_LOCKED;
3be5a52b
MS
2266}
2267
f0f37e2f 2268static const struct vm_operations_struct fuse_file_vm_ops = {
3be5a52b
MS
2269 .close = fuse_vma_close,
2270 .fault = filemap_fault,
f1820361 2271 .map_pages = filemap_map_pages,
3be5a52b
MS
2272 .page_mkwrite = fuse_page_mkwrite,
2273};
2274
2275static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
2276{
55752a3a
MS
2277 struct fuse_file *ff = file->private_data;
2278
2279 if (ff->open_flags & FOPEN_DIRECT_IO) {
2280 /* Can't provide the coherency needed for MAP_SHARED */
2281 if (vma->vm_flags & VM_MAYSHARE)
2282 return -ENODEV;
2283
2284 invalidate_inode_pages2(file->f_mapping);
2285
2286 return generic_file_mmap(file, vma);
2287 }
2288
650b22b9
PE
2289 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
2290 fuse_link_write_file(file);
2291
3be5a52b
MS
2292 file_accessed(file);
2293 vma->vm_ops = &fuse_file_vm_ops;
b6aeaded
MS
2294 return 0;
2295}
2296
0b6e9ea0
SF
2297static int convert_fuse_file_lock(struct fuse_conn *fc,
2298 const struct fuse_file_lock *ffl,
71421259
MS
2299 struct file_lock *fl)
2300{
2301 switch (ffl->type) {
2302 case F_UNLCK:
2303 break;
2304
2305 case F_RDLCK:
2306 case F_WRLCK:
2307 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
2308 ffl->end < ffl->start)
2309 return -EIO;
2310
2311 fl->fl_start = ffl->start;
2312 fl->fl_end = ffl->end;
0b6e9ea0
SF
2313
2314 /*
9d5b86ac
BC
2315 * Convert pid into init's pid namespace. The locks API will
2316 * translate it into the caller's pid namespace.
0b6e9ea0
SF
2317 */
2318 rcu_read_lock();
9d5b86ac 2319 fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
0b6e9ea0 2320 rcu_read_unlock();
71421259
MS
2321 break;
2322
2323 default:
2324 return -EIO;
2325 }
2326 fl->fl_type = ffl->type;
2327 return 0;
2328}
2329
7078187a 2330static void fuse_lk_fill(struct fuse_args *args, struct file *file,
a9ff4f87 2331 const struct file_lock *fl, int opcode, pid_t pid,
7078187a 2332 int flock, struct fuse_lk_in *inarg)
71421259 2333{
6131ffaa 2334 struct inode *inode = file_inode(file);
9c8ef561 2335 struct fuse_conn *fc = get_fuse_conn(inode);
71421259 2336 struct fuse_file *ff = file->private_data;
7078187a
MS
2337
2338 memset(inarg, 0, sizeof(*inarg));
2339 inarg->fh = ff->fh;
2340 inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
2341 inarg->lk.start = fl->fl_start;
2342 inarg->lk.end = fl->fl_end;
2343 inarg->lk.type = fl->fl_type;
2344 inarg->lk.pid = pid;
a9ff4f87 2345 if (flock)
7078187a 2346 inarg->lk_flags |= FUSE_LK_FLOCK;
d5b48543
MS
2347 args->opcode = opcode;
2348 args->nodeid = get_node_id(inode);
2349 args->in_numargs = 1;
2350 args->in_args[0].size = sizeof(*inarg);
2351 args->in_args[0].value = inarg;
71421259
MS
2352}
2353
2354static int fuse_getlk(struct file *file, struct file_lock *fl)
2355{
6131ffaa 2356 struct inode *inode = file_inode(file);
71421259 2357 struct fuse_conn *fc = get_fuse_conn(inode);
7078187a
MS
2358 FUSE_ARGS(args);
2359 struct fuse_lk_in inarg;
71421259
MS
2360 struct fuse_lk_out outarg;
2361 int err;
2362
7078187a 2363 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
d5b48543
MS
2364 args.out_numargs = 1;
2365 args.out_args[0].size = sizeof(outarg);
2366 args.out_args[0].value = &outarg;
7078187a 2367 err = fuse_simple_request(fc, &args);
71421259 2368 if (!err)
0b6e9ea0 2369 err = convert_fuse_file_lock(fc, &outarg.lk, fl);
71421259
MS
2370
2371 return err;
2372}
2373
a9ff4f87 2374static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
71421259 2375{
6131ffaa 2376 struct inode *inode = file_inode(file);
71421259 2377 struct fuse_conn *fc = get_fuse_conn(inode);
7078187a
MS
2378 FUSE_ARGS(args);
2379 struct fuse_lk_in inarg;
71421259 2380 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
0b6e9ea0
SF
2381 struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
2382 pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
71421259
MS
2383 int err;
2384
8fb47a4f 2385 if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
48e90761
MS
2386 /* NLM needs asynchronous locks, which we don't support yet */
2387 return -ENOLCK;
2388 }
2389
71421259 2390 /* Unlock on close is handled by the flush method */
50f2112c 2391 if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
71421259
MS
2392 return 0;
2393
0b6e9ea0 2394 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
7078187a 2395 err = fuse_simple_request(fc, &args);
71421259 2396
a4d27e75
MS
2397 /* locking is restartable */
2398 if (err == -EINTR)
2399 err = -ERESTARTSYS;
7078187a 2400
71421259
MS
2401 return err;
2402}
2403
2404static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
2405{
6131ffaa 2406 struct inode *inode = file_inode(file);
71421259
MS
2407 struct fuse_conn *fc = get_fuse_conn(inode);
2408 int err;
2409
48e90761
MS
2410 if (cmd == F_CANCELLK) {
2411 err = 0;
2412 } else if (cmd == F_GETLK) {
71421259 2413 if (fc->no_lock) {
9d6a8c5c 2414 posix_test_lock(file, fl);
71421259
MS
2415 err = 0;
2416 } else
2417 err = fuse_getlk(file, fl);
2418 } else {
2419 if (fc->no_lock)
48e90761 2420 err = posix_lock_file(file, fl, NULL);
71421259 2421 else
a9ff4f87 2422 err = fuse_setlk(file, fl, 0);
71421259
MS
2423 }
2424 return err;
2425}
2426
a9ff4f87
MS
2427static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
2428{
6131ffaa 2429 struct inode *inode = file_inode(file);
a9ff4f87
MS
2430 struct fuse_conn *fc = get_fuse_conn(inode);
2431 int err;
2432
37fb3a30 2433 if (fc->no_flock) {
4f656367 2434 err = locks_lock_file_wait(file, fl);
a9ff4f87 2435 } else {
37fb3a30
MS
2436 struct fuse_file *ff = file->private_data;
2437
a9ff4f87 2438 /* emulate flock with POSIX locks */
37fb3a30 2439 ff->flock = true;
a9ff4f87
MS
2440 err = fuse_setlk(file, fl, 1);
2441 }
2442
2443 return err;
2444}
2445
b2d2272f
MS
2446static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2447{
2448 struct inode *inode = mapping->host;
2449 struct fuse_conn *fc = get_fuse_conn(inode);
7078187a 2450 FUSE_ARGS(args);
b2d2272f
MS
2451 struct fuse_bmap_in inarg;
2452 struct fuse_bmap_out outarg;
2453 int err;
2454
2455 if (!inode->i_sb->s_bdev || fc->no_bmap)
2456 return 0;
2457
b2d2272f
MS
2458 memset(&inarg, 0, sizeof(inarg));
2459 inarg.block = block;
2460 inarg.blocksize = inode->i_sb->s_blocksize;
d5b48543
MS
2461 args.opcode = FUSE_BMAP;
2462 args.nodeid = get_node_id(inode);
2463 args.in_numargs = 1;
2464 args.in_args[0].size = sizeof(inarg);
2465 args.in_args[0].value = &inarg;
2466 args.out_numargs = 1;
2467 args.out_args[0].size = sizeof(outarg);
2468 args.out_args[0].value = &outarg;
7078187a 2469 err = fuse_simple_request(fc, &args);
b2d2272f
MS
2470 if (err == -ENOSYS)
2471 fc->no_bmap = 1;
2472
2473 return err ? 0 : outarg.block;
2474}
2475
0b5da8db
R
2476static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2477{
2478 struct inode *inode = file->f_mapping->host;
2479 struct fuse_conn *fc = get_fuse_conn(inode);
2480 struct fuse_file *ff = file->private_data;
2481 FUSE_ARGS(args);
2482 struct fuse_lseek_in inarg = {
2483 .fh = ff->fh,
2484 .offset = offset,
2485 .whence = whence
2486 };
2487 struct fuse_lseek_out outarg;
2488 int err;
2489
2490 if (fc->no_lseek)
2491 goto fallback;
2492
d5b48543
MS
2493 args.opcode = FUSE_LSEEK;
2494 args.nodeid = ff->nodeid;
2495 args.in_numargs = 1;
2496 args.in_args[0].size = sizeof(inarg);
2497 args.in_args[0].value = &inarg;
2498 args.out_numargs = 1;
2499 args.out_args[0].size = sizeof(outarg);
2500 args.out_args[0].value = &outarg;
0b5da8db
R
2501 err = fuse_simple_request(fc, &args);
2502 if (err) {
2503 if (err == -ENOSYS) {
2504 fc->no_lseek = 1;
2505 goto fallback;
2506 }
2507 return err;
2508 }
2509
2510 return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
2511
2512fallback:
5b97eeac 2513 err = fuse_update_attributes(inode, file);
0b5da8db
R
2514 if (!err)
2515 return generic_file_llseek(file, offset, whence);
2516 else
2517 return err;
2518}
2519
965c8e59 2520static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
5559b8f4
MS
2521{
2522 loff_t retval;
6131ffaa 2523 struct inode *inode = file_inode(file);
5559b8f4 2524
0b5da8db
R
2525 switch (whence) {
2526 case SEEK_SET:
2527 case SEEK_CUR:
2528 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
965c8e59 2529 retval = generic_file_llseek(file, offset, whence);
0b5da8db
R
2530 break;
2531 case SEEK_END:
5955102c 2532 inode_lock(inode);
5b97eeac 2533 retval = fuse_update_attributes(inode, file);
0b5da8db
R
2534 if (!retval)
2535 retval = generic_file_llseek(file, offset, whence);
5955102c 2536 inode_unlock(inode);
0b5da8db
R
2537 break;
2538 case SEEK_HOLE:
2539 case SEEK_DATA:
5955102c 2540 inode_lock(inode);
0b5da8db 2541 retval = fuse_lseek(file, offset, whence);
5955102c 2542 inode_unlock(inode);
0b5da8db
R
2543 break;
2544 default:
2545 retval = -EINVAL;
2546 }
c07c3d19 2547
5559b8f4
MS
2548 return retval;
2549}
2550
d9d318d3
MS
2551/*
2552 * CUSE servers compiled on 32bit broke on 64bit kernels because the
2553 * ABI was defined to be 'struct iovec' which is different on 32bit
2554 * and 64bit. Fortunately we can determine which structure the server
2555 * used from the size of the reply.
2556 */
1baa26b2
MS
2557static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
2558 size_t transferred, unsigned count,
2559 bool is_compat)
d9d318d3
MS
2560{
2561#ifdef CONFIG_COMPAT
2562 if (count * sizeof(struct compat_iovec) == transferred) {
2563 struct compat_iovec *ciov = src;
2564 unsigned i;
2565
2566 /*
2567 * With this interface a 32bit server cannot support
2568		 * non-compat (i.e. those coming from 64bit apps) ioctl
2569 * requests
2570 */
2571 if (!is_compat)
2572 return -EINVAL;
2573
2574 for (i = 0; i < count; i++) {
2575 dst[i].iov_base = compat_ptr(ciov[i].iov_base);
2576 dst[i].iov_len = ciov[i].iov_len;
2577 }
2578 return 0;
2579 }
2580#endif
2581
2582 if (count * sizeof(struct iovec) != transferred)
2583 return -EIO;
2584
2585 memcpy(dst, src, transferred);
2586 return 0;
2587}
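/*
 * Disambiguation by size, worked through (a sketch for x86-64, where
 * sizeof(struct compat_iovec) == 8 and sizeof(struct iovec) == 16):
 * a reply carrying count == 2 iovecs is 16 bytes if the server used the
 * compat layout and 32 bytes if it used the native one, so `transferred'
 * alone identifies the ABI. The scheme is ambiguous only for count == 0,
 * where there is nothing to copy anyway.
 */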
2588
7572777e 2589/* Make sure iov_length() won't overflow */
5da784cc
CS
2590static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
2591 size_t count)
7572777e
MS
2592{
2593 size_t n;
5da784cc 2594 u32 max = fc->max_pages << PAGE_SHIFT;
7572777e 2595
fb6ccff6 2596 for (n = 0; n < count; n++, iov++) {
7572777e
MS
2597 if (iov->iov_len > (size_t) max)
2598 return -ENOMEM;
2599 max -= iov->iov_len;
2600 }
2601 return 0;
2602}
2603
1baa26b2
MS
2604static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
2605 void *src, size_t transferred, unsigned count,
2606 bool is_compat)
2607{
2608 unsigned i;
2609 struct fuse_ioctl_iovec *fiov = src;
2610
2611 if (fc->minor < 16) {
2612 return fuse_copy_ioctl_iovec_old(dst, src, transferred,
2613 count, is_compat);
2614 }
2615
2616 if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
2617 return -EIO;
2618
2619 for (i = 0; i < count; i++) {
2620 /* Did the server supply an inappropriate value? */
2621 if (fiov[i].base != (unsigned long) fiov[i].base ||
2622 fiov[i].len != (unsigned long) fiov[i].len)
2623 return -EIO;
2624
2625 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
2626 dst[i].iov_len = (size_t) fiov[i].len;
2627
2628#ifdef CONFIG_COMPAT
2629 if (is_compat &&
2630 (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
2631 (compat_size_t) dst[i].iov_len != fiov[i].len))
2632 return -EIO;
2633#endif
2634 }
2635
2636 return 0;
2637}
2638
2639
59efec7b
TH
2640/*
2641 * For ioctls, there is no generic way to determine how much memory
2642 * needs to be read and/or written. Furthermore, ioctls are allowed
2643 * to dereference the passed pointer, so the parameter requires deep
2644 * copying but FUSE has no idea whatsoever about what to copy in or
2645 * out.
2646 *
2647 * This is solved by allowing FUSE server to retry ioctl with
2648 * necessary in/out iovecs. Let's assume the ioctl implementation
2649 * needs to read in the following structure.
2650 *
2651 * struct a {
2652 * char *buf;
2653 * size_t buflen;
2654 * };
2655 *
2656 * On the first callout to FUSE server, inarg->in_size and
2657 * inarg->out_size will be zero; then, the server completes the ioctl
2658 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
2659 * the actual iov array to
2660 *
2661 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
2662 *
2663 * which tells FUSE to copy in the requested area and retry the ioctl.
2664 * On the second round, the server has access to the structure and
2665 * from that it can tell what to look for next, so on this invocation,
2666 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
2667 *
2668 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
2669 * { .iov_base = a.buf, .iov_len = a.buflen } }
2670 *
2671 * FUSE will copy both struct a and the pointed buffer from the
2672 * process doing the ioctl and retry ioctl with both struct a and the
2673 * buffer.
2674 *
2675 * This time, FUSE server has everything it needs and completes ioctl
2676 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
2677 *
2678 * Copying data out works the same way.
2679 *
2680 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
2681 * automatically initializes in and out iovs by decoding @cmd with
2682 * _IOC_* macros and the server is not allowed to request RETRY. This
2683 * limits ioctl data transfers to well-formed ioctls and is the forced
2684 * behavior for all FUSE servers. See the sketch after fuse_do_ioctl().
2685 */
08cbf542
TH
2686long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2687 unsigned int flags)
59efec7b 2688{
59efec7b 2689 struct fuse_file *ff = file->private_data;
d36f2487 2690 struct fuse_conn *fc = ff->fc;
59efec7b
TH
2691 struct fuse_ioctl_in inarg = {
2692 .fh = ff->fh,
2693 .cmd = cmd,
2694 .arg = arg,
2695 .flags = flags
2696 };
2697 struct fuse_ioctl_out outarg;
8ac83505 2698 struct iovec *iov_page = NULL;
59efec7b 2699 struct iovec *in_iov = NULL, *out_iov = NULL;
093f38a2
MS
2700 unsigned int in_iovs = 0, out_iovs = 0, max_pages;
2701 size_t in_size, out_size, c;
2702 ssize_t transferred;
acbe5fda
MS
2703 int err, i;
2704 struct iov_iter ii;
093f38a2 2705 struct fuse_args_pages ap = {};
59efec7b 2706
1baa26b2
MS
2707#if BITS_PER_LONG == 32
2708 inarg.flags |= FUSE_IOCTL_32BIT;
2709#else
6407f44a 2710 if (flags & FUSE_IOCTL_COMPAT) {
1baa26b2 2711 inarg.flags |= FUSE_IOCTL_32BIT;
6407f44a
IA
2712#ifdef CONFIG_X86_X32
2713 if (in_x32_syscall())
2714 inarg.flags |= FUSE_IOCTL_COMPAT_X32;
2715#endif
2716 }
1baa26b2
MS
2717#endif
2718
59efec7b 2719 /* assume all the iovs returned by client always fits in a page */
1baa26b2 2720 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
59efec7b 2721
59efec7b 2722 err = -ENOMEM;
093f38a2 2723 ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
8ac83505 2724 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
093f38a2 2725 if (!ap.pages || !iov_page)
59efec7b
TH
2726 goto out;
2727
093f38a2
MS
2728 fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
2729
59efec7b
TH
2730 /*
2731 * If restricted, initialize IO parameters as encoded in @cmd.
2732 * RETRY from server is not allowed.
2733 */
2734 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
8ac83505 2735 struct iovec *iov = iov_page;
59efec7b 2736
c9f0523d 2737 iov->iov_base = (void __user *)arg;
59efec7b
TH
2738 iov->iov_len = _IOC_SIZE(cmd);
2739
2740 if (_IOC_DIR(cmd) & _IOC_WRITE) {
2741 in_iov = iov;
2742 in_iovs = 1;
2743 }
2744
2745 if (_IOC_DIR(cmd) & _IOC_READ) {
2746 out_iov = iov;
2747 out_iovs = 1;
2748 }
2749 }
2750
2751 retry:
2752 inarg.in_size = in_size = iov_length(in_iov, in_iovs);
2753 inarg.out_size = out_size = iov_length(out_iov, out_iovs);
2754
2755 /*
2756 * Out data can be used either for actual out data or iovs,
2757 * make sure there always is at least one page.
2758 */
2759 out_size = max_t(size_t, out_size, PAGE_SIZE);
2760 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
2761
2762 /* make sure there are enough buffer pages and init request with them */
2763 err = -ENOMEM;
5da784cc 2764 if (max_pages > fc->max_pages)
59efec7b 2765 goto out;
093f38a2
MS
2766 while (ap.num_pages < max_pages) {
2767 ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2768 if (!ap.pages[ap.num_pages])
59efec7b 2769 goto out;
093f38a2 2770 ap.num_pages++;
59efec7b
TH
2771 }
2772
59efec7b
TH
2773
2774 /* okay, let's send it to the client */
093f38a2
MS
2775 ap.args.opcode = FUSE_IOCTL;
2776 ap.args.nodeid = ff->nodeid;
2777 ap.args.in_numargs = 1;
2778 ap.args.in_args[0].size = sizeof(inarg);
2779 ap.args.in_args[0].value = &inarg;
59efec7b 2780 if (in_size) {
093f38a2
MS
2781 ap.args.in_numargs++;
2782 ap.args.in_args[1].size = in_size;
2783 ap.args.in_pages = true;
59efec7b 2784
acbe5fda
MS
2785 err = -EFAULT;
2786 iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
093f38a2
MS
2787 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
2788 c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
acbe5fda
MS
2789 if (c != PAGE_SIZE && iov_iter_count(&ii))
2790 goto out;
2791 }
59efec7b
TH
2792 }
2793
093f38a2
MS
2794 ap.args.out_numargs = 2;
2795 ap.args.out_args[0].size = sizeof(outarg);
2796 ap.args.out_args[0].value = &outarg;
2797 ap.args.out_args[1].size = out_size;
2798 ap.args.out_pages = true;
2799 ap.args.out_argvar = true;
59efec7b 2800
093f38a2
MS
2801 transferred = fuse_simple_request(fc, &ap.args);
2802 err = transferred;
2803 if (transferred < 0)
59efec7b
TH
2804 goto out;
2805
2806 /* did it ask for retry? */
2807 if (outarg.flags & FUSE_IOCTL_RETRY) {
8ac83505 2808 void *vaddr;
59efec7b
TH
2809
2810 /* no retry if in restricted mode */
2811 err = -EIO;
2812 if (!(flags & FUSE_IOCTL_UNRESTRICTED))
2813 goto out;
2814
2815 in_iovs = outarg.in_iovs;
2816 out_iovs = outarg.out_iovs;
2817
2818 /*
2819 * Make sure things are in boundary, separate checks
2820 * are to protect against overflow.
2821 */
2822 err = -ENOMEM;
2823 if (in_iovs > FUSE_IOCTL_MAX_IOV ||
2824 out_iovs > FUSE_IOCTL_MAX_IOV ||
2825 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
2826 goto out;
2827
093f38a2 2828 vaddr = kmap_atomic(ap.pages[0]);
1baa26b2 2829 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
d9d318d3
MS
2830 transferred, in_iovs + out_iovs,
2831 (flags & FUSE_IOCTL_COMPAT) != 0);
2408f6ef 2832 kunmap_atomic(vaddr);
d9d318d3
MS
2833 if (err)
2834 goto out;
59efec7b 2835
8ac83505 2836 in_iov = iov_page;
59efec7b
TH
2837 out_iov = in_iov + in_iovs;
2838
5da784cc 2839 err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
7572777e
MS
2840 if (err)
2841 goto out;
2842
5da784cc 2843 err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
7572777e
MS
2844 if (err)
2845 goto out;
2846
59efec7b
TH
2847 goto retry;
2848 }
2849
2850 err = -EIO;
2851 if (transferred > inarg.out_size)
2852 goto out;
2853
acbe5fda
MS
2854 err = -EFAULT;
2855 iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
093f38a2
MS
2856 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
2857 c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
acbe5fda
MS
2858 if (c != PAGE_SIZE && iov_iter_count(&ii))
2859 goto out;
2860 }
2861 err = 0;
59efec7b 2862 out:
8ac83505 2863 free_page((unsigned long) iov_page);
093f38a2
MS
2864 while (ap.num_pages)
2865 __free_page(ap.pages[--ap.num_pages]);
2866 kfree(ap.pages);
59efec7b
TH
2867
2868 return err ? err : outarg.result;
2869}
08cbf542 2870EXPORT_SYMBOL_GPL(fuse_do_ioctl);
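/*
 * A minimal sketch of the server's side of the retry protocol described
 * above fuse_do_ioctl(). It is illustrative only: send_reply() is a
 * hypothetical helper standing in for however the server writes a reply
 * to /dev/fuse, and struct a is the example structure from the comment;
 * fuse_ioctl_out, fuse_ioctl_iovec and FUSE_IOCTL_RETRY are the real
 * <linux/fuse.h> ABI.
 */
#if 0	/* example server-side code, not built as part of this file */
static void example_ioctl_first_pass(uint64_t unique, uint64_t arg)
{
	/* Ask the kernel to copy in struct a itself and call us again. */
	struct fuse_ioctl_out out = {
		.result  = 0,
		.flags   = FUSE_IOCTL_RETRY,
		.in_iovs = 1,
	};
	struct fuse_ioctl_iovec iov = {
		.base = arg,			/* userspace address of struct a */
		.len  = sizeof(struct a),
	};

	/* Reply payload: fuse_ioctl_out followed by the iovec array. */
	send_reply(unique, &out, sizeof(out), &iov, sizeof(iov));
}
#endif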
59efec7b 2871
b18da0c5
MS
2872long fuse_ioctl_common(struct file *file, unsigned int cmd,
2873 unsigned long arg, unsigned int flags)
d36f2487 2874{
6131ffaa 2875 struct inode *inode = file_inode(file);
d36f2487
MS
2876 struct fuse_conn *fc = get_fuse_conn(inode);
2877
c2132c1b 2878 if (!fuse_allow_current_process(fc))
d36f2487
MS
2879 return -EACCES;
2880
2881 if (is_bad_inode(inode))
2882 return -EIO;
2883
2884 return fuse_do_ioctl(file, cmd, arg, flags);
2885}
2886
59efec7b
TH
2887static long fuse_file_ioctl(struct file *file, unsigned int cmd,
2888 unsigned long arg)
2889{
b18da0c5 2890 return fuse_ioctl_common(file, cmd, arg, 0);
59efec7b
TH
2891}
2892
2893static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
2894 unsigned long arg)
2895{
b18da0c5 2896 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
59efec7b
TH
2897}
2898
95668a69
TH
2899/*
2900 * All files which have been polled are linked to the RB tree
2901 * fuse_conn->polled_files which is indexed by kh. Walk the tree and
2902 * find the matching one.
2903 */
2904static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
2905 struct rb_node **parent_out)
2906{
2907 struct rb_node **link = &fc->polled_files.rb_node;
2908 struct rb_node *last = NULL;
2909
2910 while (*link) {
2911 struct fuse_file *ff;
2912
2913 last = *link;
2914 ff = rb_entry(last, struct fuse_file, polled_node);
2915
2916 if (kh < ff->kh)
2917 link = &last->rb_left;
2918 else if (kh > ff->kh)
2919 link = &last->rb_right;
2920 else
2921 return link;
2922 }
2923
2924 if (parent_out)
2925 *parent_out = last;
2926 return link;
2927}
2928
2929/*
2930 * The file is about to be polled. Make sure it's on the polled_files
2931 * RB tree. Note that files once added to the polled_files tree are
2932 * not removed before the file is released. This is because a file
2933 * polled once is likely to be polled again.
2934 */
2935static void fuse_register_polled_file(struct fuse_conn *fc,
2936 struct fuse_file *ff)
2937{
2938 spin_lock(&fc->lock);
2939 if (RB_EMPTY_NODE(&ff->polled_node)) {
f3846266 2940 struct rb_node **link, *uninitialized_var(parent);
95668a69
TH
2941
2942 link = fuse_find_polled_node(fc, ff->kh, &parent);
2943 BUG_ON(*link);
2944 rb_link_node(&ff->polled_node, parent, link);
2945 rb_insert_color(&ff->polled_node, &fc->polled_files);
2946 }
2947 spin_unlock(&fc->lock);
2948}
2949
076ccb76 2950__poll_t fuse_file_poll(struct file *file, poll_table *wait)
95668a69 2951{
95668a69 2952 struct fuse_file *ff = file->private_data;
797759aa 2953 struct fuse_conn *fc = ff->fc;
95668a69
TH
2954 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
2955 struct fuse_poll_out outarg;
7078187a 2956 FUSE_ARGS(args);
95668a69
TH
2957 int err;
2958
2959 if (fc->no_poll)
2960 return DEFAULT_POLLMASK;
2961
2962 poll_wait(file, &ff->poll_wait, wait);
c71d227f 2963 inarg.events = mangle_poll(poll_requested_events(wait));
95668a69
TH
2964
2965 /*
2966 * Ask for notification iff there's someone waiting for it.
2967 * The client may ignore the flag and always notify.
2968 */
2969 if (waitqueue_active(&ff->poll_wait)) {
2970 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
2971 fuse_register_polled_file(fc, ff);
2972 }
2973
d5b48543
MS
2974 args.opcode = FUSE_POLL;
2975 args.nodeid = ff->nodeid;
2976 args.in_numargs = 1;
2977 args.in_args[0].size = sizeof(inarg);
2978 args.in_args[0].value = &inarg;
2979 args.out_numargs = 1;
2980 args.out_args[0].size = sizeof(outarg);
2981 args.out_args[0].value = &outarg;
7078187a 2982 err = fuse_simple_request(fc, &args);
95668a69
TH
2983
2984 if (!err)
c71d227f 2985 return demangle_poll(outarg.revents);
95668a69
TH
2986 if (err == -ENOSYS) {
2987 fc->no_poll = 1;
2988 return DEFAULT_POLLMASK;
2989 }
a9a08845 2990 return EPOLLERR;
95668a69 2991}
08cbf542 2992EXPORT_SYMBOL_GPL(fuse_file_poll);
95668a69
TH
2993
2994/*
2995 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
2996 * wakes up the poll waiters.
2997 */
2998int fuse_notify_poll_wakeup(struct fuse_conn *fc,
2999 struct fuse_notify_poll_wakeup_out *outarg)
3000{
3001 u64 kh = outarg->kh;
3002 struct rb_node **link;
3003
3004 spin_lock(&fc->lock);
3005
3006 link = fuse_find_polled_node(fc, kh, NULL);
3007 if (*link) {
3008 struct fuse_file *ff;
3009
3010 ff = rb_entry(*link, struct fuse_file, polled_node);
3011 wake_up_interruptible_sync(&ff->poll_wait);
3012 }
3013
3014 spin_unlock(&fc->lock);
3015 return 0;
3016}
3017
efb9fa9e
MP
3018static void fuse_do_truncate(struct file *file)
3019{
3020 struct inode *inode = file->f_mapping->host;
3021 struct iattr attr;
3022
3023 attr.ia_valid = ATTR_SIZE;
3024 attr.ia_size = i_size_read(inode);
3025
3026 attr.ia_file = file;
3027 attr.ia_valid |= ATTR_FILE;
3028
62490330 3029 fuse_do_setattr(file_dentry(file), &attr, file);
efb9fa9e
MP
3030}
3031
5da784cc 3032static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
e5c5f05d 3033{
5da784cc 3034 return round_up(off, fc->max_pages << PAGE_SHIFT);
e5c5f05d
MP
3035}
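/*
 * Worked example (a sketch, assuming PAGE_SHIFT == 12 and
 * fc->max_pages == 256): fc->max_pages << PAGE_SHIFT is 1 MiB, so e.g.
 * fuse_round_up(fc, 300 << 10) == 1 MiB. The short-read optimization in
 * fuse_direct_IO() below truncates reads near EOF to such a rounded
 * length so that requests stay aligned to the maximal request size.
 */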
3036
4273b793 3037static ssize_t
c8b8e32d 3038fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
4273b793 3039{
9d5722b7 3040 DECLARE_COMPLETION_ONSTACK(wait);
4273b793 3041 ssize_t ret = 0;
60b9df7a
MS
3042 struct file *file = iocb->ki_filp;
3043 struct fuse_file *ff = file->private_data;
e5c5f05d 3044 bool async_dio = ff->fc->async_dio;
4273b793 3045 loff_t pos = 0;
bcba24cc
MP
3046 struct inode *inode;
3047 loff_t i_size;
a6cbcd4a 3048 size_t count = iov_iter_count(iter);
c8b8e32d 3049 loff_t offset = iocb->ki_pos;
36cf66ed 3050 struct fuse_io_priv *io;
4273b793 3051
4273b793 3052 pos = offset;
bcba24cc
MP
3053 inode = file->f_mapping->host;
3054 i_size = i_size_read(inode);
4273b793 3055
6f673763 3056 if ((iov_iter_rw(iter) == READ) && (offset > i_size))
9fe55eea
SW
3057 return 0;
3058
439ee5f0 3059 /* optimization for short read */
6f673763 3060 if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
439ee5f0
MP
3061 if (offset >= i_size)
3062 return 0;
5da784cc 3063 iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
6b775b18 3064 count = iov_iter_count(iter);
439ee5f0
MP
3065 }
3066
bcba24cc 3067 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
36cf66ed
MP
3068 if (!io)
3069 return -ENOMEM;
bcba24cc 3070 spin_lock_init(&io->lock);
744742d6 3071 kref_init(&io->refcnt);
bcba24cc
MP
3072 io->reqs = 1;
3073 io->bytes = -1;
3074 io->size = 0;
3075 io->offset = offset;
6f673763 3076 io->write = (iov_iter_rw(iter) == WRITE);
bcba24cc 3077 io->err = 0;
bcba24cc
MP
3078 /*
3079 * By default, we want to optimize all I/Os with async request
60b9df7a 3080 * submission to the client filesystem if supported.
bcba24cc 3081 */
e5c5f05d 3082 io->async = async_dio;
bcba24cc 3083 io->iocb = iocb;
7879c4e5 3084 io->blocking = is_sync_kiocb(iocb);
bcba24cc
MP
3085
3086 /*
7879c4e5
AS
3087 * We cannot asynchronously extend the size of a file.
3088	 * In such a case the aio will behave exactly like sync io.
bcba24cc 3089 */
7879c4e5
AS
3090 if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
3091 io->blocking = true;
4273b793 3092
7879c4e5 3093 if (io->async && io->blocking) {
744742d6
SF
3094 /*
3095 * Additional reference to keep io around after
3096 * calling fuse_aio_complete()
3097 */
3098 kref_get(&io->refcnt);
9d5722b7 3099 io->done = &wait;
744742d6 3100 }
9d5722b7 3101
6f673763 3102 if (iov_iter_rw(iter) == WRITE) {
6b775b18 3103 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
812408fb
AV
3104 fuse_invalidate_attr(inode);
3105 } else {
d22a943f 3106 ret = __fuse_direct_read(io, iter, &pos);
812408fb 3107 }
36cf66ed 3108
bcba24cc 3109 if (io->async) {
ebacb812
LC
3110 bool blocking = io->blocking;
3111
bcba24cc
MP
3112 fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
3113
3114 /* we have a non-extending, async request, so return */
ebacb812 3115 if (!blocking)
bcba24cc
MP
3116 return -EIOCBQUEUED;
3117
9d5722b7
CH
3118 wait_for_completion(&wait);
3119 ret = fuse_get_res_by_io(io);
bcba24cc
MP
3120 }
3121
744742d6 3122 kref_put(&io->refcnt, fuse_io_release);
9d5722b7 3123
6f673763 3124 if (iov_iter_rw(iter) == WRITE) {
efb9fa9e
MP
3125 if (ret > 0)
3126 fuse_write_update_size(inode, pos);
3127 else if (ret < 0 && offset + count > i_size)
3128 fuse_do_truncate(file);
3129 }
4273b793
AA
3130
3131 return ret;
3132}
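/*
 * Decision sketch for the async path above: with fc->async_dio set and
 * an async kiocb, a direct write that stays strictly inside i_size
 * completes via fuse_aio_complete() and returns -EIOCBQUEUED, while the
 * same write extending past i_size forces io->blocking = true and
 * behaves exactly like synchronous I/O.
 */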
3133
26eb3bae
MS
3134static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
3135{
3136 int err = filemap_write_and_wait_range(inode->i_mapping, start, end);
3137
3138 if (!err)
3139 fuse_sync_writes(inode);
3140
3141 return err;
3142}
3143
cdadb11c
MS
3144static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3145 loff_t length)
05ba1f08
AP
3146{
3147 struct fuse_file *ff = file->private_data;
1c68271c 3148 struct inode *inode = file_inode(file);
0ab08f57 3149 struct fuse_inode *fi = get_fuse_inode(inode);
05ba1f08 3150 struct fuse_conn *fc = ff->fc;
7078187a 3151 FUSE_ARGS(args);
05ba1f08
AP
3152 struct fuse_fallocate_in inarg = {
3153 .fh = ff->fh,
3154 .offset = offset,
3155 .length = length,
3156 .mode = mode
3157 };
3158 int err;
14c14414
MP
3159 bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
3160 (mode & FALLOC_FL_PUNCH_HOLE);
05ba1f08 3161
4adb8302
MS
3162 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
3163 return -EOPNOTSUPP;
3164
519c6040
MS
3165 if (fc->no_fallocate)
3166 return -EOPNOTSUPP;
3167
14c14414 3168 if (lock_inode) {
5955102c 3169 inode_lock(inode);
bde52788
MP
3170 if (mode & FALLOC_FL_PUNCH_HOLE) {
3171 loff_t endbyte = offset + length - 1;
26eb3bae
MS
3172
3173 err = fuse_writeback_range(inode, offset, endbyte);
bde52788
MP
3174 if (err)
3175 goto out;
bde52788 3176 }
3634a632
BF
3177 }
3178
0cbade02
LB
3179 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
3180 offset + length > i_size_read(inode)) {
3181 err = inode_newsize_ok(inode, offset + length);
3182 if (err)
35d6fcbb 3183 goto out;
0cbade02
LB
3184 }
3185
0ab08f57
MP
3186 if (!(mode & FALLOC_FL_KEEP_SIZE))
3187 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3188
d5b48543
MS
3189 args.opcode = FUSE_FALLOCATE;
3190 args.nodeid = ff->nodeid;
3191 args.in_numargs = 1;
3192 args.in_args[0].size = sizeof(inarg);
3193 args.in_args[0].value = &inarg;
7078187a 3194 err = fuse_simple_request(fc, &args);
519c6040
MS
3195 if (err == -ENOSYS) {
3196 fc->no_fallocate = 1;
3197 err = -EOPNOTSUPP;
3198 }
bee6c307
BF
3199 if (err)
3200 goto out;
3201
3202 /* we could have extended the file */
b0aa7606
MP
3203 if (!(mode & FALLOC_FL_KEEP_SIZE)) {
3204 bool changed = fuse_write_update_size(inode, offset + length);
3205
93d2269d
MS
3206 if (changed && fc->writeback_cache)
3207 file_update_time(file);
b0aa7606 3208 }
bee6c307
BF
3209
3210 if (mode & FALLOC_FL_PUNCH_HOLE)
3211 truncate_pagecache_range(inode, offset, offset + length - 1);
3212
3213 fuse_invalidate_attr(inode);
3214
3634a632 3215out:
0ab08f57
MP
3216 if (!(mode & FALLOC_FL_KEEP_SIZE))
3217 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3218
bde52788 3219 if (lock_inode)
5955102c 3220 inode_unlock(inode);
3634a632 3221
05ba1f08
AP
3222 return err;
3223}
05ba1f08 3224
64bf5ff5
DC
3225static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3226 struct file *file_out, loff_t pos_out,
3227 size_t len, unsigned int flags)
88bc7d50
NV
3228{
3229 struct fuse_file *ff_in = file_in->private_data;
3230 struct fuse_file *ff_out = file_out->private_data;
a2bc9236 3231 struct inode *inode_in = file_inode(file_in);
88bc7d50
NV
3232 struct inode *inode_out = file_inode(file_out);
3233 struct fuse_inode *fi_out = get_fuse_inode(inode_out);
3234 struct fuse_conn *fc = ff_in->fc;
3235 FUSE_ARGS(args);
3236 struct fuse_copy_file_range_in inarg = {
3237 .fh_in = ff_in->fh,
3238 .off_in = pos_in,
3239 .nodeid_out = ff_out->nodeid,
3240 .fh_out = ff_out->fh,
3241 .off_out = pos_out,
3242 .len = len,
3243 .flags = flags
3244 };
3245 struct fuse_write_out outarg;
3246 ssize_t err;
3247 /* mark unstable when write-back is not used, and file_out gets
3248 * extended */
3249 bool is_unstable = (!fc->writeback_cache) &&
3250 ((pos_out + len) > inode_out->i_size);
3251
3252 if (fc->no_copy_file_range)
3253 return -EOPNOTSUPP;
3254
5dae222a
AG
3255 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
3256 return -EXDEV;
3257
a2bc9236
MS
3258 if (fc->writeback_cache) {
3259 inode_lock(inode_in);
26eb3bae 3260 err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
a2bc9236
MS
3261 inode_unlock(inode_in);
3262 if (err)
3263 return err;
3264 }
3265
88bc7d50
NV
3266 inode_lock(inode_out);
3267
fe0da9c0
AG
3268 err = file_modified(file_out);
3269 if (err)
3270 goto out;
3271
88bc7d50 3272 if (fc->writeback_cache) {
26eb3bae 3273 err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
88bc7d50
NV
3274 if (err)
3275 goto out;
88bc7d50
NV
3276 }
3277
3278 if (is_unstable)
3279 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3280
d5b48543
MS
3281 args.opcode = FUSE_COPY_FILE_RANGE;
3282 args.nodeid = ff_in->nodeid;
3283 args.in_numargs = 1;
3284 args.in_args[0].size = sizeof(inarg);
3285 args.in_args[0].value = &inarg;
3286 args.out_numargs = 1;
3287 args.out_args[0].size = sizeof(outarg);
3288 args.out_args[0].value = &outarg;
88bc7d50
NV
3289 err = fuse_simple_request(fc, &args);
3290 if (err == -ENOSYS) {
3291 fc->no_copy_file_range = 1;
3292 err = -EOPNOTSUPP;
3293 }
3294 if (err)
3295 goto out;
3296
3297 if (fc->writeback_cache) {
3298 fuse_write_update_size(inode_out, pos_out + outarg.size);
3299 file_update_time(file_out);
3300 }
3301
3302 fuse_invalidate_attr(inode_out);
3303
3304 err = outarg.size;
3305out:
3306 if (is_unstable)
3307 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3308
3309 inode_unlock(inode_out);
fe0da9c0 3310 file_accessed(file_in);
88bc7d50
NV
3311
3312 return err;
3313}
3314
64bf5ff5
DC
3315static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
3316 struct file *dst_file, loff_t dst_off,
3317 size_t len, unsigned int flags)
3318{
3319 ssize_t ret;
3320
3321 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
3322 len, flags);
3323
5dae222a 3324 if (ret == -EOPNOTSUPP || ret == -EXDEV)
64bf5ff5
DC
3325 ret = generic_copy_file_range(src_file, src_off, dst_file,
3326 dst_off, len, flags);
3327 return ret;
3328}
3329
4b6f5d20 3330static const struct file_operations fuse_file_operations = {
5559b8f4 3331 .llseek = fuse_file_llseek,
37c20f16 3332 .read_iter = fuse_file_read_iter,
84c3d55c 3333 .write_iter = fuse_file_write_iter,
b6aeaded
MS
3334 .mmap = fuse_file_mmap,
3335 .open = fuse_open,
3336 .flush = fuse_flush,
3337 .release = fuse_release,
3338 .fsync = fuse_fsync,
71421259 3339 .lock = fuse_file_lock,
a9ff4f87 3340 .flock = fuse_file_flock,
3c3db095
MS
3341 .splice_read = generic_file_splice_read,
3342 .splice_write = iter_file_splice_write,
59efec7b
TH
3343 .unlocked_ioctl = fuse_file_ioctl,
3344 .compat_ioctl = fuse_file_compat_ioctl,
95668a69 3345 .poll = fuse_file_poll,
05ba1f08 3346 .fallocate = fuse_file_fallocate,
d4136d60 3347 .copy_file_range = fuse_copy_file_range,
413ef8cb
MS
3348};
3349
f5e54d6e 3350static const struct address_space_operations fuse_file_aops = {
b6aeaded 3351 .readpage = fuse_readpage,
3be5a52b 3352 .writepage = fuse_writepage,
26d614df 3353 .writepages = fuse_writepages,
3be5a52b 3354 .launder_page = fuse_launder_page,
db50b96c 3355 .readpages = fuse_readpages,
3be5a52b 3356 .set_page_dirty = __set_page_dirty_nobuffers,
b2d2272f 3357 .bmap = fuse_bmap,
4273b793 3358 .direct_IO = fuse_direct_IO,
6b12c1b3
PE
3359 .write_begin = fuse_write_begin,
3360 .write_end = fuse_write_end,
b6aeaded
MS
3361};
3362
3363void fuse_init_file_inode(struct inode *inode)
3364{
ab2257e9
MS
3365 struct fuse_inode *fi = get_fuse_inode(inode);
3366
45323fb7
MS
3367 inode->i_fop = &fuse_file_operations;
3368 inode->i_data.a_ops = &fuse_file_aops;
ab2257e9
MS
3369
3370 INIT_LIST_HEAD(&fi->write_files);
3371 INIT_LIST_HEAD(&fi->queued_writes);
3372 fi->writectr = 0;
3373 init_waitqueue_head(&fi->page_waitq);
3374 INIT_LIST_HEAD(&fi->writepages);
b6aeaded 3375}