Merge tag 'docs-6.4-2' of git://git.lwn.net/linux
[linux-block.git] / fs / xfs / xfs_file.c
CommitLineData
0b61f8a4 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
7b718769
NS
3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
1da177e4 5 */
1da177e4 6#include "xfs.h"
dda35b8f 7#include "xfs_fs.h"
70a9883c 8#include "xfs_shared.h"
a4fbe6ab 9#include "xfs_format.h"
239880ef
DC
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
1da177e4 12#include "xfs_mount.h"
1da177e4 13#include "xfs_inode.h"
239880ef 14#include "xfs_trans.h"
fd3200be 15#include "xfs_inode_item.h"
dda35b8f 16#include "xfs_bmap.h"
c24b5dfa 17#include "xfs_bmap_util.h"
2b9ab5ab 18#include "xfs_dir2.h"
c24b5dfa 19#include "xfs_dir2_priv.h"
ddcd856d 20#include "xfs_ioctl.h"
dda35b8f 21#include "xfs_trace.h"
239880ef 22#include "xfs_log.h"
dc06f398 23#include "xfs_icache.h"
781355c6 24#include "xfs_pnfs.h"
68a9f5e7 25#include "xfs_iomap.h"
0613f16c 26#include "xfs_reflink.h"
1da177e4 27
ea6c49b7 28#include <linux/dax.h>
2fe17c10 29#include <linux/falloc.h>
66114cad 30#include <linux/backing-dev.h>
a39e596b 31#include <linux/mman.h>
40144e49 32#include <linux/fadvise.h>
f736d93d 33#include <linux/mount.h>
1da177e4 34
f0f37e2f 35static const struct vm_operations_struct xfs_file_vm_ops;
1da177e4 36
25219dbf
DW
37/*
38 * Decide if the given file range is aligned to the size of the fundamental
39 * allocation unit for the file.
40 */
41static bool
42xfs_is_falloc_aligned(
43 struct xfs_inode *ip,
44 loff_t pos,
45 long long int len)
46{
47 struct xfs_mount *mp = ip->i_mount;
48 uint64_t mask;
49
50 if (XFS_IS_REALTIME_INODE(ip)) {
51 if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
52 u64 rextbytes;
53 u32 mod;
54
55 rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
56 div_u64_rem(pos, rextbytes, &mod);
57 if (mod)
58 return false;
59 div_u64_rem(len, rextbytes, &mod);
60 return mod == 0;
61 }
62 mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
63 } else {
64 mask = mp->m_sb.sb_blocksize - 1;
65 }
66
67 return !((pos | len) & mask);
68}
69
1da2f2db
CH
70/*
71 * Fsync operations on directories are much simpler than on regular files,
72 * as there is no file data to flush, and thus also no need for explicit
73 * cache flush operations, and there are no non-transaction metadata updates
74 * on directories either.
75 */
76STATIC int
77xfs_dir_fsync(
78 struct file *file,
79 loff_t start,
80 loff_t end,
81 int datasync)
82{
83 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
1da2f2db
CH
84
85 trace_xfs_dir_fsync(ip);
54fbdd10 86 return xfs_log_force_inode(ip);
1da2f2db
CH
87}
88
5f9b4b0d
DC
89static xfs_csn_t
90xfs_fsync_seq(
f22c7f87
CH
91 struct xfs_inode *ip,
92 bool datasync)
93{
94 if (!xfs_ipincount(ip))
95 return 0;
96 if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
97 return 0;
5f9b4b0d 98 return ip->i_itemp->ili_commit_seq;
f22c7f87
CH
99}
100
101/*
102 * All metadata updates are logged, which means that we just have to flush the
103 * log up to the latest LSN that touched the inode.
104 *
105 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
106 * the log force before we clear the ili_fsync_fields field. This ensures that
107 * we don't get a racing sync operation that does not wait for the metadata to
108 * hit the journal before returning. If we race with clearing ili_fsync_fields,
109 * then all that will happen is the log force will do nothing as the lsn will
110 * already be on disk. We can't race with setting ili_fsync_fields because that
111 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
112 * shared until after the ili_fsync_fields is cleared.
113 */
114static int
115xfs_fsync_flush_log(
116 struct xfs_inode *ip,
117 bool datasync,
118 int *log_flushed)
119{
120 int error = 0;
5f9b4b0d 121 xfs_csn_t seq;
f22c7f87
CH
122
123 xfs_ilock(ip, XFS_ILOCK_SHARED);
5f9b4b0d
DC
124 seq = xfs_fsync_seq(ip, datasync);
125 if (seq) {
126 error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
f22c7f87
CH
127 log_flushed);
128
129 spin_lock(&ip->i_itemp->ili_lock);
130 ip->i_itemp->ili_fsync_fields = 0;
131 spin_unlock(&ip->i_itemp->ili_lock);
132 }
133 xfs_iunlock(ip, XFS_ILOCK_SHARED);
134 return error;
135}
136
fd3200be
CH
137STATIC int
138xfs_file_fsync(
139 struct file *file,
02c24a82
JB
140 loff_t start,
141 loff_t end,
fd3200be
CH
142 int datasync)
143{
f22c7f87 144 struct xfs_inode *ip = XFS_I(file->f_mapping->host);
a27a263b 145 struct xfs_mount *mp = ip->i_mount;
7d839e32 146 int error, err2;
fd3200be
CH
147 int log_flushed = 0;
148
cca28fb8 149 trace_xfs_file_fsync(ip);
fd3200be 150
1b180274 151 error = file_write_and_wait_range(file, start, end);
02c24a82
JB
152 if (error)
153 return error;
154
75c8c50f 155 if (xfs_is_shutdown(mp))
b474c7ae 156 return -EIO;
fd3200be
CH
157
158 xfs_iflags_clear(ip, XFS_ITRUNCATED);
159
2291dab2
DC
160 /*
161 * If we have an RT and/or log subvolume we need to make sure to flush
162 * the write cache the device used for file data first. This is to
163 * ensure newly written file data make it to disk before logging the new
164 * inode size in case of an extending write.
165 */
166 if (XFS_IS_REALTIME_INODE(ip))
7d839e32 167 error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
2291dab2 168 else if (mp->m_logdev_targp != mp->m_ddev_targp)
7d839e32 169 error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
a27a263b 170
fd3200be 171 /*
ae29e422 172 * Any inode that has dirty modifications in the log is pinned. The
7d839e32 173 * racy check here for a pinned inode will not catch modifications
ae29e422
CH
174 * that happen concurrently to the fsync call, but fsync semantics
175 * only require to sync previously completed I/O.
fd3200be 176 */
7d839e32
DW
177 if (xfs_ipincount(ip)) {
178 err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
179 if (err2 && !error)
180 error = err2;
181 }
b1037058 182
a27a263b
CH
183 /*
184 * If we only have a single device, and the log force about was
185 * a no-op we might have to flush the data device cache here.
186 * This can only happen for fdatasync/O_DSYNC if we were overwriting
187 * an already allocated file and thus do not have any metadata to
188 * commit.
189 */
2291dab2 190 if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
7d839e32
DW
191 mp->m_logdev_targp == mp->m_ddev_targp) {
192 err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
193 if (err2 && !error)
194 error = err2;
195 }
fd3200be 196
2451337d 197 return error;
fd3200be
CH
198}
199
f50b8f47
CH
200static int
201xfs_ilock_iocb(
202 struct kiocb *iocb,
203 unsigned int lock_mode)
204{
205 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
206
207 if (iocb->ki_flags & IOCB_NOWAIT) {
208 if (!xfs_ilock_nowait(ip, lock_mode))
209 return -EAGAIN;
210 } else {
211 xfs_ilock(ip, lock_mode);
212 }
213
214 return 0;
215}
216
00258e36 217STATIC ssize_t
ee1b218b 218xfs_file_dio_read(
dda35b8f 219 struct kiocb *iocb,
b4f5d2c6 220 struct iov_iter *to)
dda35b8f 221{
acdda3aa 222 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
acdda3aa 223 ssize_t ret;
dda35b8f 224
3e40b13c 225 trace_xfs_file_direct_read(iocb, to);
dda35b8f 226
3e40b13c 227 if (!iov_iter_count(to))
f1285ff0 228 return 0; /* skip atime */
dda35b8f 229
a447d7cd
CH
230 file_accessed(iocb->ki_filp);
231
f50b8f47
CH
232 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
233 if (ret)
234 return ret;
786f847f 235 ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
65523218 236 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
acdda3aa 237
16d4d435
CH
238 return ret;
239}
240
f021bd07 241static noinline ssize_t
16d4d435
CH
242xfs_file_dax_read(
243 struct kiocb *iocb,
244 struct iov_iter *to)
245{
6c31f495 246 struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
16d4d435
CH
247 ssize_t ret = 0;
248
3e40b13c 249 trace_xfs_file_dax_read(iocb, to);
16d4d435 250
3e40b13c 251 if (!iov_iter_count(to))
16d4d435
CH
252 return 0; /* skip atime */
253
f50b8f47
CH
254 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
255 if (ret)
256 return ret;
690c2a38 257 ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
65523218 258 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
bbc5a740 259
f1285ff0 260 file_accessed(iocb->ki_filp);
bbc5a740
CH
261 return ret;
262}
263
264STATIC ssize_t
ee1b218b 265xfs_file_buffered_read(
bbc5a740
CH
266 struct kiocb *iocb,
267 struct iov_iter *to)
268{
269 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
270 ssize_t ret;
271
3e40b13c 272 trace_xfs_file_buffered_read(iocb, to);
dda35b8f 273
f50b8f47
CH
274 ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
275 if (ret)
276 return ret;
b4f5d2c6 277 ret = generic_file_read_iter(iocb, to);
65523218 278 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
bbc5a740
CH
279
280 return ret;
281}
282
283STATIC ssize_t
284xfs_file_read_iter(
285 struct kiocb *iocb,
286 struct iov_iter *to)
287{
16d4d435
CH
288 struct inode *inode = file_inode(iocb->ki_filp);
289 struct xfs_mount *mp = XFS_I(inode)->i_mount;
bbc5a740
CH
290 ssize_t ret = 0;
291
292 XFS_STATS_INC(mp, xs_read_calls);
293
75c8c50f 294 if (xfs_is_shutdown(mp))
bbc5a740
CH
295 return -EIO;
296
16d4d435
CH
297 if (IS_DAX(inode))
298 ret = xfs_file_dax_read(iocb, to);
299 else if (iocb->ki_flags & IOCB_DIRECT)
ee1b218b 300 ret = xfs_file_dio_read(iocb, to);
3176c3e0 301 else
ee1b218b 302 ret = xfs_file_buffered_read(iocb, to);
dda35b8f 303
dda35b8f 304 if (ret > 0)
ff6d6af2 305 XFS_STATS_ADD(mp, xs_read_bytes, ret);
dda35b8f
CH
306 return ret;
307}
308
4d8d1581
DC
309/*
310 * Common pre-write limit and setup checks.
311 *
5bf1f262
CH
312 * Called with the iolocked held either shared and exclusive according to
313 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
314 * if called for a direct write beyond i_size.
4d8d1581
DC
315 */
316STATIC ssize_t
ee1b218b 317xfs_file_write_checks(
99733fa3
AV
318 struct kiocb *iocb,
319 struct iov_iter *from,
a1033753 320 unsigned int *iolock)
4d8d1581 321{
99733fa3 322 struct file *file = iocb->ki_filp;
4d8d1581
DC
323 struct inode *inode = file->f_mapping->host;
324 struct xfs_inode *ip = XFS_I(inode);
3309dd04 325 ssize_t error = 0;
99733fa3 326 size_t count = iov_iter_count(from);
3136e8bb 327 bool drained_dio = false;
f5c54717 328 loff_t isize;
4d8d1581 329
7271d243 330restart:
3309dd04
AV
331 error = generic_write_checks(iocb, from);
332 if (error <= 0)
4d8d1581 333 return error;
4d8d1581 334
354be7e3
CH
335 if (iocb->ki_flags & IOCB_NOWAIT) {
336 error = break_layout(inode, false);
337 if (error == -EWOULDBLOCK)
338 error = -EAGAIN;
339 } else {
340 error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
341 }
342
781355c6
CH
343 if (error)
344 return error;
345
65523218
CH
346 /*
347 * For changing security info in file_remove_privs() we need i_rwsem
348 * exclusively.
349 */
a6de82ca 350 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
65523218 351 xfs_iunlock(ip, *iolock);
a6de82ca 352 *iolock = XFS_IOLOCK_EXCL;
354be7e3
CH
353 error = xfs_ilock_iocb(iocb, *iolock);
354 if (error) {
355 *iolock = 0;
356 return error;
357 }
a6de82ca
JK
358 goto restart;
359 }
977ec4dd 360
4d8d1581
DC
361 /*
362 * If the offset is beyond the size of the file, we need to zero any
363 * blocks that fall between the existing EOF and the start of this
977ec4dd
DC
364 * write. If zeroing is needed and we are currently holding the iolock
365 * shared, we need to update it to exclusive which implies having to
366 * redo all checks before.
367 *
368 * We need to serialise against EOF updates that occur in IO completions
369 * here. We want to make sure that nobody is changing the size while we
370 * do this check until we have placed an IO barrier (i.e. hold the
371 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
372 * spinlock effectively forms a memory barrier once we have the
373 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
374 * hence be able to correctly determine if we need to run zeroing.
b9d59846 375 *
977ec4dd
DC
376 * We can do an unlocked check here safely as IO completion can only
377 * extend EOF. Truncate is locked out at this point, so the EOF can
378 * not move backwards, only forwards. Hence we only need to take the
379 * slow path and spin locks when we are at or beyond the current EOF.
4d8d1581 380 */
977ec4dd
DC
381 if (iocb->ki_pos <= i_size_read(inode))
382 goto out;
383
b9d59846 384 spin_lock(&ip->i_flags_lock);
f5c54717
CH
385 isize = i_size_read(inode);
386 if (iocb->ki_pos > isize) {
b9d59846 387 spin_unlock(&ip->i_flags_lock);
354be7e3
CH
388
389 if (iocb->ki_flags & IOCB_NOWAIT)
390 return -EAGAIN;
391
3136e8bb
BF
392 if (!drained_dio) {
393 if (*iolock == XFS_IOLOCK_SHARED) {
65523218 394 xfs_iunlock(ip, *iolock);
3136e8bb 395 *iolock = XFS_IOLOCK_EXCL;
65523218 396 xfs_ilock(ip, *iolock);
3136e8bb
BF
397 iov_iter_reexpand(from, count);
398 }
40c63fbc
DC
399 /*
400 * We now have an IO submission barrier in place, but
401 * AIO can do EOF updates during IO completion and hence
402 * we now need to wait for all of them to drain. Non-AIO
403 * DIO will have drained before we are given the
404 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
405 * no-op.
406 */
407 inode_dio_wait(inode);
3136e8bb 408 drained_dio = true;
7271d243
DC
409 goto restart;
410 }
977ec4dd 411
f5c54717 412 trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
f1ba5faf 413 error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
467f7899
CH
414 if (error)
415 return error;
b9d59846
DC
416 } else
417 spin_unlock(&ip->i_flags_lock);
4d8d1581 418
977ec4dd 419out:
1aa91d9c 420 return kiocb_modified(iocb);
4d8d1581
DC
421}
422
acdda3aa
CH
423static int
424xfs_dio_write_end_io(
425 struct kiocb *iocb,
426 ssize_t size,
6fe7b990 427 int error,
acdda3aa
CH
428 unsigned flags)
429{
430 struct inode *inode = file_inode(iocb->ki_filp);
431 struct xfs_inode *ip = XFS_I(inode);
432 loff_t offset = iocb->ki_pos;
73d30d48 433 unsigned int nofs_flag;
acdda3aa
CH
434
435 trace_xfs_end_io_direct_write(ip, offset, size);
436
75c8c50f 437 if (xfs_is_shutdown(ip->i_mount))
acdda3aa
CH
438 return -EIO;
439
6fe7b990
MB
440 if (error)
441 return error;
442 if (!size)
443 return 0;
acdda3aa 444
ed5c3e66
DC
445 /*
446 * Capture amount written on completion as we can't reliably account
447 * for it on submission.
448 */
449 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
450
73d30d48
CH
451 /*
452 * We can allocate memory here while doing writeback on behalf of
453 * memory reclaim. To avoid memory allocation deadlocks set the
454 * task-wide nofs context for the following operations.
455 */
456 nofs_flag = memalloc_nofs_save();
457
ee70daab
EG
458 if (flags & IOMAP_DIO_COW) {
459 error = xfs_reflink_end_cow(ip, offset, size);
460 if (error)
73d30d48 461 goto out;
ee70daab
EG
462 }
463
464 /*
465 * Unwritten conversion updates the in-core isize after extent
466 * conversion but before updating the on-disk size. Updating isize any
467 * earlier allows a racing dio read to find unwritten extents before
468 * they are converted.
469 */
73d30d48
CH
470 if (flags & IOMAP_DIO_UNWRITTEN) {
471 error = xfs_iomap_write_unwritten(ip, offset, size, true);
472 goto out;
473 }
ee70daab 474
acdda3aa
CH
475 /*
476 * We need to update the in-core inode size here so that we don't end up
477 * with the on-disk inode size being outside the in-core inode size. We
478 * have no other method of updating EOF for AIO, so always do it here
479 * if necessary.
480 *
481 * We need to lock the test/set EOF update as we can be racing with
482 * other IO completions here to update the EOF. Failing to serialise
483 * here can result in EOF moving backwards and Bad Things Happen when
484 * that occurs.
977ec4dd
DC
485 *
486 * As IO completion only ever extends EOF, we can do an unlocked check
487 * here to avoid taking the spinlock. If we land within the current EOF,
488 * then we do not need to do an extending update at all, and we don't
489 * need to take the lock to check this. If we race with an update moving
490 * EOF, then we'll either still be beyond EOF and need to take the lock,
491 * or we'll be within EOF and we don't need to take it at all.
acdda3aa 492 */
977ec4dd
DC
493 if (offset + size <= i_size_read(inode))
494 goto out;
495
acdda3aa
CH
496 spin_lock(&ip->i_flags_lock);
497 if (offset + size > i_size_read(inode)) {
498 i_size_write(inode, offset + size);
ee70daab 499 spin_unlock(&ip->i_flags_lock);
acdda3aa 500 error = xfs_setfilesize(ip, offset, size);
ee70daab
EG
501 } else {
502 spin_unlock(&ip->i_flags_lock);
503 }
acdda3aa 504
73d30d48
CH
505out:
506 memalloc_nofs_restore(nofs_flag);
acdda3aa
CH
507 return error;
508}
509
838c4f3d
CH
510static const struct iomap_dio_ops xfs_dio_write_ops = {
511 .end_io = xfs_dio_write_end_io,
512};
513
f0d26e86 514/*
caa89dbc 515 * Handle block aligned direct I/O writes
f0d26e86 516 */
caa89dbc
DC
517static noinline ssize_t
518xfs_file_dio_write_aligned(
519 struct xfs_inode *ip,
f0d26e86 520 struct kiocb *iocb,
b3188919 521 struct iov_iter *from)
f0d26e86 522{
a1033753 523 unsigned int iolock = XFS_IOLOCK_SHARED;
caa89dbc 524 ssize_t ret;
f0d26e86 525
caa89dbc
DC
526 ret = xfs_ilock_iocb(iocb, iolock);
527 if (ret)
528 return ret;
529 ret = xfs_file_write_checks(iocb, from, &iolock);
530 if (ret)
531 goto out_unlock;
f0d26e86 532
7271d243 533 /*
caa89dbc
DC
534 * We don't need to hold the IOLOCK exclusively across the IO, so demote
535 * the iolock back to shared if we had to take the exclusive lock in
536 * xfs_file_write_checks() for other reasons.
7271d243 537 */
caa89dbc
DC
538 if (iolock == XFS_IOLOCK_EXCL) {
539 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
d0606464 540 iolock = XFS_IOLOCK_SHARED;
c58cb165 541 }
caa89dbc
DC
542 trace_xfs_file_direct_write(iocb, from);
543 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
786f847f 544 &xfs_dio_write_ops, 0, NULL, 0);
caa89dbc
DC
545out_unlock:
546 if (iolock)
547 xfs_iunlock(ip, iolock);
548 return ret;
549}
f0d26e86 550
caa89dbc
DC
551/*
552 * Handle block unaligned direct I/O writes
553 *
554 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
555 * them to be done in parallel with reads and other direct I/O writes. However,
556 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
557 * to do sub-block zeroing and that requires serialisation against other direct
558 * I/O to the same block. In this case we need to serialise the submission of
559 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
ed1128c2
DC
560 * In the case where sub-block zeroing is not required, we can do concurrent
561 * sub-block dios to the same block successfully.
caa89dbc 562 *
ed1128c2
DC
563 * Optimistically submit the I/O using the shared lock first, but use the
564 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
565 * if block allocation or partial block zeroing would be required. In that case
566 * we try again with the exclusive lock.
caa89dbc
DC
567 */
568static noinline ssize_t
569xfs_file_dio_write_unaligned(
570 struct xfs_inode *ip,
571 struct kiocb *iocb,
572 struct iov_iter *from)
573{
ed1128c2
DC
574 size_t isize = i_size_read(VFS_I(ip));
575 size_t count = iov_iter_count(from);
a1033753 576 unsigned int iolock = XFS_IOLOCK_SHARED;
ed1128c2 577 unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
caa89dbc
DC
578 ssize_t ret;
579
ed1128c2
DC
580 /*
581 * Extending writes need exclusivity because of the sub-block zeroing
582 * that the DIO code always does for partial tail blocks beyond EOF, so
583 * don't even bother trying the fast path in this case.
584 */
585 if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
ed1128c2
DC
586 if (iocb->ki_flags & IOCB_NOWAIT)
587 return -EAGAIN;
93e6aa43 588retry_exclusive:
ed1128c2
DC
589 iolock = XFS_IOLOCK_EXCL;
590 flags = IOMAP_DIO_FORCE_WAIT;
591 }
592
593 ret = xfs_ilock_iocb(iocb, iolock);
594 if (ret)
595 return ret;
caa89dbc
DC
596
597 /*
598 * We can't properly handle unaligned direct I/O to reflink files yet,
599 * as we can't unshare a partial block.
600 */
601 if (xfs_is_cow_inode(ip)) {
602 trace_xfs_reflink_bounce_dio_write(iocb, from);
603 ret = -ENOTBLK;
604 goto out_unlock;
29a5d29e 605 }
0ee7a3f6 606
ee1b218b 607 ret = xfs_file_write_checks(iocb, from, &iolock);
4d8d1581 608 if (ret)
caa89dbc 609 goto out_unlock;
f0d26e86 610
eda77982 611 /*
ed1128c2
DC
612 * If we are doing exclusive unaligned I/O, this must be the only I/O
613 * in-flight. Otherwise we risk data corruption due to unwritten extent
614 * conversions from the AIO end_io handler. Wait for all other I/O to
615 * drain first.
eda77982 616 */
ed1128c2
DC
617 if (flags & IOMAP_DIO_FORCE_WAIT)
618 inode_dio_wait(VFS_I(ip));
f0d26e86 619
3e40b13c 620 trace_xfs_file_direct_write(iocb, from);
f150b423 621 ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
786f847f 622 &xfs_dio_write_ops, flags, NULL, 0);
ed1128c2
DC
623
624 /*
625 * Retry unaligned I/O with exclusive blocking semantics if the DIO
626 * layer rejected it for mapping or locking reasons. If we are doing
627 * nonblocking user I/O, propagate the error.
628 */
629 if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
630 ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
631 xfs_iunlock(ip, iolock);
632 goto retry_exclusive;
633 }
634
caa89dbc 635out_unlock:
354be7e3
CH
636 if (iolock)
637 xfs_iunlock(ip, iolock);
16d4d435
CH
638 return ret;
639}
640
caa89dbc
DC
641static ssize_t
642xfs_file_dio_write(
643 struct kiocb *iocb,
644 struct iov_iter *from)
645{
646 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
647 struct xfs_buftarg *target = xfs_inode_buftarg(ip);
648 size_t count = iov_iter_count(from);
649
650 /* direct I/O must be aligned to device logical sector size */
651 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
652 return -EINVAL;
653 if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
654 return xfs_file_dio_write_unaligned(ip, iocb, from);
655 return xfs_file_dio_write_aligned(ip, iocb, from);
656}
657
f021bd07 658static noinline ssize_t
16d4d435
CH
659xfs_file_dax_write(
660 struct kiocb *iocb,
661 struct iov_iter *from)
662{
6c31f495 663 struct inode *inode = iocb->ki_filp->f_mapping->host;
16d4d435 664 struct xfs_inode *ip = XFS_I(inode);
a1033753 665 unsigned int iolock = XFS_IOLOCK_EXCL;
6c31f495 666 ssize_t ret, error = 0;
6c31f495 667 loff_t pos;
16d4d435 668
f50b8f47
CH
669 ret = xfs_ilock_iocb(iocb, iolock);
670 if (ret)
671 return ret;
ee1b218b 672 ret = xfs_file_write_checks(iocb, from, &iolock);
16d4d435
CH
673 if (ret)
674 goto out;
675
6c31f495 676 pos = iocb->ki_pos;
8b2180b3 677
3e40b13c 678 trace_xfs_file_dax_write(iocb, from);
ea6c49b7 679 ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
6c31f495
CH
680 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
681 i_size_write(inode, iocb->ki_pos);
682 error = xfs_setfilesize(ip, pos, ret);
16d4d435 683 }
16d4d435 684out:
354be7e3
CH
685 if (iolock)
686 xfs_iunlock(ip, iolock);
ed5c3e66
DC
687 if (error)
688 return error;
689
690 if (ret > 0) {
691 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
692
693 /* Handle various SYNC-type writes */
694 ret = generic_write_sync(iocb, ret);
695 }
696 return ret;
f0d26e86
DC
697}
698
00258e36 699STATIC ssize_t
ee1b218b 700xfs_file_buffered_write(
dda35b8f 701 struct kiocb *iocb,
b3188919 702 struct iov_iter *from)
dda35b8f 703{
2d9ac431 704 struct inode *inode = iocb->ki_filp->f_mapping->host;
00258e36 705 struct xfs_inode *ip = XFS_I(inode);
637bbc75 706 ssize_t ret;
a636b1d1 707 bool cleared_space = false;
a1033753 708 unsigned int iolock;
dda35b8f 709
c3155097
BF
710write_retry:
711 iolock = XFS_IOLOCK_EXCL;
1aa91d9c
SR
712 ret = xfs_ilock_iocb(iocb, iolock);
713 if (ret)
714 return ret;
dda35b8f 715
ee1b218b 716 ret = xfs_file_write_checks(iocb, from, &iolock);
4d8d1581 717 if (ret)
d0606464 718 goto out;
dda35b8f
CH
719
720 /* We can write back this queue in page reclaim */
de1414a6 721 current->backing_dev_info = inode_to_bdi(inode);
dda35b8f 722
3e40b13c 723 trace_xfs_file_buffered_write(iocb, from);
f150b423
CH
724 ret = iomap_file_buffered_write(iocb, from,
725 &xfs_buffered_write_iomap_ops);
0a64bc2c 726 if (likely(ret >= 0))
99733fa3 727 iocb->ki_pos += ret;
dc06f398 728
637bbc75 729 /*
dc06f398
BF
730 * If we hit a space limit, try to free up some lingering preallocated
731 * space before returning an error. In the case of ENOSPC, first try to
732 * write back all dirty inodes to free up some of the excess reserved
733 * metadata space. This reduces the chances that the eofblocks scan
734 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
735 * also behaves as a filter to prevent too many eofblocks scans from
111068f8
DW
736 * running at the same time. Use a synchronous scan to increase the
737 * effectiveness of the scan.
637bbc75 738 */
a636b1d1 739 if (ret == -EDQUOT && !cleared_space) {
c3155097 740 xfs_iunlock(ip, iolock);
2d53f66b 741 xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
111068f8
DW
742 cleared_space = true;
743 goto write_retry;
a636b1d1 744 } else if (ret == -ENOSPC && !cleared_space) {
b26b2bf1 745 struct xfs_icwalk icw = {0};
dc06f398 746
a636b1d1 747 cleared_space = true;
9aa05000 748 xfs_flush_inodes(ip->i_mount);
c3155097
BF
749
750 xfs_iunlock(ip, iolock);
b26b2bf1
DW
751 icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
752 xfs_blockgc_free_space(ip->i_mount, &icw);
9aa05000 753 goto write_retry;
dda35b8f 754 }
d0606464 755
dda35b8f 756 current->backing_dev_info = NULL;
d0606464 757out:
c3155097
BF
758 if (iolock)
759 xfs_iunlock(ip, iolock);
ed5c3e66
DC
760
761 if (ret > 0) {
762 XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
763 /* Handle various SYNC-type writes */
764 ret = generic_write_sync(iocb, ret);
765 }
637bbc75
DC
766 return ret;
767}
768
769STATIC ssize_t
bf97f3bc 770xfs_file_write_iter(
637bbc75 771 struct kiocb *iocb,
bf97f3bc 772 struct iov_iter *from)
637bbc75 773{
2d9ac431 774 struct inode *inode = iocb->ki_filp->f_mapping->host;
637bbc75
DC
775 struct xfs_inode *ip = XFS_I(inode);
776 ssize_t ret;
bf97f3bc 777 size_t ocount = iov_iter_count(from);
637bbc75 778
ff6d6af2 779 XFS_STATS_INC(ip->i_mount, xs_write_calls);
637bbc75 780
637bbc75
DC
781 if (ocount == 0)
782 return 0;
783
75c8c50f 784 if (xfs_is_shutdown(ip->i_mount))
bf97f3bc 785 return -EIO;
637bbc75 786
16d4d435 787 if (IS_DAX(inode))
ed5c3e66
DC
788 return xfs_file_dax_write(iocb, from);
789
790 if (iocb->ki_flags & IOCB_DIRECT) {
0613f16c
DW
791 /*
792 * Allow a directio write to fall back to a buffered
793 * write *only* in the case that we're doing a reflink
794 * CoW. In all other directio scenarios we do not
795 * allow an operation to fall back to buffered mode.
796 */
ee1b218b 797 ret = xfs_file_dio_write(iocb, from);
80e543ae 798 if (ret != -ENOTBLK)
ed5c3e66 799 return ret;
0613f16c 800 }
dda35b8f 801
ee1b218b 802 return xfs_file_buffered_write(iocb, from);
dda35b8f
CH
803}
804
d6dc57e2
DW
805static void
806xfs_wait_dax_page(
e25ff835 807 struct inode *inode)
d6dc57e2
DW
808{
809 struct xfs_inode *ip = XFS_I(inode);
810
d6dc57e2
DW
811 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
812 schedule();
813 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
814}
815
13f9e267 816int
d6dc57e2
DW
817xfs_break_dax_layouts(
818 struct inode *inode,
e25ff835 819 bool *retry)
d6dc57e2
DW
820{
821 struct page *page;
822
823 ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
824
825 page = dax_layout_busy_page(inode->i_mapping);
826 if (!page)
827 return 0;
828
e25ff835 829 *retry = true;
d6dc57e2
DW
830 return ___wait_var_event(&page->_refcount,
831 atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
e25ff835 832 0, 0, xfs_wait_dax_page(inode));
d6dc57e2
DW
833}
834
69eb5fa1
DW
835int
836xfs_break_layouts(
837 struct inode *inode,
838 uint *iolock,
839 enum layout_break_reason reason)
840{
841 bool retry;
d6dc57e2 842 int error;
69eb5fa1
DW
843
844 ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
845
d6dc57e2
DW
846 do {
847 retry = false;
848 switch (reason) {
849 case BREAK_UNMAP:
a4722a64 850 error = xfs_break_dax_layouts(inode, &retry);
d6dc57e2
DW
851 if (error || retry)
852 break;
53004ee7 853 fallthrough;
d6dc57e2
DW
854 case BREAK_WRITE:
855 error = xfs_break_leased_layouts(inode, iolock, &retry);
856 break;
857 default:
858 WARN_ON_ONCE(1);
859 error = -EINVAL;
860 }
861 } while (error == 0 && retry);
862
863 return error;
69eb5fa1
DW
864}
865
cea267c2
DC
866/* Does this file, inode, or mount want synchronous writes? */
867static inline bool xfs_file_sync_writes(struct file *filp)
868{
869 struct xfs_inode *ip = XFS_I(file_inode(filp));
870
871 if (xfs_has_wsync(ip->i_mount))
872 return true;
873 if (filp->f_flags & (__O_SYNC | O_DSYNC))
874 return true;
875 if (IS_SYNC(file_inode(filp)))
876 return true;
877
878 return false;
879}
880
/* fallocate() mode bits accepted by xfs_file_fallocate(). */
#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
a904b1ca 885
2fe17c10
CH
886STATIC long
887xfs_file_fallocate(
83aee9e4
CH
888 struct file *file,
889 int mode,
890 loff_t offset,
891 loff_t len)
2fe17c10 892{
83aee9e4
CH
893 struct inode *inode = file_inode(file);
894 struct xfs_inode *ip = XFS_I(inode);
83aee9e4 895 long error;
c63a8eae 896 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
83aee9e4 897 loff_t new_size = 0;
749f24f3 898 bool do_file_insert = false;
2fe17c10 899
83aee9e4
CH
900 if (!S_ISREG(inode->i_mode))
901 return -EINVAL;
a904b1ca 902 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
2fe17c10
CH
903 return -EOPNOTSUPP;
904
781355c6 905 xfs_ilock(ip, iolock);
69eb5fa1 906 error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
781355c6
CH
907 if (error)
908 goto out_unlock;
909
249bd908
DC
910 /*
911 * Must wait for all AIO to complete before we continue as AIO can
912 * change the file size on completion without holding any locks we
913 * currently hold. We must do this first because AIO can update both
914 * the on disk and in memory inode sizes, and the operations that follow
915 * require the in-memory size to be fully up-to-date.
916 */
917 inode_dio_wait(inode);
918
919 /*
920 * Now AIO and DIO has drained we flush and (if necessary) invalidate
921 * the cached range over the first operation we are about to run.
922 *
923 * We care about zero and collapse here because they both run a hole
924 * punch over the range first. Because that can zero data, and the range
925 * of invalidation for the shift operations is much larger, we still do
926 * the required flush for collapse in xfs_prepare_shift().
927 *
928 * Insert has the same range requirements as collapse, and we extend the
929 * file first which can zero data. Hence insert has the same
930 * flush/invalidate requirements as collapse and so they are both
931 * handled at the right time by xfs_prepare_shift().
932 */
933 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
934 FALLOC_FL_COLLAPSE_RANGE)) {
935 error = xfs_flush_unmap_range(ip, offset, len);
936 if (error)
937 goto out_unlock;
938 }
939
fbe7e520
DC
940 error = file_modified(file);
941 if (error)
942 goto out_unlock;
943
83aee9e4
CH
944 if (mode & FALLOC_FL_PUNCH_HOLE) {
945 error = xfs_free_file_space(ip, offset, len);
946 if (error)
947 goto out_unlock;
e1d8fb88 948 } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
25219dbf 949 if (!xfs_is_falloc_aligned(ip, offset, len)) {
2451337d 950 error = -EINVAL;
e1d8fb88
NJ
951 goto out_unlock;
952 }
953
23fffa92
LC
954 /*
955 * There is no need to overlap collapse range with EOF,
956 * in which case it is effectively a truncate operation
957 */
958 if (offset + len >= i_size_read(inode)) {
2451337d 959 error = -EINVAL;
23fffa92
LC
960 goto out_unlock;
961 }
962
e1d8fb88
NJ
963 new_size = i_size_read(inode) - len;
964
965 error = xfs_collapse_file_space(ip, offset, len);
966 if (error)
967 goto out_unlock;
a904b1ca 968 } else if (mode & FALLOC_FL_INSERT_RANGE) {
7d83fb14 969 loff_t isize = i_size_read(inode);
a904b1ca 970
25219dbf 971 if (!xfs_is_falloc_aligned(ip, offset, len)) {
a904b1ca
NJ
972 error = -EINVAL;
973 goto out_unlock;
974 }
975
7d83fb14
DW
976 /*
977 * New inode size must not exceed ->s_maxbytes, accounting for
978 * possible signed overflow.
979 */
980 if (inode->i_sb->s_maxbytes - isize < len) {
a904b1ca
NJ
981 error = -EFBIG;
982 goto out_unlock;
983 }
7d83fb14 984 new_size = isize + len;
a904b1ca
NJ
985
986 /* Offset should be less than i_size */
7d83fb14 987 if (offset >= isize) {
a904b1ca
NJ
988 error = -EINVAL;
989 goto out_unlock;
990 }
749f24f3 991 do_file_insert = true;
83aee9e4
CH
992 } else {
993 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
994 offset + len > i_size_read(inode)) {
995 new_size = offset + len;
2451337d 996 error = inode_newsize_ok(inode, new_size);
83aee9e4
CH
997 if (error)
998 goto out_unlock;
999 }
2fe17c10 1000
66ae56a5 1001 if (mode & FALLOC_FL_ZERO_RANGE) {
360c09c0
CH
1002 /*
1003 * Punch a hole and prealloc the range. We use a hole
1004 * punch rather than unwritten extent conversion for two
1005 * reasons:
1006 *
1007 * 1.) Hole punch handles partial block zeroing for us.
1008 * 2.) If prealloc returns ENOSPC, the file range is
1009 * still zero-valued by virtue of the hole punch.
1010 */
1011 unsigned int blksize = i_blocksize(inode);
1012
1013 trace_xfs_zero_file_space(ip);
1014
1015 error = xfs_free_file_space(ip, offset, len);
1016 if (error)
1017 goto out_unlock;
1018
1019 len = round_up(offset + len, blksize) -
1020 round_down(offset, blksize);
1021 offset = round_down(offset, blksize);
66ae56a5
CH
1022 } else if (mode & FALLOC_FL_UNSHARE_RANGE) {
1023 error = xfs_reflink_unshare(ip, offset, len);
1024 if (error)
1025 goto out_unlock;
66ae56a5
CH
1026 } else {
1027 /*
1028 * If always_cow mode we can't use preallocations and
1029 * thus should not create them.
1030 */
1031 if (xfs_is_always_cow_inode(ip)) {
1032 error = -EOPNOTSUPP;
1033 goto out_unlock;
1034 }
360c09c0 1035 }
66ae56a5 1036
360c09c0 1037 if (!xfs_is_always_cow_inode(ip)) {
4d1b97f9 1038 error = xfs_alloc_file_space(ip, offset, len);
360c09c0
CH
1039 if (error)
1040 goto out_unlock;
98cc2db5 1041 }
fbe7e520 1042 }
2fe17c10
CH
1043
1044 /* Change file size if needed */
1045 if (new_size) {
1046 struct iattr iattr;
1047
1048 iattr.ia_valid = ATTR_SIZE;
1049 iattr.ia_size = new_size;
c1632a0f 1050 error = xfs_vn_setattr_size(file_mnt_idmap(file),
f736d93d 1051 file_dentry(file), &iattr);
a904b1ca
NJ
1052 if (error)
1053 goto out_unlock;
2fe17c10
CH
1054 }
1055
a904b1ca
NJ
1056 /*
1057 * Perform hole insertion now that the file size has been
1058 * updated so that if we crash during the operation we don't
1059 * leave shifted extents past EOF and hence losing access to
1060 * the data that is contained within them.
1061 */
472c6e46 1062 if (do_file_insert) {
a904b1ca 1063 error = xfs_insert_file_space(ip, offset, len);
472c6e46
DC
1064 if (error)
1065 goto out_unlock;
1066 }
1067
cea267c2 1068 if (xfs_file_sync_writes(file))
472c6e46 1069 error = xfs_log_force_inode(ip);
a904b1ca 1070
2fe17c10 1071out_unlock:
781355c6 1072 xfs_iunlock(ip, iolock);
2451337d 1073 return error;
2fe17c10
CH
1074}
1075
40144e49
JK
1076STATIC int
1077xfs_file_fadvise(
1078 struct file *file,
1079 loff_t start,
1080 loff_t end,
1081 int advice)
1082{
1083 struct xfs_inode *ip = XFS_I(file_inode(file));
1084 int ret;
1085 int lockflags = 0;
1086
1087 /*
1088 * Operations creating pages in page cache need protection from hole
1089 * punching and similar ops
1090 */
1091 if (advice == POSIX_FADV_WILLNEED) {
1092 lockflags = XFS_IOLOCK_SHARED;
1093 xfs_ilock(ip, lockflags);
1094 }
1095 ret = generic_fadvise(file, start, end, advice);
1096 if (lockflags)
1097 xfs_iunlock(ip, lockflags);
1098 return ret;
1099}
3fc9f5e4 1100
da034bcc 1101STATIC loff_t
2e5dfc99 1102xfs_file_remap_range(
3fc9f5e4
DW
1103 struct file *file_in,
1104 loff_t pos_in,
1105 struct file *file_out,
1106 loff_t pos_out,
1107 loff_t len,
1108 unsigned int remap_flags)
9fe26045 1109{
3fc9f5e4
DW
1110 struct inode *inode_in = file_inode(file_in);
1111 struct xfs_inode *src = XFS_I(inode_in);
1112 struct inode *inode_out = file_inode(file_out);
1113 struct xfs_inode *dest = XFS_I(inode_out);
1114 struct xfs_mount *mp = src->i_mount;
1115 loff_t remapped = 0;
1116 xfs_extlen_t cowextsize;
1117 int ret;
1118
2e5dfc99
DW
1119 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
1120 return -EINVAL;
cc714660 1121
38c26bfd 1122 if (!xfs_has_reflink(mp))
3fc9f5e4
DW
1123 return -EOPNOTSUPP;
1124
75c8c50f 1125 if (xfs_is_shutdown(mp))
3fc9f5e4
DW
1126 return -EIO;
1127
1128 /* Prepare and then clone file data. */
1129 ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
1130 &len, remap_flags);
451d34ee 1131 if (ret || len == 0)
3fc9f5e4
DW
1132 return ret;
1133
1134 trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1135
1136 ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
1137 &remapped);
1138 if (ret)
1139 goto out_unlock;
1140
1141 /*
1142 * Carry the cowextsize hint from src to dest if we're sharing the
1143 * entire source file to the entire destination file, the source file
1144 * has a cowextsize hint, and the destination file does not.
1145 */
1146 cowextsize = 0;
1147 if (pos_in == 0 && len == i_size_read(inode_in) &&
3e09ab8f 1148 (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
3fc9f5e4 1149 pos_out == 0 && len >= i_size_read(inode_out) &&
3e09ab8f 1150 !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
b33ce57d 1151 cowextsize = src->i_cowextsize;
3fc9f5e4
DW
1152
1153 ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
1154 remap_flags);
5833112d
CH
1155 if (ret)
1156 goto out_unlock;
3fc9f5e4 1157
5ffce3cc 1158 if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
5833112d 1159 xfs_log_force_inode(dest);
3fc9f5e4 1160out_unlock:
e2aaee9c 1161 xfs_iunlock2_io_mmap(src, dest);
3fc9f5e4
DW
1162 if (ret)
1163 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1164 return remapped > 0 ? remapped : ret;
9fe26045 1165}
2fe17c10 1166
1da177e4 1167STATIC int
3562fd45 1168xfs_file_open(
1da177e4 1169 struct inode *inode,
f999a5bf 1170 struct file *file)
1da177e4 1171{
75c8c50f 1172 if (xfs_is_shutdown(XFS_M(inode->i_sb)))
f999a5bf 1173 return -EIO;
d8aeb44a
JA
1174 file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
1175 FMODE_DIO_PARALLEL_WRITE;
f3bf67c6 1176 return generic_file_open(inode, file);
f999a5bf
CH
1177}
1178
1179STATIC int
1180xfs_dir_open(
1181 struct inode *inode,
1182 struct file *file)
1183{
1184 struct xfs_inode *ip = XFS_I(inode);
a1033753 1185 unsigned int mode;
f999a5bf
CH
1186 int error;
1187
1188 error = xfs_file_open(inode, file);
1189 if (error)
1190 return error;
1191
1192 /*
1193 * If there are any blocks, read-ahead block 0 as we're almost
1194 * certain to have the next operation be a read there.
1195 */
309ecac8 1196 mode = xfs_ilock_data_map_shared(ip);
daf83964 1197 if (ip->i_df.if_nextents > 0)
06566fda 1198 error = xfs_dir3_data_readahead(ip, 0, 0);
f999a5bf 1199 xfs_iunlock(ip, mode);
7a652bbe 1200 return error;
1da177e4
LT
1201}
1202
1da177e4 1203STATIC int
3562fd45 1204xfs_file_release(
1da177e4
LT
1205 struct inode *inode,
1206 struct file *filp)
1207{
2451337d 1208 return xfs_release(XFS_I(inode));
1da177e4
LT
1209}
1210
1da177e4 1211STATIC int
3562fd45 1212xfs_file_readdir(
b8227554
AV
1213 struct file *file,
1214 struct dir_context *ctx)
1da177e4 1215{
b8227554 1216 struct inode *inode = file_inode(file);
739bfb2a 1217 xfs_inode_t *ip = XFS_I(inode);
051e7cd4
CH
1218 size_t bufsize;
1219
1220 /*
1221 * The Linux API doesn't pass down the total size of the buffer
1222 * we read into down to the filesystem. With the filldir concept
1223 * it's not needed for correct information, but the XFS dir2 leaf
1224 * code wants an estimate of the buffer size to calculate it's
1225 * readahead window and size the buffers used for mapping to
1226 * physical blocks.
1227 *
1228 * Try to give it an estimate that's good enough, maybe at some
1229 * point we can change the ->readdir prototype to include the
a9cc799e 1230 * buffer size. For now we use the current glibc buffer size.
051e7cd4 1231 */
13d2c10b 1232 bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
051e7cd4 1233
acb9553c 1234 return xfs_readdir(NULL, ip, ctx, bufsize);
3fe3e6b1
JL
1235}
1236
1237STATIC loff_t
1238xfs_file_llseek(
1239 struct file *file,
1240 loff_t offset,
59f9c004 1241 int whence)
3fe3e6b1 1242{
9b2970aa
CH
1243 struct inode *inode = file->f_mapping->host;
1244
75c8c50f 1245 if (xfs_is_shutdown(XFS_I(inode)->i_mount))
9b2970aa
CH
1246 return -EIO;
1247
59f9c004 1248 switch (whence) {
9b2970aa 1249 default:
59f9c004 1250 return generic_file_llseek(file, offset, whence);
3fe3e6b1 1251 case SEEK_HOLE:
60271ab7 1252 offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
9b2970aa 1253 break;
49c69591 1254 case SEEK_DATA:
60271ab7 1255 offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
9b2970aa 1256 break;
3fe3e6b1 1257 }
9b2970aa
CH
1258
1259 if (offset < 0)
1260 return offset;
1261 return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3fe3e6b1
JL
1262}
1263
ea6c49b7 1264#ifdef CONFIG_FS_DAX
47ba8cc7 1265static inline vm_fault_t
ea6c49b7
SR
1266xfs_dax_fault(
1267 struct vm_fault *vmf,
1268 enum page_entry_size pe_size,
1269 bool write_fault,
1270 pfn_t *pfn)
1271{
1272 return dax_iomap_fault(vmf, pe_size, pfn, NULL,
1273 (write_fault && !vmf->cow_page) ?
1274 &xfs_dax_write_iomap_ops :
1275 &xfs_read_iomap_ops);
1276}
1277#else
47ba8cc7 1278static inline vm_fault_t
ea6c49b7
SR
1279xfs_dax_fault(
1280 struct vm_fault *vmf,
1281 enum page_entry_size pe_size,
1282 bool write_fault,
1283 pfn_t *pfn)
1284{
47ba8cc7
DW
1285 ASSERT(0);
1286 return VM_FAULT_SIGBUS;
ea6c49b7
SR
1287}
1288#endif
1289
de0e8c20
DC
1290/*
1291 * Locking for serialisation of IO during page faults. This results in a lock
1292 * ordering of:
1293 *
c1e8d7c6 1294 * mmap_lock (MM)
6b698ede 1295 * sb_start_pagefault(vfs, freeze)
2433480a 1296 * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
6b698ede
DC
1297 * page_lock (MM)
1298 * i_lock (XFS - extent map serialisation)
de0e8c20 1299 */
05edd888 1300static vm_fault_t
d522d569
CH
1301__xfs_filemap_fault(
1302 struct vm_fault *vmf,
1303 enum page_entry_size pe_size,
1304 bool write_fault)
de0e8c20 1305{
11bac800 1306 struct inode *inode = file_inode(vmf->vma->vm_file);
d522d569 1307 struct xfs_inode *ip = XFS_I(inode);
05edd888 1308 vm_fault_t ret;
de0e8c20 1309
d522d569 1310 trace_xfs_filemap_fault(ip, pe_size, write_fault);
de0e8c20 1311
d522d569
CH
1312 if (write_fault) {
1313 sb_start_pagefault(inode->i_sb);
1314 file_update_time(vmf->vma->vm_file);
1315 }
de0e8c20 1316
6b698ede 1317 if (IS_DAX(inode)) {
a39e596b
CH
1318 pfn_t pfn;
1319
2433480a 1320 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ea6c49b7 1321 ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
a39e596b
CH
1322 if (ret & VM_FAULT_NEEDDSYNC)
1323 ret = dax_finish_sync_fault(vmf, pe_size, pfn);
2433480a 1324 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
6b698ede 1325 } else {
2433480a
JK
1326 if (write_fault) {
1327 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
f150b423 1328 ret = iomap_page_mkwrite(vmf,
118e021b 1329 &xfs_page_mkwrite_iomap_ops);
2433480a
JK
1330 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1331 } else {
d522d569 1332 ret = filemap_fault(vmf);
2433480a 1333 }
6b698ede 1334 }
6b698ede 1335
d522d569
CH
1336 if (write_fault)
1337 sb_end_pagefault(inode->i_sb);
6b698ede 1338 return ret;
de0e8c20
DC
1339}
1340
b17164e2
MP
1341static inline bool
1342xfs_is_write_fault(
1343 struct vm_fault *vmf)
1344{
1345 return (vmf->flags & FAULT_FLAG_WRITE) &&
1346 (vmf->vma->vm_flags & VM_SHARED);
1347}
1348
05edd888 1349static vm_fault_t
6b698ede 1350xfs_filemap_fault(
075a924d
DC
1351 struct vm_fault *vmf)
1352{
6b698ede 1353 /* DAX can shortcut the normal fault path on write faults! */
d522d569
CH
1354 return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
1355 IS_DAX(file_inode(vmf->vma->vm_file)) &&
b17164e2 1356 xfs_is_write_fault(vmf));
6b698ede
DC
1357}
1358
05edd888 1359static vm_fault_t
a2d58167 1360xfs_filemap_huge_fault(
c791ace1
DJ
1361 struct vm_fault *vmf,
1362 enum page_entry_size pe_size)
acd76e74 1363{
d522d569 1364 if (!IS_DAX(file_inode(vmf->vma->vm_file)))
acd76e74
MW
1365 return VM_FAULT_FALLBACK;
1366
d522d569
CH
1367 /* DAX can shortcut the normal fault path on write faults! */
1368 return __xfs_filemap_fault(vmf, pe_size,
b17164e2 1369 xfs_is_write_fault(vmf));
d522d569 1370}
acd76e74 1371
05edd888 1372static vm_fault_t
d522d569
CH
1373xfs_filemap_page_mkwrite(
1374 struct vm_fault *vmf)
1375{
1376 return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
acd76e74
MW
1377}
1378
3af49285 1379/*
7b565c9f
JK
1380 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
1381 * on write faults. In reality, it needs to serialise against truncate and
1382 * prepare memory for writing so handle is as standard write fault.
3af49285 1383 */
05edd888 1384static vm_fault_t
3af49285 1385xfs_filemap_pfn_mkwrite(
3af49285
DC
1386 struct vm_fault *vmf)
1387{
1388
7b565c9f 1389 return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
acd76e74
MW
1390}
1391
6b698ede
DC
1392static const struct vm_operations_struct xfs_file_vm_ops = {
1393 .fault = xfs_filemap_fault,
a2d58167 1394 .huge_fault = xfs_filemap_huge_fault,
945ea457 1395 .map_pages = filemap_map_pages,
6b698ede 1396 .page_mkwrite = xfs_filemap_page_mkwrite,
3af49285 1397 .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
6b698ede
DC
1398};
1399
1400STATIC int
1401xfs_file_mmap(
30fa529e
CH
1402 struct file *file,
1403 struct vm_area_struct *vma)
6b698ede 1404{
30fa529e
CH
1405 struct inode *inode = file_inode(file);
1406 struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
b21fec41 1407
a39e596b 1408 /*
b21fec41
PG
1409 * We don't support synchronous mappings for non-DAX files and
1410 * for DAX files if underneath dax_device is not synchronous.
a39e596b 1411 */
30fa529e 1412 if (!daxdev_mapping_supported(vma, target->bt_daxdev))
a39e596b
CH
1413 return -EOPNOTSUPP;
1414
30fa529e 1415 file_accessed(file);
6b698ede 1416 vma->vm_ops = &xfs_file_vm_ops;
30fa529e 1417 if (IS_DAX(inode))
1c71222e 1418 vm_flags_set(vma, VM_HUGEPAGE);
6b698ede 1419 return 0;
075a924d
DC
1420}
1421
4b6f5d20 1422const struct file_operations xfs_file_operations = {
3fe3e6b1 1423 .llseek = xfs_file_llseek,
b4f5d2c6 1424 .read_iter = xfs_file_read_iter,
bf97f3bc 1425 .write_iter = xfs_file_write_iter,
82c156f8 1426 .splice_read = generic_file_splice_read,
8d020765 1427 .splice_write = iter_file_splice_write,
3e08773c 1428 .iopoll = iocb_bio_iopoll,
3562fd45 1429 .unlocked_ioctl = xfs_file_ioctl,
1da177e4 1430#ifdef CONFIG_COMPAT
3562fd45 1431 .compat_ioctl = xfs_file_compat_ioctl,
1da177e4 1432#endif
3562fd45 1433 .mmap = xfs_file_mmap,
a39e596b 1434 .mmap_supported_flags = MAP_SYNC,
3562fd45
NS
1435 .open = xfs_file_open,
1436 .release = xfs_file_release,
1437 .fsync = xfs_file_fsync,
dbe6ec81 1438 .get_unmapped_area = thp_get_unmapped_area,
2fe17c10 1439 .fallocate = xfs_file_fallocate,
40144e49 1440 .fadvise = xfs_file_fadvise,
2e5dfc99 1441 .remap_file_range = xfs_file_remap_range,
1da177e4
LT
1442};
1443
4b6f5d20 1444const struct file_operations xfs_dir_file_operations = {
f999a5bf 1445 .open = xfs_dir_open,
1da177e4 1446 .read = generic_read_dir,
3b0a3c1a 1447 .iterate_shared = xfs_file_readdir,
59af1584 1448 .llseek = generic_file_llseek,
3562fd45 1449 .unlocked_ioctl = xfs_file_ioctl,
d3870398 1450#ifdef CONFIG_COMPAT
3562fd45 1451 .compat_ioctl = xfs_file_compat_ioctl,
d3870398 1452#endif
1da2f2db 1453 .fsync = xfs_dir_fsync,
1da177e4 1454};