xfs: add support for sub-pagesize writeback without buffer_heads
fs/xfs/xfs_aops.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
        struct xfs_bmbt_irec imap;
        unsigned int io_type;
        struct xfs_ioend *ioend;
};

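/*
 * Return the block device backing I/O for this inode: the realtime device
 * for realtime inodes, the data device otherwise.
 */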
struct block_device *
xfs_find_bdev_for_inode(
        struct inode *inode)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;

        if (XFS_IS_REALTIME_INODE(ip))
                return mp->m_rtdev_targp->bt_bdev;
        else
                return mp->m_ddev_targp->bt_bdev;
}

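/*
 * Return the DAX device backing this inode, choosing the realtime or data
 * target to match xfs_find_bdev_for_inode() above.
 */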
struct dax_device *
xfs_find_daxdev_for_inode(
        struct inode *inode)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;

        if (XFS_IS_REALTIME_INODE(ip))
                return mp->m_rtdev_targp->bt_daxdev;
        else
                return mp->m_ddev_targp->bt_daxdev;
}

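/*
 * Writeback completion for a single bio_vec: record any error on the page,
 * drop the iomap_page write count taken at submission time, and end page
 * writeback once the last outstanding count has been dropped.
 */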
static void
xfs_finish_page_writeback(
        struct inode *inode,
        struct bio_vec *bvec,
        int error)
{
        struct iomap_page *iop = to_iomap_page(bvec->bv_page);

        if (error) {
                SetPageError(bvec->bv_page);
                mapping_set_error(inode->i_mapping, -EIO);
        }

        ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
        ASSERT(!iop || atomic_read(&iop->write_count) > 0);

        if (!iop || atomic_dec_and_test(&iop->write_count))
                end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
        struct xfs_ioend *ioend,
        int error)
{
        struct inode *inode = ioend->io_inode;
        struct bio *bio = &ioend->io_inline_bio;
        struct bio *last = ioend->io_bio, *next;
        u64 start = bio->bi_iter.bi_sector;
        bool quiet = bio_flagged(bio, BIO_QUIET);

        for (bio = &ioend->io_inline_bio; bio; bio = next) {
                struct bio_vec *bvec;
                int i;

                /*
                 * For the last bio, bi_private points to the ioend, so we
                 * need to explicitly end the iteration here.
                 */
                if (bio == last)
                        next = NULL;
                else
                        next = bio->bi_private;

                /* walk each page on bio, ending page IO on them */
                bio_for_each_segment_all(bvec, bio, i)
                        xfs_finish_page_writeback(inode, bvec, error);
                bio_put(bio);
        }

        if (unlikely(error && !quiet)) {
                xfs_err_ratelimited(XFS_I(inode)->i_mount,
                        "writeback error on sector %llu", start);
        }
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
        return ioend->io_offset + ioend->io_size >
                XFS_I(ioend->io_inode)->i_d.di_size;
}

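/*
 * Allocate a transaction up front for the on-disk inode size update that
 * may be needed at I/O completion, and attach it to the ioend.
 */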
STATIC int
xfs_setfilesize_trans_alloc(
        struct xfs_ioend *ioend)
{
        struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
        struct xfs_trans *tp;
        int error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
                                XFS_TRANS_NOFS, &tp);
        if (error)
                return error;

        ioend->io_append_trans = tp;

        /*
         * We may pass freeze protection with a transaction.  So tell lockdep
         * we released it.
         */
        __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
        /*
         * We hand off the transaction to the completion thread now, so
         * clear the flag here.
         */
        current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
        struct xfs_inode *ip,
        struct xfs_trans *tp,
        xfs_off_t offset,
        size_t size)
{
        xfs_fsize_t isize;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp);
                return 0;
        }

        trace_xfs_setfilesize(ip, offset, size);

        ip->i_d.di_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        return xfs_trans_commit(tp);
}

int
xfs_setfilesize(
        struct xfs_inode *ip,
        xfs_off_t offset,
        size_t size)
{
        struct xfs_mount *mp = ip->i_mount;
        struct xfs_trans *tp;
        int error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        return __xfs_setfilesize(ip, tp, offset, size);
}

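/*
 * Update the on-disk inode size at ioend completion, using the transaction
 * allocated at submission time, or cancel that transaction if the I/O
 * failed.
 */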
STATIC int
xfs_setfilesize_ioend(
        struct xfs_ioend *ioend,
        int error)
{
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        struct xfs_trans *tp = ioend->io_append_trans;

        /*
         * The transaction may have been allocated in the I/O submission thread,
         * thus we need to mark ourselves as being in a transaction manually.
         * Similarly for freeze protection.
         */
        current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
        __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

        /* we abort the update if there was an IO error */
        if (error) {
                xfs_trans_cancel(tp);
                return error;
        }

        return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
        struct work_struct *work)
{
        struct xfs_ioend *ioend =
                container_of(work, struct xfs_ioend, io_work);
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        xfs_off_t offset = ioend->io_offset;
        size_t size = ioend->io_size;
        int error;

        /*
         * Just clean up the in-memory structures if the fs has been shut down.
         */
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                error = -EIO;
                goto done;
        }

        /*
         * Clean up any COW blocks on an I/O error.
         */
        error = blk_status_to_errno(ioend->io_bio->bi_status);
        if (unlikely(error)) {
                switch (ioend->io_type) {
                case XFS_IO_COW:
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                        break;
                }

                goto done;
        }

        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
        switch (ioend->io_type) {
        case XFS_IO_COW:
                error = xfs_reflink_end_cow(ip, offset, size);
                break;
        case XFS_IO_UNWRITTEN:
                /* writeback should never update isize */
                error = xfs_iomap_write_unwritten(ip, offset, size, false);
                break;
        default:
                ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
                break;
        }

done:
        if (ioend->io_append_trans)
                error = xfs_setfilesize_ioend(ioend, error);
        xfs_destroy_ioend(ioend, error);
}

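/*
 * Bio completion handler for ioends.  Unwritten extent conversion and COW
 * remapping are deferred to a workqueue, as is the on-disk size update for
 * appending writes; everything else is torn down inline.
 */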
STATIC void
xfs_end_bio(
        struct bio *bio)
{
        struct xfs_ioend *ioend = bio->bi_private;
        struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;

        if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
                queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
        else if (ioend->io_append_trans)
                queue_work(mp->m_data_workqueue, &ioend->io_work);
        else
                xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

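/*
 * Look up the extent mapping for the block at @offset and cache it in the
 * writepage context.  A still-valid cached mapping is reused where possible,
 * and delalloc (including COW) blocks are converted to real extents where
 * needed.
 */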
STATIC int
xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
        struct inode *inode,
        loff_t offset)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;
        ssize_t count = i_blocksize(inode);
        xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
        struct xfs_bmbt_irec imap;
        int whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor icur;
        bool imap_valid;
        int error = 0;

        /*
         * We have to make sure the cached mapping is within EOF to protect
         * against eofblocks trimming on file release leaving us with a stale
         * mapping. Otherwise, a page for a subsequent file extending buffered
         * write could get picked up by this writeback cycle and written to the
         * wrong blocks.
         *
         * Note that what we really want here is a generic mapping invalidation
         * mechanism to protect us from arbitrary extent modifying contexts, not
         * just eofblocks.
         */
        xfs_trim_extent_eof(&wpc->imap, ip);

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared.  COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one.
         */
        imap_valid = offset_fsb >= wpc->imap.br_startoff &&
                     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
        if (imap_valid &&
            (!xfs_is_reflink_inode(ip) || wpc->io_type == XFS_IO_COW))
                return 0;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
        ASSERT(offset <= mp->m_super->s_maxbytes);

        if (offset > mp->m_super->s_maxbytes - count)
                count = mp->m_super->s_maxbytes - offset;
        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

        /*
         * Check if this offset is covered by a COW extent, and if so use
         * it directly instead of looking up anything in the data fork.
         */
        if (xfs_is_reflink_inode(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
            imap.br_startoff <= offset_fsb) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                /*
                 * Truncate can race with writeback since writeback doesn't
                 * take the iolock and truncate decreases the file size before
                 * it starts truncating the pages between new_size and old_size.
                 * Therefore, we can end up in the situation where writeback
                 * gets a CoW fork mapping but the truncate makes the mapping
                 * invalid and we end up in here trying to get a new mapping.
                 * Bail out here so that we simply never get a valid mapping
                 * and so we drop the write altogether.  The page truncation
                 * will kill the contents anyway.
                 */
                if (offset > i_size_read(inode)) {
                        wpc->io_type = XFS_IO_HOLE;
                        return 0;
                }
                whichfork = XFS_COW_FORK;
                wpc->io_type = XFS_IO_COW;
                goto allocate_blocks;
        }

        /*
         * Map valid and no COW extent in the way?  We're done.
         */
        if (imap_valid) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        if (imap.br_startoff > offset_fsb) {
                /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                wpc->io_type = XFS_IO_HOLE;
        } else {
                if (isnullstartblock(imap.br_startblock)) {
                        /* got a delalloc extent */
                        wpc->io_type = XFS_IO_DELALLOC;
                        goto allocate_blocks;
                }

                if (imap.br_state == XFS_EXT_UNWRITTEN)
                        wpc->io_type = XFS_IO_UNWRITTEN;
                else
                        wpc->io_type = XFS_IO_OVERWRITE;
        }

        wpc->imap = imap;
        trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
        return 0;
allocate_blocks:
        error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
        if (error)
                return error;
        wpc->imap = imap;
        trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
        return 0;
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
        struct writeback_control *wbc,
        struct xfs_ioend *ioend,
        int status)
{
        /* Convert CoW extents to regular */
        if (!status && ioend->io_type == XFS_IO_COW) {
                /*
                 * Yuk. This can do memory allocation, but is not a
                 * transactional operation so everything is done in GFP_KERNEL
                 * context. That can deadlock, because we hold pages in
                 * writeback state and GFP_KERNEL allocations can block on them.
                 * Hence we must operate in nofs conditions here.
                 */
                unsigned nofs_flag;

                nofs_flag = memalloc_nofs_save();
                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                ioend->io_offset, ioend->io_size);
                memalloc_nofs_restore(nofs_flag);
        }

        /* Reserve log space if we might write beyond the on-disk inode size. */
        if (!status &&
            ioend->io_type != XFS_IO_UNWRITTEN &&
            xfs_ioend_is_append(ioend) &&
            !ioend->io_append_trans)
                status = xfs_setfilesize_trans_alloc(ioend);

        ioend->io_bio->bi_private = ioend;
        ioend->io_bio->bi_end_io = xfs_end_bio;
        ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

        /*
         * If we are failing the IO now, just mark the ioend with an
         * error and finish it. This will run IO completion immediately
         * as there is only one reference to the ioend at this point in
         * time.
         */
        if (status) {
                ioend->io_bio->bi_status = errno_to_blk_status(status);
                bio_endio(ioend->io_bio);
                return status;
        }

        ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
        submit_bio(ioend->io_bio);
        return 0;
}

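/*
 * Allocate a new ioend, embedded in a freshly allocated bio that is set up
 * for the given block device and start sector.
 */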
static struct xfs_ioend *
xfs_alloc_ioend(
        struct inode *inode,
        unsigned int type,
        xfs_off_t offset,
        struct block_device *bdev,
        sector_t sector)
{
        struct xfs_ioend *ioend;
        struct bio *bio;

        bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
        bio_set_dev(bio, bdev);
        bio->bi_iter.bi_sector = sector;

        ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
        INIT_LIST_HEAD(&ioend->io_list);
        ioend->io_type = type;
        ioend->io_inode = inode;
        ioend->io_size = 0;
        ioend->io_offset = offset;
        INIT_WORK(&ioend->io_work, xfs_end_io);
        ioend->io_append_trans = NULL;
        ioend->io_bio = bio;
        return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
        struct xfs_ioend *ioend,
        struct writeback_control *wbc,
        struct block_device *bdev,
        sector_t sector)
{
        struct bio *new;

        new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
        bio_set_dev(new, bdev);
        new->bi_iter.bi_sector = sector;
        bio_chain(ioend->io_bio, new);
        bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
        ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
        ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
        submit_bio(ioend->io_bio);
        ioend->io_bio = new;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
STATIC void
xfs_add_to_ioend(
        struct inode *inode,
        xfs_off_t offset,
        struct page *page,
        struct iomap_page *iop,
        struct xfs_writepage_ctx *wpc,
        struct writeback_control *wbc,
        struct list_head *iolist)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;
        struct block_device *bdev = xfs_find_bdev_for_inode(inode);
        unsigned len = i_blocksize(inode);
        unsigned poff = offset & (PAGE_SIZE - 1);
        sector_t sector;

        sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
                ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);

        if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
            sector != bio_end_sector(wpc->ioend->io_bio) ||
            offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
                if (wpc->ioend)
                        list_add(&wpc->ioend->io_list, iolist);
                wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
                                bdev, sector);
        }

        if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
                if (iop)
                        atomic_inc(&iop->write_count);
                if (bio_full(wpc->ioend->io_bio))
                        xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
                __bio_add_page(wpc->ioend->io_bio, page, len, poff);
        }

        wpc->ioend->io_size += len;
}

STATIC void
xfs_vm_invalidatepage(
        struct page *page,
        unsigned int offset,
        unsigned int length)
{
        trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
        iomap_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page.  Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why
 * we see an ENOSPC in writeback).
 */
STATIC void
xfs_aops_discard_page(
        struct page *page)
{
        struct inode *inode = page->mapping->host;
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;
        loff_t offset = page_offset(page);
        xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
        int error;

        if (XFS_FORCED_SHUTDOWN(mp))
                goto out_invalidate;

        xfs_alert(mp,
                "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
                        page, ip->i_ino, offset);

        error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                        PAGE_SIZE / i_blocksize(inode));
        if (error && !XFS_FORCED_SHUTDOWN(mp))
                xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
        xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding blocks to is cached on the writepage context, and if the new block
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected. While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
        struct xfs_writepage_ctx *wpc,
        struct writeback_control *wbc,
        struct inode *inode,
        struct page *page,
        uint64_t end_offset)
{
        LIST_HEAD(submit_list);
        struct iomap_page *iop = to_iomap_page(page);
        unsigned len = i_blocksize(inode);
        struct xfs_ioend *ioend, *next;
        uint64_t file_offset;   /* file offset of page */
        int error = 0, count = 0, i;

        ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
        ASSERT(!iop || atomic_read(&iop->write_count) == 0);

        /*
         * Walk through the page to find areas to write back. If we run off the
         * end of the current map or find the current map invalid, grab a new
         * one.
         */
        for (i = 0, file_offset = page_offset(page);
             i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
             i++, file_offset += len) {
                if (iop && !test_bit(i, iop->uptodate))
                        continue;

                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
                if (wpc->io_type == XFS_IO_HOLE)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
                count++;
        }

        ASSERT(wpc->ioend || list_empty(&submit_list));
        ASSERT(PageLocked(page));
        ASSERT(!PageWriteback(page));

        /*
         * On error, we have to fail the ioend here because we may have set
         * pages under writeback, we have to make sure we run IO completion to
         * mark the error state of the IO appropriately, so we can't cancel the
         * ioend directly here.  That means we have to mark this page as under
         * writeback if we included any blocks from it in the ioend chain so
         * that completion treats it correctly.
         *
         * If we didn't include the page in the ioend, then on error we can
         * simply discard and unlock it as there are no other users of the page
         * now.  The caller will still need to trigger submission of outstanding
         * ioends on the writepage context so they are treated correctly on
         * error.
         */
        if (unlikely(error)) {
                if (!count) {
                        xfs_aops_discard_page(page);
                        ClearPageUptodate(page);
                        unlock_page(page);
                        goto done;
                }

                /*
                 * If the page was not fully cleaned, we need to ensure that the
                 * higher layers come back to it correctly.  That means we need
                 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
                 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
                 * so another attempt to write this page in this writeback sweep
                 * will be made.
                 */
                set_page_writeback_keepwrite(page);
        } else {
                clear_page_dirty_for_io(page);
                set_page_writeback(page);
        }

        unlock_page(page);

        /*
         * Preserve the original error if there was one, otherwise catch
         * submission errors here and propagate into subsequent ioend
         * submissions.
         */
        list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
                int error2;

                list_del_init(&ioend->io_list);
                error2 = xfs_submit_ioend(wbc, ioend, error);
                if (error2 && !error)
                        error = error2;
        }

        /*
         * We can end up here with no error and nothing to write only if we race
         * with a partial page truncate on a sub-page block sized filesystem.
         */
        if (!count)
                end_page_writeback(page);
done:
        mapping_set_error(page->mapping, error);
        return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 */
STATIC int
xfs_do_writepage(
        struct page *page,
        struct writeback_control *wbc,
        void *data)
{
        struct xfs_writepage_ctx *wpc = data;
        struct inode *inode = page->mapping->host;
        loff_t offset;
        uint64_t end_offset;
        pgoff_t end_index;

        trace_xfs_writepage(inode, page, 0, 0);

        /*
         * Refuse to write the page out if we are called from reclaim context.
         *
         * This avoids stack overflows when called from deeply used stacks in
         * random callers for direct reclaim or memcg reclaim.  We explicitly
         * allow reclaim from kswapd as the stack usage there is relatively low.
         *
         * This should never happen except in the case of a VM regression so
         * warn about it.
         */
        if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
                        PF_MEMALLOC))
                goto redirty;

        /*
         * Given that we do not allow direct reclaim to call us, we should
         * never be called while in a filesystem transaction.
         */
        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
                goto redirty;

        /*
         * Is this page beyond the end of the file?
         *
         * If the page index is less than the end_index, adjust the end_offset
         * to the highest offset that this page should represent.
         * -----------------------------------------------------
         * |                    file mapping             | <EOF> |
         * -----------------------------------------------------
         * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
         * ^--------------------------------^----------|--------
         * |     desired writeback range    |      see else    |
         * ---------------------------------^------------------|
         */
        offset = i_size_read(inode);
        end_index = offset >> PAGE_SHIFT;
        if (page->index < end_index)
                end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
        else {
                /*
                 * Check whether the page to write out is beyond or straddles
                 * i_size or not.
                 * -------------------------------------------------------
                 * |            file mapping                   | <EOF>  |
                 * -------------------------------------------------------
                 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
                 * ^--------------------------------^-----------|---------
                 * |                                |   Straddles       |
                 * ---------------------------------^-----------|--------|
                 */
                unsigned offset_into_page = offset & (PAGE_SIZE - 1);

                /*
                 * Skip the page if it is fully outside i_size, e.g. due to a
                 * truncate operation that is in progress. We must redirty the
                 * page so that reclaim stops reclaiming it. Otherwise
                 * xfs_vm_releasepage() is called on it and gets confused.
                 *
                 * Note that the end_index is unsigned long, it would overflow
                 * if the given offset is greater than 16TB on 32-bit system
                 * and if we do check the page is fully outside i_size or not
                 * via "if (page->index >= end_index + 1)" as "end_index + 1"
                 * will be evaluated to 0.  Hence this page will be redirtied
                 * and be written out repeatedly which would result in an
                 * infinite loop, the user program that performs this operation
                 * will hang.  Instead, we can verify this situation by checking
                 * if the page to write is totally beyond the i_size or if its
                 * offset is just equal to the EOF.
                 */
                if (page->index > end_index ||
                    (page->index == end_index && offset_into_page == 0))
                        goto redirty;

                /*
                 * The page straddles i_size.  It must be zeroed out on each
                 * and every writepage invocation because it may be mmapped.
                 * "A file is mapped in multiples of the page size.  For a file
                 * that is not a multiple of the page size, the remaining
                 * memory is zeroed when mapped, and writes to that region are
                 * not written out to the file."
                 */
                zero_user_segment(page, offset_into_page, PAGE_SIZE);

                /* Adjust the end_offset to the end of file */
                end_offset = offset;
        }

        return xfs_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
        redirty_page_for_writepage(wbc, page);
        unlock_page(page);
        return 0;
}

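/*
 * ->writepage entry point: write a single page through the common writepage
 * machinery and submit any ioend left cached on the writepage context.
 */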
STATIC int
xfs_vm_writepage(
        struct page *page,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = {
                .io_type = XFS_IO_INVALID,
        };
        int ret;

        ret = xfs_do_writepage(page, wbc, &wpc);
        if (wpc.ioend)
                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
        return ret;
}

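/*
 * ->writepages entry point: let write_cache_pages() drive xfs_do_writepage()
 * over the dirty pages, then submit the final cached ioend.
 */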
STATIC int
xfs_vm_writepages(
        struct address_space *mapping,
        struct writeback_control *wbc)
{
        struct xfs_writepage_ctx wpc = {
                .io_type = XFS_IO_INVALID,
        };
        int ret;

        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
        ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
        if (wpc.ioend)
                ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
        return ret;
}

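/*
 * ->writepages for DAX mappings: hand the dirty range to the DAX code for
 * writeback against the inode's block device.
 */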
STATIC int
xfs_dax_writepages(
        struct address_space *mapping,
        struct writeback_control *wbc)
{
        xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
        return dax_writeback_mapping_range(mapping,
                        xfs_find_bdev_for_inode(mapping->host), wbc);
}

STATIC int
xfs_vm_releasepage(
        struct page *page,
        gfp_t gfp_mask)
{
        trace_xfs_releasepage(page->mapping->host, page, 0, 0);
        return iomap_releasepage(page, gfp_mask);
}

STATIC sector_t
xfs_vm_bmap(
        struct address_space *mapping,
        sector_t block)
{
        struct xfs_inode *ip = XFS_I(mapping->host);

        trace_xfs_vm_bmap(ip);

        /*
         * The swap code (ab-)uses ->bmap to get a block mapping and then
         * bypasses the file system for actual I/O.  We really can't allow
         * that on reflink inodes, so we have to skip out here.  And yes,
         * 0 is the magic code for a bmap error.
         *
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
        if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpage(
        struct file *unused,
        struct page *page)
{
        trace_xfs_vm_readpage(page->mapping->host, 1);
        return iomap_readpage(page, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpages(
        struct file *unused,
        struct address_space *mapping,
        struct list_head *pages,
        unsigned nr_pages)
{
        trace_xfs_vm_readpages(mapping->host, nr_pages);
        return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
}

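/*
 * Activate a swap file on XFS: point the swap code at the backing block
 * device and let the iomap code build the swap extent map.
 */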
static int
xfs_iomap_swapfile_activate(
        struct swap_info_struct *sis,
        struct file *swap_file,
        sector_t *span)
{
        sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
        return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
        .readpage               = xfs_vm_readpage,
        .readpages              = xfs_vm_readpages,
        .writepage              = xfs_vm_writepage,
        .writepages             = xfs_vm_writepages,
        .set_page_dirty         = iomap_set_page_dirty,
        .releasepage            = xfs_vm_releasepage,
        .invalidatepage         = xfs_vm_invalidatepage,
        .bmap                   = xfs_vm_bmap,
        .direct_IO              = noop_direct_IO,
        .migratepage            = iomap_migrate_page,
        .is_partially_uptodate  = iomap_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
        .swap_activate          = xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
        .writepages             = xfs_dax_writepages,
        .direct_IO              = noop_direct_IO,
        .set_page_dirty         = noop_set_page_dirty,
        .invalidatepage         = noop_invalidatepage,
        .swap_activate          = xfs_iomap_swapfile_activate,
};