Merge tag 'mm-hotfixes-stable-2025-07-11-16-16' of git://git.kernel.org/pub/scm/linux...
[linux-block.git] / fs / xfs / xfs_aops.c
CommitLineData
0b61f8a4 1// SPDX-License-Identifier: GPL-2.0
1da177e4 2/*
7b718769 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
058dd70c 4 * Copyright (c) 2016-2025 Christoph Hellwig.
7b718769 5 * All Rights Reserved.
1da177e4 6 */
1da177e4 7#include "xfs.h"
70a9883c 8#include "xfs_shared.h"
239880ef
DC
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
1da177e4 12#include "xfs_mount.h"
1da177e4 13#include "xfs_inode.h"
239880ef 14#include "xfs_trans.h"
1da177e4 15#include "xfs_iomap.h"
0b1b213f 16#include "xfs_trace.h"
3ed3a434 17#include "xfs_bmap.h"
68988114 18#include "xfs_bmap_util.h"
ef473667 19#include "xfs_reflink.h"
c2beff99
DW
20#include "xfs_errortag.h"
21#include "xfs_error.h"
2d873efd 22#include "xfs_icache.h"
058dd70c
CH
23#include "xfs_zone_alloc.h"
24#include "xfs_rtgroup.h"
1da177e4 25
fbcc0256 26struct xfs_writepage_ctx {
598ecfba 27 struct iomap_writepage_ctx ctx;
d9252d52 28 unsigned int data_seq;
e666aa37 29 unsigned int cow_seq;
fbcc0256
DC
30};
31
598ecfba
CH
32static inline struct xfs_writepage_ctx *
33XFS_WPC(struct iomap_writepage_ctx *ctx)
34{
35 return container_of(ctx, struct xfs_writepage_ctx, ctx);
36}
37
fc0063c4
CH
38/*
39 * Fast and loose check if this write could update the on-disk inode size.
40 */
598ecfba 41static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
fc0063c4
CH
42{
43 return ioend->io_offset + ioend->io_size >
13d2c10b 44 XFS_I(ioend->io_inode)->i_disk_size;
fc0063c4
CH
45}
46
ba87ea69 47/*
2813d682 48 * Update on-disk file size now that data has been written to disk.
ba87ea69 49 */
e7a3d7e7
BF
50int
51xfs_setfilesize(
2ba66237 52 struct xfs_inode *ip,
2ba66237
CH
53 xfs_off_t offset,
54 size_t size)
ba87ea69 55{
e7a3d7e7
BF
56 struct xfs_mount *mp = ip->i_mount;
57 struct xfs_trans *tp;
ba87ea69 58 xfs_fsize_t isize;
e7a3d7e7
BF
59 int error;
60
61 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
62 if (error)
63 return error;
ba87ea69 64
aa6bf01d 65 xfs_ilock(ip, XFS_ILOCK_EXCL);
2ba66237 66 isize = xfs_new_eof(ip, offset + size);
281627df
CH
67 if (!isize) {
68 xfs_iunlock(ip, XFS_ILOCK_EXCL);
4906e215 69 xfs_trans_cancel(tp);
281627df 70 return 0;
ba87ea69
LM
71 }
72
2ba66237 73 trace_xfs_setfilesize(ip, offset, size);
281627df 74
13d2c10b 75 ip->i_disk_size = isize;
281627df
CH
76 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
77 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
78
70393313 79 return xfs_trans_commit(tp);
77d7a0c2
DC
80}
81
058dd70c
CH
82static void
83xfs_ioend_put_open_zones(
84 struct iomap_ioend *ioend)
85{
86 struct iomap_ioend *tmp;
87
88 /*
89 * Put the open zone for all ioends merged into this one (if any).
90 */
91 list_for_each_entry(tmp, &ioend->io_list, io_list)
92 xfs_open_zone_put(tmp->io_private);
93
94 /*
95 * The main ioend might not have an open zone if the submission failed
96 * before xfs_zone_alloc_and_submit got called.
97 */
98 if (ioend->io_private)
99 xfs_open_zone_put(ioend->io_private);
100}
101
0829c360 102/*
5ec4fabb 103 * IO write completion.
f6d6d4fc
CH
104 */
105STATIC void
cb357bf3 106xfs_end_ioend(
598ecfba 107 struct iomap_ioend *ioend)
0829c360 108{
0e51a8e1 109 struct xfs_inode *ip = XFS_I(ioend->io_inode);
5ca5916b 110 struct xfs_mount *mp = ip->i_mount;
058dd70c 111 bool is_zoned = xfs_is_zoned_inode(ip);
787eb485
CH
112 xfs_off_t offset = ioend->io_offset;
113 size_t size = ioend->io_size;
73d30d48 114 unsigned int nofs_flag;
4e4cbee9 115 int error;
ba87ea69 116
73d30d48
CH
117 /*
118 * We can allocate memory here while doing writeback on behalf of
119 * memory reclaim. To avoid memory allocation deadlocks set the
120 * task-wide nofs context for the following operations.
121 */
122 nofs_flag = memalloc_nofs_save();
123
af055e37 124 /*
f9dd7ba4 125 * Just clean up the in-memory structures if the fs has been shut down.
af055e37 126 */
5ca5916b 127 if (xfs_is_shutdown(mp)) {
0e51a8e1 128 error = -EIO;
787eb485
CH
129 goto done;
130 }
04f658ee 131
43caeb18 132 /*
5ca5916b
BF
133 * Clean up all COW blocks and underlying data fork delalloc blocks on
134 * I/O error. The delalloc punch is required because this ioend was
135 * mapped to blocks in the COW fork and the associated pages are no
136 * longer dirty. If we don't remove delalloc blocks here, they become
137 * stale and can corrupt free space accounting on unmount.
43caeb18 138 */
ae5535ef 139 error = blk_status_to_errno(ioend->io_bio.bi_status);
787eb485 140 if (unlikely(error)) {
71027333 141 if (ioend->io_flags & IOMAP_IOEND_SHARED) {
058dd70c 142 ASSERT(!is_zoned);
787eb485 143 xfs_reflink_cancel_cow_range(ip, offset, size, true);
8fe3b21e 144 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
058dd70c 145 offset + size, NULL);
5ca5916b 146 }
787eb485 147 goto done;
43caeb18
DW
148 }
149
5ec4fabb 150 /*
be225fec 151 * Success: commit the COW or unwritten blocks if needed.
5ec4fabb 152 */
058dd70c
CH
153 if (is_zoned)
154 error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
155 ioend->io_private, NULLFSBLOCK);
156 else if (ioend->io_flags & IOMAP_IOEND_SHARED)
787eb485 157 error = xfs_reflink_end_cow(ip, offset, size);
71027333 158 else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
ee70daab 159 error = xfs_iomap_write_unwritten(ip, offset, size, false);
ba87ea69 160
2e238340
CH
161 if (!error &&
162 !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
163 xfs_ioend_is_append(ioend))
99fc33d1 164 error = xfs_setfilesize(ip, offset, size);
04f658ee 165done:
058dd70c
CH
166 if (is_zoned)
167 xfs_ioend_put_open_zones(ioend);
598ecfba 168 iomap_finish_ioends(ioend, error);
73d30d48 169 memalloc_nofs_restore(nofs_flag);
3994fc48
DW
170}
171
ebb7fb15
DC
172/*
173 * Finish all pending IO completions that require transactional modifications.
174 *
175 * We try to merge physical and logically contiguous ioends before completion to
176 * minimise the number of transactions we need to perform during IO completion.
177 * Both unwritten extent conversion and COW remapping need to iterate and modify
178 * one physical extent at a time, so we gain nothing by merging physically
179 * discontiguous extents here.
180 *
181 * The ioend chain length that we can be processing here is largely unbound in
182 * length and we may have to perform significant amounts of work on each ioend
183 * to complete it. Hence we have to be careful about holding the CPU for too
184 * long in this loop.
185 */
cb357bf3
DW
186void
187xfs_end_io(
188 struct work_struct *work)
189{
433dad94
CH
190 struct xfs_inode *ip =
191 container_of(work, struct xfs_inode, i_ioend_work);
598ecfba 192 struct iomap_ioend *ioend;
433dad94 193 struct list_head tmp;
cb357bf3
DW
194 unsigned long flags;
195
cb357bf3 196 spin_lock_irqsave(&ip->i_ioend_lock, flags);
433dad94 197 list_replace_init(&ip->i_ioend_list, &tmp);
cb357bf3
DW
198 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
199
598ecfba
CH
200 iomap_sort_ioends(&tmp);
201 while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
433dad94 202 io_list))) {
cb357bf3 203 list_del_init(&ioend->io_list);
6e552494 204 iomap_ioend_try_merge(ioend, &tmp);
cb357bf3 205 xfs_end_ioend(ioend);
ebb7fb15 206 cond_resched();
cb357bf3
DW
207 }
208}
209
2e238340 210void
0e51a8e1
CH
211xfs_end_bio(
212 struct bio *bio)
0829c360 213{
ae5535ef 214 struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
cb357bf3 215 struct xfs_inode *ip = XFS_I(ioend->io_inode);
058dd70c 216 struct xfs_mount *mp = ip->i_mount;
cb357bf3 217 unsigned long flags;
0829c360 218
058dd70c
CH
219 /*
220 * For Appends record the actually written block number and set the
221 * boundary flag if needed.
222 */
223 if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
224 ioend->io_sector = bio->bi_iter.bi_sector;
225 xfs_mark_rtg_boundary(ioend);
226 }
227
598ecfba
CH
228 spin_lock_irqsave(&ip->i_ioend_lock, flags);
229 if (list_empty(&ip->i_ioend_list))
058dd70c 230 WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
598ecfba
CH
231 &ip->i_ioend_work));
232 list_add_tail(&ioend->io_list, &ip->i_ioend_list);
233 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
0829c360
CH
234}
235
d9252d52
BF
236/*
237 * Fast revalidation of the cached writeback mapping. Return true if the current
238 * mapping is valid, false otherwise.
239 */
240static bool
241xfs_imap_valid(
598ecfba 242 struct iomap_writepage_ctx *wpc,
d9252d52 243 struct xfs_inode *ip,
4e087a3b 244 loff_t offset)
d9252d52 245{
4e087a3b
CH
246 if (offset < wpc->iomap.offset ||
247 offset >= wpc->iomap.offset + wpc->iomap.length)
d9252d52
BF
248 return false;
249 /*
250 * If this is a COW mapping, it is sufficient to check that the mapping
251 * covers the offset. Be careful to check this first because the caller
252 * can revalidate a COW mapping without updating the data seqno.
253 */
760fea8b 254 if (wpc->iomap.flags & IOMAP_F_SHARED)
d9252d52
BF
255 return true;
256
257 /*
258 * This is not a COW mapping. Check the sequence number of the data fork
259 * because concurrent changes could have invalidated the extent. Check
260 * the COW fork because concurrent changes since the last time we
261 * checked (and found nothing at this offset) could have added
262 * overlapping blocks.
263 */
c2beff99
DW
264 if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
265 trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
266 XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
d9252d52 267 return false;
c2beff99 268 }
d9252d52 269 if (xfs_inode_has_cow_data(ip) &&
c2beff99
DW
270 XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
271 trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
272 XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
d9252d52 273 return false;
c2beff99 274 }
d9252d52
BF
275 return true;
276}
277
598ecfba 278static int
1da177e4 279xfs_map_blocks(
598ecfba 280 struct iomap_writepage_ctx *wpc,
1da177e4 281 struct inode *inode,
19871b5c
CH
282 loff_t offset,
283 unsigned int len)
1da177e4 284{
a206c817
CH
285 struct xfs_inode *ip = XFS_I(inode);
286 struct xfs_mount *mp = ip->i_mount;
93407472 287 ssize_t count = i_blocksize(inode);
b4e29032
CH
288 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
289 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
c2f09217
DW
290 xfs_fileoff_t cow_fsb;
291 int whichfork;
5c665e5b 292 struct xfs_bmbt_irec imap;
060d4eaa 293 struct xfs_iext_cursor icur;
7588cbee 294 int retries = 0;
a206c817 295 int error = 0;
2e08371a 296 unsigned int *seq;
a206c817 297
75c8c50f 298 if (xfs_is_shutdown(mp))
d9252d52
BF
299 return -EIO;
300
c2beff99
DW
301 XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
302
889c65b3
CH
303 /*
304 * COW fork blocks can overlap data fork blocks even if the blocks
305 * aren't shared. COW I/O always takes precedent, so we must always
306 * check for overlap on reflink inodes unless the mapping is already a
e666aa37
CH
307 * COW one, or the COW fork hasn't changed from the last time we looked
308 * at it.
309 *
310 * It's safe to check the COW fork if_seq here without the ILOCK because
311 * we've indirectly protected against concurrent updates: writeback has
312 * the page locked, which prevents concurrent invalidations by reflink
313 * and directio and prevents concurrent buffered writes to the same
314 * page. Changes to if_seq always happen under i_lock, which protects
315 * against concurrent updates and provides a memory barrier on the way
316 * out that ensures that we always see the current value.
889c65b3 317 */
4e087a3b 318 if (xfs_imap_valid(wpc, ip, offset))
889c65b3
CH
319 return 0;
320
889c65b3
CH
321 /*
322 * If we don't have a valid map, now it's time to get a new one for this
323 * offset. This will convert delayed allocations (including COW ones)
324 * into real extents. If we return without a valid map, it means we
325 * landed in a hole and we skip the block.
326 */
7588cbee 327retry:
c2f09217
DW
328 cow_fsb = NULLFILEOFF;
329 whichfork = XFS_DATA_FORK;
988ef927 330 xfs_ilock(ip, XFS_ILOCK_SHARED);
b2197a36 331 ASSERT(!xfs_need_iread_extents(&ip->i_df));
060d4eaa
CH
332
333 /*
334 * Check if this is offset is covered by a COW extents, and if yes use
335 * it directly instead of looking up anything in the data fork.
336 */
51d62690 337 if (xfs_inode_has_cow_data(ip) &&
e666aa37
CH
338 xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
339 cow_fsb = imap.br_startoff;
340 if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
598ecfba 341 XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
5c665e5b 342 xfs_iunlock(ip, XFS_ILOCK_SHARED);
be225fec 343
760fea8b 344 whichfork = XFS_COW_FORK;
5c665e5b
CH
345 goto allocate_blocks;
346 }
347
348 /*
d9252d52
BF
349 * No COW extent overlap. Revalidate now that we may have updated
350 * ->cow_seq. If the data mapping is still valid, we're done.
5c665e5b 351 */
4e087a3b 352 if (xfs_imap_valid(wpc, ip, offset)) {
5c665e5b
CH
353 xfs_iunlock(ip, XFS_ILOCK_SHARED);
354 return 0;
355 }
356
357 /*
358 * If we don't have a valid map, now it's time to get a new one for this
359 * offset. This will convert delayed allocations (including COW ones)
360 * into real extents.
361 */
3345746e
CH
362 if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
363 imap.br_startoff = end_fsb; /* fake a hole past EOF */
598ecfba 364 XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
8ff2957d 365 xfs_iunlock(ip, XFS_ILOCK_SHARED);
a206c817 366
12df89f2 367 /* landed in a hole or beyond EOF? */
3345746e 368 if (imap.br_startoff > offset_fsb) {
3345746e 369 imap.br_blockcount = imap.br_startoff - offset_fsb;
5c665e5b 370 imap.br_startoff = offset_fsb;
5c665e5b 371 imap.br_startblock = HOLESTARTBLOCK;
be225fec 372 imap.br_state = XFS_EXT_NORM;
8ff2957d 373 }
e2f6ad46 374
12df89f2
CH
375 /*
376 * Truncate to the next COW extent if there is one. This is the only
377 * opportunity to do this because we can skip COW fork lookups for the
378 * subsequent blocks in the mapping; however, the requirement to treat
379 * the COW range separately remains.
380 */
381 if (cow_fsb != NULLFILEOFF &&
382 cow_fsb < imap.br_startoff + imap.br_blockcount)
383 imap.br_blockcount = cow_fsb - imap.br_startoff;
384
385 /* got a delalloc extent? */
386 if (imap.br_startblock != HOLESTARTBLOCK &&
387 isnullstartblock(imap.br_startblock))
388 goto allocate_blocks;
389
304a68b9 390 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
760fea8b 391 trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
5c665e5b
CH
392 return 0;
393allocate_blocks:
2e08371a
ZY
394 /*
395 * Convert a dellalloc extent to a real one. The current page is held
396 * locked so nothing could have removed the block backing offset_fsb,
397 * although it could have moved from the COW to the data fork by another
398 * thread.
399 */
400 if (whichfork == XFS_COW_FORK)
401 seq = &XFS_WPC(wpc)->cow_seq;
402 else
403 seq = &XFS_WPC(wpc)->data_seq;
404
405 error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
406 &wpc->iomap, seq);
7588cbee
CH
407 if (error) {
408 /*
409 * If we failed to find the extent in the COW fork we might have
410 * raced with a COW to data fork conversion or truncate.
411 * Restart the lookup to catch the extent in the data fork for
412 * the former case, but prevent additional retries to avoid
413 * looping forever for the latter case.
414 */
760fea8b 415 if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
7588cbee
CH
416 goto retry;
417 ASSERT(error != -EAGAIN);
5c665e5b 418 return error;
7588cbee 419 }
4ad765ed
CH
420
421 /*
422 * Due to merging the return real extent might be larger than the
423 * original delalloc one. Trim the return extent to the next COW
424 * boundary again to force a re-lookup.
425 */
760fea8b 426 if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
4e087a3b
CH
427 loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
428
429 if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
430 wpc->iomap.length = cow_offset - wpc->iomap.offset;
431 }
4ad765ed 432
4e087a3b
CH
433 ASSERT(wpc->iomap.offset <= offset);
434 ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
760fea8b 435 trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
8ff2957d 436 return 0;
1da177e4
LT
437}
438
34ecde3c
JA
439static bool
440xfs_ioend_needs_wq_completion(
441 struct iomap_ioend *ioend)
442{
443 /* Changing inode size requires a transaction. */
444 if (xfs_ioend_is_append(ioend))
445 return true;
446
447 /* Extent manipulation requires a transaction. */
448 if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
449 return true;
450
451 /* Page cache invalidation cannot be done in irq context. */
452 if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
453 return true;
454
455 return false;
456}
457
598ecfba 458static int
c5010593
CH
459xfs_submit_ioend(
460 struct iomap_writepage_ctx *wpc,
e10de372 461 int status)
f6d6d4fc 462{
c5010593 463 struct iomap_ioend *ioend = wpc->ioend;
73d30d48
CH
464 unsigned int nofs_flag;
465
466 /*
467 * We can allocate memory here while doing writeback on behalf of
468 * memory reclaim. To avoid memory allocation deadlocks set the
469 * task-wide nofs context for the following operations.
470 */
471 nofs_flag = memalloc_nofs_save();
472
5eda4300 473 /* Convert CoW extents to regular */
71027333 474 if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
5eda4300
DW
475 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
476 ioend->io_offset, ioend->io_size);
477 }
478
73d30d48
CH
479 memalloc_nofs_restore(nofs_flag);
480
7adb8f14 481 /* send ioends that might require a transaction to the completion wq */
34ecde3c 482 if (xfs_ioend_needs_wq_completion(ioend))
ae5535ef 483 ioend->io_bio.bi_end_io = xfs_end_bio;
c5010593
CH
484
485 if (status)
486 return status;
487 submit_bio(&ioend->io_bio);
488 return 0;
f6d6d4fc
CH
489}
490
3ed3a434 491/*
8ac5b996
DC
492 * If the folio has delalloc blocks on it, the caller is asking us to punch them
493 * out. If we don't, we can leave a stale delalloc mapping covered by a clean
494 * page that needs to be dirtied again before the delalloc mapping can be
495 * converted. This stale delalloc mapping can trip up a later direct I/O read
496 * operation on the same region.
3ed3a434 497 *
8ac5b996 498 * We prevent this by truncating away the delalloc regions on the folio. Because
82cb1417
CH
499 * they are delalloc, we can do this without needing a transaction. Indeed - if
500 * we get ENOSPC errors, we have to be able to do this truncation without a
8ac5b996
DC
501 * transaction as there is no space left for block reservation (typically why
502 * we see a ENOSPC in writeback).
3ed3a434 503 */
598ecfba 504static void
6e478521
MWO
505xfs_discard_folio(
506 struct folio *folio,
507 loff_t pos)
3ed3a434 508{
7348b322 509 struct xfs_inode *ip = XFS_I(folio->mapping->host);
03625721 510 struct xfs_mount *mp = ip->i_mount;
3ed3a434 511
75c8c50f 512 if (xfs_is_shutdown(mp))
e9c3a8e8 513 return;
e8c3753c 514
4ab45e25 515 xfs_alert_ratelimited(mp,
6e478521
MWO
516 "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
517 folio, ip->i_ino, pos);
3ed3a434 518
8ac5b996 519 /*
c1950a11 520 * The end of the punch range is always the offset of the first
8ac5b996
DC
521 * byte of the next folio. Hence the end offset is only dependent on the
522 * folio itself and not the start offset that is passed in.
523 */
8fe3b21e 524 xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
058dd70c 525 folio_pos(folio) + folio_size(folio), NULL);
3ed3a434
DC
526}
527
598ecfba
CH
528static const struct iomap_writeback_ops xfs_writeback_ops = {
529 .map_blocks = xfs_map_blocks,
c5010593 530 .submit_ioend = xfs_submit_ioend,
6e478521 531 .discard_folio = xfs_discard_folio,
598ecfba 532};
f51623b2 533
058dd70c
CH
534struct xfs_zoned_writepage_ctx {
535 struct iomap_writepage_ctx ctx;
536 struct xfs_open_zone *open_zone;
537};
538
539static inline struct xfs_zoned_writepage_ctx *
540XFS_ZWPC(struct iomap_writepage_ctx *ctx)
541{
542 return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
543}
544
545static int
546xfs_zoned_map_blocks(
547 struct iomap_writepage_ctx *wpc,
548 struct inode *inode,
549 loff_t offset,
550 unsigned int len)
551{
552 struct xfs_inode *ip = XFS_I(inode);
553 struct xfs_mount *mp = ip->i_mount;
554 xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
555 xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
556 xfs_filblks_t count_fsb;
557 struct xfs_bmbt_irec imap, del;
558 struct xfs_iext_cursor icur;
559
560 if (xfs_is_shutdown(mp))
561 return -EIO;
562
563 XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
564
565 /*
566 * All dirty data must be covered by delalloc extents. But truncate can
567 * remove delalloc extents underneath us or reduce their size.
568 * Returning a hole tells iomap to not write back any data from this
569 * range, which is the right thing to do in that case.
570 *
571 * Otherwise just tell iomap to treat ranges previously covered by a
572 * delalloc extent as mapped. The actual block allocation will be done
573 * just before submitting the bio.
574 *
575 * This implies we never map outside folios that are locked or marked
576 * as under writeback, and thus there is no need check the fork sequence
577 * count here.
578 */
579 xfs_ilock(ip, XFS_ILOCK_EXCL);
580 if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
581 imap.br_startoff = end_fsb; /* fake a hole past EOF */
582 if (imap.br_startoff > offset_fsb) {
583 imap.br_blockcount = imap.br_startoff - offset_fsb;
584 imap.br_startoff = offset_fsb;
585 imap.br_startblock = HOLESTARTBLOCK;
586 imap.br_state = XFS_EXT_NORM;
587 xfs_iunlock(ip, XFS_ILOCK_EXCL);
588 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
589 return 0;
590 }
591 end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
592 count_fsb = end_fsb - offset_fsb;
593
594 del = imap;
595 xfs_trim_extent(&del, offset_fsb, count_fsb);
596 xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
597 XFS_BMAPI_REMAP);
598 xfs_iunlock(ip, XFS_ILOCK_EXCL);
599
600 wpc->iomap.type = IOMAP_MAPPED;
601 wpc->iomap.flags = IOMAP_F_DIRTY;
602 wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
603 wpc->iomap.offset = offset;
604 wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
605 wpc->iomap.flags = IOMAP_F_ANON_WRITE;
606
607 trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
608 return 0;
609}
610
611static int
612xfs_zoned_submit_ioend(
613 struct iomap_writepage_ctx *wpc,
614 int status)
615{
616 wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
617 if (status)
618 return status;
619 xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
620 return 0;
621}
622
623static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
624 .map_blocks = xfs_zoned_map_blocks,
625 .submit_ioend = xfs_zoned_submit_ioend,
626 .discard_folio = xfs_discard_folio,
627};
628
7d4fb40a
NS
629STATIC int
630xfs_vm_writepages(
631 struct address_space *mapping,
632 struct writeback_control *wbc)
633{
058dd70c
CH
634 struct xfs_inode *ip = XFS_I(mapping->host);
635
636 xfs_iflags_clear(ip, XFS_ITRUNCATED);
fbcc0256 637
058dd70c
CH
638 if (xfs_is_zoned_inode(ip)) {
639 struct xfs_zoned_writepage_ctx xc = { };
640 int error;
641
642 error = iomap_writepages(mapping, wbc, &xc.ctx,
643 &xfs_zoned_writeback_ops);
644 if (xc.open_zone)
645 xfs_open_zone_put(xc.open_zone);
646 return error;
647 } else {
648 struct xfs_writepage_ctx wpc = { };
649
650 return iomap_writepages(mapping, wbc, &wpc.ctx,
651 &xfs_writeback_ops);
652 }
7d4fb40a
NS
653}
654
6e2608df
DW
655STATIC int
656xfs_dax_writepages(
657 struct address_space *mapping,
658 struct writeback_control *wbc)
659{
30fa529e
CH
660 struct xfs_inode *ip = XFS_I(mapping->host);
661
662 xfs_iflags_clear(ip, XFS_ITRUNCATED);
6e2608df 663 return dax_writeback_mapping_range(mapping,
3f666c56 664 xfs_inode_buftarg(ip)->bt_daxdev, wbc);
6e2608df
DW
665}
666
1da177e4 667STATIC sector_t
e4c573bb 668xfs_vm_bmap(
1da177e4
LT
669 struct address_space *mapping,
670 sector_t block)
671{
b84e7722 672 struct xfs_inode *ip = XFS_I(mapping->host);
1da177e4 673
b84e7722 674 trace_xfs_vm_bmap(ip);
db1327b1
DW
675
676 /*
677 * The swap code (ab-)uses ->bmap to get a block mapping and then
793057e1 678 * bypasses the file system for actual I/O. We really can't allow
db1327b1 679 * that on reflinks inodes, so we have to skip out here. And yes,
eb5e248d
DW
680 * 0 is the magic code for a bmap error.
681 *
682 * Since we don't pass back blockdev info, we can't return bmap
683 * information for rt files either.
db1327b1 684 */
66ae56a5 685 if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
db1327b1 686 return 0;
690c2a38 687 return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
1da177e4
LT
688}
689
690STATIC int
7479c505 691xfs_vm_read_folio(
1da177e4 692 struct file *unused,
7479c505 693 struct folio *folio)
1da177e4 694{
7479c505 695 return iomap_read_folio(folio, &xfs_read_iomap_ops);
1da177e4
LT
696}
697
9d24a13a
MWO
698STATIC void
699xfs_vm_readahead(
700 struct readahead_control *rac)
1da177e4 701{
9d24a13a 702 iomap_readahead(rac, &xfs_read_iomap_ops);
22e757a4
DC
703}
704
67482129 705static int
3cd6a805 706xfs_vm_swap_activate(
67482129
DW
707 struct swap_info_struct *sis,
708 struct file *swap_file,
709 sector_t *span)
710{
2d873efd
CH
711 struct xfs_inode *ip = XFS_I(file_inode(swap_file));
712
713 /*
714 * Swap file activation can race against concurrent shared extent
715 * removal in files that have been cloned. If this happens,
716 * iomap_swapfile_iter() can fail because it encountered a shared
717 * extent even though an operation is in progress to remove those
718 * shared extents.
719 *
720 * This race becomes problematic when we defer extent removal
721 * operations beyond the end of a syscall (i.e. use async background
722 * processing algorithms). Users think the extents are no longer
723 * shared, but iomap_swapfile_iter() still sees them as shared
724 * because the refcountbt entries for the extents being removed have
725 * not yet been updated. Hence the swapon call fails unexpectedly.
726 *
727 * The race condition is currently most obvious from the unlink()
728 * operation as extent removal is deferred until after the last
729 * reference to the inode goes away. We then process the extent
730 * removal asynchronously, hence triggers the "syscall completed but
731 * work not done" condition mentioned above. To close this race
732 * window, we need to flush any pending inodegc operations to ensure
733 * they have updated the refcountbt records before we try to map the
734 * swapfile.
735 */
736 xfs_inodegc_flush(ip->i_mount);
737
738 /*
739 * Direct the swap code to the correct block device when this file
740 * sits on the RT device.
741 */
742 sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
743
690c2a38
CH
744 return iomap_swapfile_activate(sis, swap_file, span,
745 &xfs_read_iomap_ops);
67482129
DW
746}
747
f5e54d6e 748const struct address_space_operations xfs_address_space_operations = {
7479c505 749 .read_folio = xfs_vm_read_folio,
9d24a13a 750 .readahead = xfs_vm_readahead,
7d4fb40a 751 .writepages = xfs_vm_writepages,
4ce02c67 752 .dirty_folio = iomap_dirty_folio,
8597447d 753 .release_folio = iomap_release_folio,
d82354f6 754 .invalidate_folio = iomap_invalidate_folio,
e4c573bb 755 .bmap = xfs_vm_bmap,
2ec810d5 756 .migrate_folio = filemap_migrate_folio,
82cb1417 757 .is_partially_uptodate = iomap_is_partially_uptodate,
af7628d6 758 .error_remove_folio = generic_error_remove_folio,
3cd6a805 759 .swap_activate = xfs_vm_swap_activate,
1da177e4 760};
6e2608df
DW
761
762const struct address_space_operations xfs_dax_aops = {
763 .writepages = xfs_dax_writepages,
46de8b97 764 .dirty_folio = noop_dirty_folio,
3cd6a805 765 .swap_activate = xfs_vm_swap_activate,
6e2608df 766};