fs/xfs/xfs_aops.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2025 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_icache.h"
#include "xfs_zone_alloc.h"
#include "xfs_rtgroup.h"

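/*
 * Writeback context for the non-zoned path.  It caches the data and COW fork
 * sequence numbers that were sampled when ->iomap was built, so that
 * xfs_imap_valid() can cheaply detect a cached mapping that has been
 * invalidated by a concurrent extent map change.
 */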
struct xfs_writepage_ctx {
        struct iomap_writepage_ctx ctx;
        unsigned int data_seq;
        unsigned int cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
        return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
        return ioend->io_offset + ioend->io_size >
                XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
        struct xfs_inode *ip,
        xfs_off_t offset,
        size_t size)
{
        struct xfs_mount *mp = ip->i_mount;
        struct xfs_trans *tp;
        xfs_fsize_t isize;
        int error;

        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
        if (error)
                return error;

        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, offset + size);
        if (!isize) {
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_trans_cancel(tp);
                return 0;
        }

        trace_xfs_setfilesize(ip, offset, size);

        ip->i_disk_size = isize;
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

        return xfs_trans_commit(tp);
}

static void
xfs_ioend_put_open_zones(
        struct iomap_ioend *ioend)
{
        struct iomap_ioend *tmp;

        /*
         * Put the open zone for all ioends merged into this one (if any).
         */
        list_for_each_entry(tmp, &ioend->io_list, io_list)
                xfs_open_zone_put(tmp->io_private);

        /*
         * The main ioend might not have an open zone if the submission failed
         * before xfs_zone_alloc_and_submit got called.
         */
        if (ioend->io_private)
                xfs_open_zone_put(ioend->io_private);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
        struct iomap_ioend *ioend)
{
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        struct xfs_mount *mp = ip->i_mount;
        bool is_zoned = xfs_is_zoned_inode(ip);
        xfs_off_t offset = ioend->io_offset;
        size_t size = ioend->io_size;
        unsigned int nofs_flag;
        int error;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks, set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /*
         * Just clean up the in-memory structures if the fs has been shut down.
         */
        if (xfs_is_shutdown(mp)) {
                error = -EIO;
                goto done;
        }

        /*
         * Clean up all COW blocks and underlying data fork delalloc blocks on
         * I/O error.  The delalloc punch is required because this ioend was
         * mapped to blocks in the COW fork and the associated pages are no
         * longer dirty.  If we don't remove delalloc blocks here, they become
         * stale and can corrupt free space accounting on unmount.
         */
        error = blk_status_to_errno(ioend->io_bio.bi_status);
        if (unlikely(error)) {
                if (ioend->io_flags & IOMAP_IOEND_SHARED) {
                        ASSERT(!is_zoned);
                        xfs_reflink_cancel_cow_range(ip, offset, size, true);
                        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
                                        offset + size, NULL);
                }
                goto done;
        }

        /*
         * Success: commit the COW or unwritten blocks if needed.
         */
        if (is_zoned)
                error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
                                ioend->io_private, NULLFSBLOCK);
        else if (ioend->io_flags & IOMAP_IOEND_SHARED)
                error = xfs_reflink_end_cow(ip, offset, size);
        else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
                error = xfs_iomap_write_unwritten(ip, offset, size, false);

        if (!error &&
            !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
            xfs_ioend_is_append(ioend))
                error = xfs_setfilesize(ip, offset, size);
done:
        if (is_zoned)
                xfs_ioend_put_open_zones(ioend);
        iomap_finish_ioends(ioend, error);
        memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion.  Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we can be processing here is largely unbounded in length and
 * we may have to perform significant amounts of work on each ioend to complete
 * it.  Hence we have to be careful about holding the CPU for too long in this
 * loop.
 */
void
xfs_end_io(
        struct work_struct *work)
{
        struct xfs_inode *ip =
                container_of(work, struct xfs_inode, i_ioend_work);
        struct iomap_ioend *ioend;
        struct list_head tmp;
        unsigned long flags;

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        list_replace_init(&ip->i_ioend_list, &tmp);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

        iomap_sort_ioends(&tmp);
        while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                        io_list))) {
                list_del_init(&ioend->io_list);
                iomap_ioend_try_merge(ioend, &tmp);
                xfs_end_ioend(ioend);
                cond_resched();
        }
}

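/*
 * Bio completion handler for writeback that needs transactional completion
 * processing.  Queue the ioend on the per-inode list and, if it is the first
 * entry, kick the completion workqueue that runs xfs_end_io().
 */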
void
xfs_end_bio(
        struct bio *bio)
{
        struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        struct xfs_mount *mp = ip->i_mount;
        unsigned long flags;

        /*
         * For zone append writes, record the block number that was actually
         * written and set the boundary flag if needed.
         */
        if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
                ioend->io_sector = bio->bi_iter.bi_sector;
                xfs_mark_rtg_boundary(ioend);
        }

        spin_lock_irqsave(&ip->i_ioend_lock, flags);
        if (list_empty(&ip->i_ioend_list))
                WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
                                &ip->i_ioend_work));
        list_add_tail(&ioend->io_list, &ip->i_ioend_list);
        spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping.  Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
        struct iomap_writepage_ctx *wpc,
        struct xfs_inode *ip,
        loff_t offset)
{
        if (offset < wpc->iomap.offset ||
            offset >= wpc->iomap.offset + wpc->iomap.length)
                return false;
        /*
         * If this is a COW mapping, it is sufficient to check that the mapping
         * covers the offset.  Be careful to check this first because the
         * caller can revalidate a COW mapping without updating the data seqno.
         */
        if (wpc->iomap.flags & IOMAP_F_SHARED)
                return true;

        /*
         * This is not a COW mapping.  Check the sequence number of the data
         * fork because concurrent changes could have invalidated the extent.
         * Check the COW fork because concurrent changes since the last time we
         * checked (and found nothing at this offset) could have added
         * overlapping blocks.
         */
        if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
                trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
                                XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
                return false;
        }
        if (xfs_inode_has_cow_data(ip) &&
            XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
                trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
                                XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
                return false;
        }
        return true;
}

static int
xfs_map_blocks(
        struct iomap_writepage_ctx *wpc,
        struct inode *inode,
        loff_t offset,
        unsigned int len)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;
        ssize_t count = i_blocksize(inode);
        xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
        xfs_fileoff_t cow_fsb;
        int whichfork;
        struct xfs_bmbt_irec imap;
        struct xfs_iext_cursor icur;
        int retries = 0;
        int error = 0;
        unsigned int *seq;

        if (xfs_is_shutdown(mp))
                return -EIO;

        XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared.  COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one, or the COW fork hasn't changed from the last time we looked
         * at it.
         *
         * It's safe to check the COW fork if_seq here without the ILOCK
         * because we've indirectly protected against concurrent updates:
         * writeback has the page locked, which prevents concurrent
         * invalidations by reflink and directio and prevents concurrent
         * buffered writes to the same page.  Changes to if_seq always happen
         * under i_lock, which protects against concurrent updates and provides
         * a memory barrier on the way out that ensures that we always see the
         * current value.
         */
        if (xfs_imap_valid(wpc, ip, offset))
                return 0;

        /*
         * If we don't have a valid map, now it's time to get a new one for
         * this offset.  This will convert delayed allocations (including COW
         * ones) into real extents.  If we return without a valid map, it means
         * we landed in a hole and we skip the block.
         */
retry:
        cow_fsb = NULLFILEOFF;
        whichfork = XFS_DATA_FORK;
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(!xfs_need_iread_extents(&ip->i_df));

        /*
         * Check if this offset is covered by a COW extent, and if so use it
         * directly instead of looking up anything in the data fork.
         */
        if (xfs_inode_has_cow_data(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                cow_fsb = imap.br_startoff;
        if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
                XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);

                whichfork = XFS_COW_FORK;
                goto allocate_blocks;
        }

        /*
         * No COW extent overlap.  Revalidate now that we may have updated
         * ->cow_seq.  If the data mapping is still valid, we're done.
         */
        if (xfs_imap_valid(wpc, ip, offset)) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for
         * this offset.  This will convert delayed allocations (including COW
         * ones) into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        /* landed in a hole or beyond EOF? */
        if (imap.br_startoff > offset_fsb) {
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                imap.br_state = XFS_EXT_NORM;
        }

        /*
         * Truncate to the next COW extent if there is one.  This is the only
         * opportunity to do this because we can skip COW fork lookups for the
         * subsequent blocks in the mapping; however, the requirement to treat
         * the COW range separately remains.
         */
        if (cow_fsb != NULLFILEOFF &&
            cow_fsb < imap.br_startoff + imap.br_blockcount)
                imap.br_blockcount = cow_fsb - imap.br_startoff;

        /* got a delalloc extent? */
        if (imap.br_startblock != HOLESTARTBLOCK &&
            isnullstartblock(imap.br_startblock))
                goto allocate_blocks;

        xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
        trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
        return 0;
allocate_blocks:
        /*
         * Convert a delalloc extent to a real one.  The current page is held
         * locked so nothing could have removed the block backing offset_fsb,
         * although it could have been moved from the COW to the data fork by
         * another thread.
         */
        if (whichfork == XFS_COW_FORK)
                seq = &XFS_WPC(wpc)->cow_seq;
        else
                seq = &XFS_WPC(wpc)->data_seq;

        error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
                        &wpc->iomap, seq);
        if (error) {
                /*
                 * If we failed to find the extent in the COW fork we might
                 * have raced with a COW to data fork conversion or truncate.
                 * Restart the lookup to catch the extent in the data fork for
                 * the former case, but prevent additional retries to avoid
                 * looping forever for the latter case.
                 */
                if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
                        goto retry;
                ASSERT(error != -EAGAIN);
                return error;
        }

        /*
         * Due to merging, the returned real extent might be larger than the
         * original delalloc one.  Trim the returned extent to the next COW
         * boundary again to force a re-lookup.
         */
        if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
                loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

                if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
                        wpc->iomap.length = cow_offset - wpc->iomap.offset;
        }

        ASSERT(wpc->iomap.offset <= offset);
        ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
        trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
        return 0;
}

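/*
 * Decide whether ioend completion can run directly from bio end I/O context
 * or must be deferred to the per-inode completion workqueue because it needs
 * a transaction or cannot run in interrupt context.
 */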
static bool
xfs_ioend_needs_wq_completion(
        struct iomap_ioend *ioend)
{
        /* Changing inode size requires a transaction. */
        if (xfs_ioend_is_append(ioend))
                return true;

        /* Extent manipulation requires a transaction. */
        if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
                return true;

        /* Page cache invalidation cannot be done in irq context. */
        if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
                return true;

        return false;
}

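/*
 * Submit the bio for an ioend.  COW fork extents are converted to the written
 * state up front so that I/O completion only has to remap them, and ioends
 * that may need a transaction at completion time are routed through
 * xfs_end_bio() so the work happens in process context.
 */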
static int
xfs_submit_ioend(
        struct iomap_writepage_ctx *wpc,
        int status)
{
        struct iomap_ioend *ioend = wpc->ioend;
        unsigned int nofs_flag;

        /*
         * We can allocate memory here while doing writeback on behalf of
         * memory reclaim.  To avoid memory allocation deadlocks, set the
         * task-wide nofs context for the following operations.
         */
        nofs_flag = memalloc_nofs_save();

        /* Convert CoW extents to regular */
        if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
                status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                ioend->io_offset, ioend->io_size);
        }

        memalloc_nofs_restore(nofs_flag);

        /* send ioends that might require a transaction to the completion wq */
        if (xfs_ioend_needs_wq_completion(ioend))
                ioend->io_bio.bi_end_io = xfs_end_bio;

        if (status)
                return status;
        submit_bio(&ioend->io_bio);
        return 0;
}

/*
 * If the folio has delalloc blocks on it, the caller is asking us to punch
 * them out.  If we don't, we can leave a stale delalloc mapping covered by a
 * clean page that needs to be dirtied again before the delalloc mapping can
 * be converted.  This stale delalloc mapping can trip up a later direct I/O
 * read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the folio.
 * Because they are delalloc, we can do this without needing a transaction.
 * Indeed, if we get ENOSPC errors, we have to be able to do this truncation
 * without a transaction as there is no space left for block reservation
 * (typically why we see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
        struct folio *folio,
        loff_t pos)
{
        struct xfs_inode *ip = XFS_I(folio->mapping->host);
        struct xfs_mount *mp = ip->i_mount;

        if (xfs_is_shutdown(mp))
                return;

        xfs_alert_ratelimited(mp,
                "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
                        folio, ip->i_ino, pos);

        /*
         * The end of the punch range is always the offset of the first
         * byte of the next folio.  Hence the end offset is only dependent on
         * the folio itself and not the start offset that is passed in.
         */
        xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
                        folio_pos(folio) + folio_size(folio), NULL);
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
        .map_blocks = xfs_map_blocks,
        .submit_ioend = xfs_submit_ioend,
        .discard_folio = xfs_discard_folio,
};

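/*
 * Writeback context for the zoned allocator path.  It keeps a reference to
 * the most recently used open zone across ioend submissions; the reference
 * is dropped in xfs_vm_writepages() once writeback is done.
 */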
struct xfs_zoned_writepage_ctx {
        struct iomap_writepage_ctx ctx;
        struct xfs_open_zone *open_zone;
};

static inline struct xfs_zoned_writepage_ctx *
XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
        return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
}

static int
xfs_zoned_map_blocks(
        struct iomap_writepage_ctx *wpc,
        struct inode *inode,
        loff_t offset,
        unsigned int len)
{
        struct xfs_inode *ip = XFS_I(inode);
        struct xfs_mount *mp = ip->i_mount;
        xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
        xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
        xfs_filblks_t count_fsb;
        struct xfs_bmbt_irec imap, del;
        struct xfs_iext_cursor icur;

        if (xfs_is_shutdown(mp))
                return -EIO;

        XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);

        /*
         * All dirty data must be covered by delalloc extents.  But truncate
         * can remove delalloc extents underneath us or reduce their size.
         * Returning a hole tells iomap to not write back any data from this
         * range, which is the right thing to do in that case.
         *
         * Otherwise just tell iomap to treat ranges previously covered by a
         * delalloc extent as mapped.  The actual block allocation will be done
         * just before submitting the bio.
         *
         * This implies we never map outside folios that are locked or marked
         * as under writeback, and thus there is no need to check the fork
         * sequence count here.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        if (imap.br_startoff > offset_fsb) {
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                imap.br_state = XFS_EXT_NORM;
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
                xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
                return 0;
        }
        end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
        count_fsb = end_fsb - offset_fsb;

        del = imap;
        xfs_trim_extent(&del, offset_fsb, count_fsb);
        xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
                        XFS_BMAPI_REMAP);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);

        wpc->iomap.type = IOMAP_MAPPED;
        wpc->iomap.flags = IOMAP_F_ANON_WRITE;
        wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
        wpc->iomap.offset = offset;
        wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);

        trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
        return 0;
}

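/*
 * On zoned devices the block allocation happens at submission time:
 * xfs_zone_alloc_and_submit() picks (or reuses) an open zone and submits the
 * bio.  Completion always needs a transaction to record the written blocks,
 * so it is unconditionally routed through xfs_end_bio().
 */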
static int
xfs_zoned_submit_ioend(
        struct iomap_writepage_ctx *wpc,
        int status)
{
        wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
        if (status)
                return status;
        xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
        return 0;
}

static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
        .map_blocks = xfs_zoned_map_blocks,
        .submit_ioend = xfs_zoned_submit_ioend,
        .discard_folio = xfs_discard_folio,
};

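/*
 * ->writepages entry point: pick the zoned or the regular writeback path
 * based on the inode and hand the actual work to iomap.
 */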
STATIC int
xfs_vm_writepages(
        struct address_space *mapping,
        struct writeback_control *wbc)
{
        struct xfs_inode *ip = XFS_I(mapping->host);

        xfs_iflags_clear(ip, XFS_ITRUNCATED);

        if (xfs_is_zoned_inode(ip)) {
                struct xfs_zoned_writepage_ctx xc = { };
                int error;

                error = iomap_writepages(mapping, wbc, &xc.ctx,
                                &xfs_zoned_writeback_ops);
                if (xc.open_zone)
                        xfs_open_zone_put(xc.open_zone);
                return error;
        } else {
                struct xfs_writepage_ctx wpc = { };

                return iomap_writepages(mapping, wbc, &wpc.ctx,
                                &xfs_writeback_ops);
        }
}

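/*
 * For DAX there is no ioend processing; writeback only has to flush the
 * dirty ranges out of the CPU caches, which dax_writeback_mapping_range()
 * takes care of.
 */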
STATIC int
xfs_dax_writepages(
        struct address_space *mapping,
        struct writeback_control *wbc)
{
        struct xfs_inode *ip = XFS_I(mapping->host);

        xfs_iflags_clear(ip, XFS_ITRUNCATED);
        return dax_writeback_mapping_range(mapping,
                        xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
        struct address_space *mapping,
        sector_t block)
{
        struct xfs_inode *ip = XFS_I(mapping->host);

        trace_xfs_vm_bmap(ip);

        /*
         * The swap code (ab-)uses ->bmap to get a block mapping and then
         * bypasses the file system for actual I/O.  We really can't allow
         * that on reflink inodes, so we have to skip out here.  And yes,
         * 0 is the magic code for a bmap error.
         *
         * Since we don't pass back blockdev info, we can't return bmap
         * information for rt files either.
         */
        if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
                return 0;
        return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_read_folio(
        struct file *unused,
        struct folio *folio)
{
        return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
        struct readahead_control *rac)
{
        iomap_readahead(rac, &xfs_read_iomap_ops);
}

static int
xfs_vm_swap_activate(
        struct swap_info_struct *sis,
        struct file *swap_file,
        sector_t *span)
{
        struct xfs_inode *ip = XFS_I(file_inode(swap_file));

        /*
         * Swap file activation can race against concurrent shared extent
         * removal in files that have been cloned.  If this happens,
         * iomap_swapfile_iter() can fail because it encountered a shared
         * extent even though an operation is in progress to remove those
         * shared extents.
         *
         * This race becomes problematic when we defer extent removal
         * operations beyond the end of a syscall (i.e. use async background
         * processing algorithms).  Users think the extents are no longer
         * shared, but iomap_swapfile_iter() still sees them as shared
         * because the refcountbt entries for the extents being removed have
         * not yet been updated.  Hence the swapon call fails unexpectedly.
         *
         * The race condition is currently most obvious from the unlink()
         * operation as extent removal is deferred until after the last
         * reference to the inode goes away.  We then process the extent
         * removal asynchronously, which triggers the "syscall completed but
         * work not done" condition mentioned above.  To close this race
         * window, we need to flush any pending inodegc operations to ensure
         * they have updated the refcountbt records before we try to map the
         * swapfile.
         */
        xfs_inodegc_flush(ip->i_mount);

        /*
         * Direct the swap code to the correct block device when this file
         * sits on the RT device.
         */
        sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;

        return iomap_swapfile_activate(sis, swap_file, span,
                        &xfs_read_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
        .read_folio = xfs_vm_read_folio,
        .readahead = xfs_vm_readahead,
        .writepages = xfs_vm_writepages,
        .dirty_folio = iomap_dirty_folio,
        .release_folio = iomap_release_folio,
        .invalidate_folio = iomap_invalidate_folio,
        .bmap = xfs_vm_bmap,
        .migrate_folio = filemap_migrate_folio,
        .is_partially_uptodate = iomap_is_partially_uptodate,
        .error_remove_folio = generic_error_remove_folio,
        .swap_activate = xfs_vm_swap_activate,
};

const struct address_space_operations xfs_dax_aops = {
        .writepages = xfs_dax_writepages,
        .dirty_folio = noop_dirty_folio,
        .swap_activate = xfs_vm_swap_activate,
};