fs/xfs/xfs_aops.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * Copyright (c) 2016-2025 Christoph Hellwig.
   5  * All Rights Reserved.
   6  */
   7 #include "xfs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_inode.h"
  14 #include "xfs_trans.h"
  15 #include "xfs_iomap.h"
  16 #include "xfs_trace.h"
  17 #include "xfs_bmap.h"
  18 #include "xfs_bmap_util.h"
  19 #include "xfs_reflink.h"
  20 #include "xfs_errortag.h"
  21 #include "xfs_error.h"
  22 #include "xfs_icache.h"
  23 #include "xfs_zone_alloc.h"
  24 #include "xfs_rtgroup.h"
  25
  26 struct xfs_writepage_ctx {
  27         struct iomap_writepage_ctx ctx;
  28         unsigned int            data_seq;
  29         unsigned int            cow_seq;
  30 };
  31
  32 static inline struct xfs_writepage_ctx *
  33 XFS_WPC(struct iomap_writepage_ctx *ctx)
  34 {
  35         return container_of(ctx, struct xfs_writepage_ctx, ctx);
  36 }
  37
  38 /*
  39  * Fast and loose check if this write could update the on-disk inode size.
  40  */
  41 static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
  42 {
  43         return ioend->io_offset + ioend->io_size >
  44                 XFS_I(ioend->io_inode)->i_disk_size;
  45 }
  46
  47 /*
  48  * Update on-disk file size now that data has been written to disk.
  49  */
  50 int
  51 xfs_setfilesize(
  52         struct xfs_inode        *ip,
  53         xfs_off_t               offset,
  54         size_t                  size)
  55 {
  56         struct xfs_mount        *mp = ip->i_mount;
  57         struct xfs_trans        *tp;
  58         xfs_fsize_t             isize;
  59         int                     error;
  60
  61         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
  62         if (error)
  63                 return error;
  64
  65         xfs_ilock(ip, XFS_ILOCK_EXCL);
  66         isize = xfs_new_eof(ip, offset + size);
  67         if (!isize) {
  68                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
  69                 xfs_trans_cancel(tp);
  70                 return 0;
  71         }
  72
  73         trace_xfs_setfilesize(ip, offset, size);
  74
  75         ip->i_disk_size = isize;
  76         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  77         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  78
  79         return xfs_trans_commit(tp);
  80 }
  81
  82 static void
  83 xfs_ioend_put_open_zones(
  84         struct iomap_ioend      *ioend)
  85 {
  86         struct iomap_ioend *tmp;
  87
  88         /*
  89          * Put the open zone for all ioends merged into this one (if any).
  90          */
  91         list_for_each_entry(tmp, &ioend->io_list, io_list)
  92                 xfs_open_zone_put(tmp->io_private);
  93
  94         /*
  95          * The main ioend might not have an open zone if the submission failed
  96          * before xfs_zone_alloc_and_submit got called.
  97          */
  98         if (ioend->io_private)
  99                 xfs_open_zone_put(ioend->io_private);
 100 }
 101
 102 /*
 103  * IO write completion.
 104  */
 105 STATIC void
 106 xfs_end_ioend(
 107         struct iomap_ioend      *ioend)
 108 {
 109         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 110         struct xfs_mount        *mp = ip->i_mount;
 111         bool                    is_zoned = xfs_is_zoned_inode(ip);
 112         xfs_off_t               offset = ioend->io_offset;
 113         size_t                  size = ioend->io_size;
 114         unsigned int            nofs_flag;
 115         int                     error;
 116
 117         /*
 118          * We can allocate memory here while doing writeback on behalf of
 119          * memory reclaim.  To avoid memory allocation deadlocks set the
 120          * task-wide nofs context for the following operations.
 121          */
 122         nofs_flag = memalloc_nofs_save();
 123
 124         /*
 125          * Just clean up the in-memory structures if the fs has been shut down.
 126          */
 127         if (xfs_is_shutdown(mp)) {
 128                 error = -EIO;
 129                 goto done;
 130         }
 131
 132         /*
 133          * Clean up all COW blocks and underlying data fork delalloc blocks on
 134          * I/O error. The delalloc punch is required because this ioend was
 135          * mapped to blocks in the COW fork and the associated pages are no
 136          * longer dirty. If we don't remove delalloc blocks here, they become
 137          * stale and can corrupt free space accounting on unmount.
 138          */
 139         error = blk_status_to_errno(ioend->io_bio.bi_status);
 140         if (unlikely(error)) {
 141                 if (ioend->io_flags & IOMAP_IOEND_SHARED) {
 142                         ASSERT(!is_zoned);
 143                         xfs_reflink_cancel_cow_range(ip, offset, size, true);
 144                         xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
 145                                         offset + size, NULL);
 146                 }
 147                 goto done;
 148         }
 149
 150         /*
 151          * Success: commit the COW or unwritten blocks if needed.
 152          */
 153         if (is_zoned)
 154                 error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
 155                                 ioend->io_private, NULLFSBLOCK);
 156         else if (ioend->io_flags & IOMAP_IOEND_SHARED)
 157                 error = xfs_reflink_end_cow(ip, offset, size);
 158         else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
 159                 error = xfs_iomap_write_unwritten(ip, offset, size, false);
 160
 161         if (!error &&
 162             !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
 163             xfs_ioend_is_append(ioend))
 164                 error = xfs_setfilesize(ip, offset, size);
 165 done:
 166         if (is_zoned)
 167                 xfs_ioend_put_open_zones(ioend);
 168         iomap_finish_ioends(ioend, error);
 169         memalloc_nofs_restore(nofs_flag);
 170 }
 171
 172 /*
 173  * Finish all pending IO completions that require transactional modifications.
 174  *
 175  * We try to merge physical and logically contiguous ioends before completion to
 176  * minimise the number of transactions we need to perform during IO completion.
 177  * Both unwritten extent conversion and COW remapping need to iterate and modify
 178  * one physical extent at a time, so we gain nothing by merging physically
 179  * discontiguous extents here.
 180  *
 181  * The ioend chain length that we can be processing here is largely unbound in
 182  * length and we may have to perform significant amounts of work on each ioend
 183  * to complete it. Hence we have to be careful about holding the CPU for too
 184  * long in this loop.
 185  */
 186 void
 187 xfs_end_io(
 188         struct work_struct      *work)
 189 {
 190         struct xfs_inode        *ip =
 191                 container_of(work, struct xfs_inode, i_ioend_work);
 192         struct iomap_ioend      *ioend;
 193         struct list_head        tmp;
 194         unsigned long           flags;
 195
 196         spin_lock_irqsave(&ip->i_ioend_lock, flags);
 197         list_replace_init(&ip->i_ioend_list, &tmp);
 198         spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
 199
 200         iomap_sort_ioends(&tmp);
 201         while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
 202                         io_list))) {
 203                 list_del_init(&ioend->io_list);
 204                 iomap_ioend_try_merge(ioend, &tmp);
 205                 xfs_end_ioend(ioend);
 206                 cond_resched();
 207         }
 208 }
 209
 210 void
 211 xfs_end_bio(
 212         struct bio              *bio)
 213 {
 214         struct iomap_ioend      *ioend = iomap_ioend_from_bio(bio);
 215         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 216         struct xfs_mount        *mp = ip->i_mount;
 217         unsigned long           flags;
 218
 219         /*
 220          * For Appends record the actually written block number and set the
 221          * boundary flag if needed.
 222          */
 223         if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
 224                 ioend->io_sector = bio->bi_iter.bi_sector;
 225                 xfs_mark_rtg_boundary(ioend);
 226         }
 227
 228         spin_lock_irqsave(&ip->i_ioend_lock, flags);
 229         if (list_empty(&ip->i_ioend_list))
 230                 WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
 231                                          &ip->i_ioend_work));
 232         list_add_tail(&ioend->io_list, &ip->i_ioend_list);
 233         spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
 234 }
 235
 236 /*
 237  * Fast revalidation of the cached writeback mapping. Return true if the current
 238  * mapping is valid, false otherwise.
 239  */
 240 static bool
 241 xfs_imap_valid(
 242         struct iomap_writepage_ctx      *wpc,
 243         struct xfs_inode                *ip,
 244         loff_t                          offset)
 245 {
 246         if (offset < wpc->iomap.offset ||
 247             offset >= wpc->iomap.offset + wpc->iomap.length)
 248                 return false;
 249         /*
 250          * If this is a COW mapping, it is sufficient to check that the mapping
 251          * covers the offset. Be careful to check this first because the caller
 252          * can revalidate a COW mapping without updating the data seqno.
 253          */
 254         if (wpc->iomap.flags & IOMAP_F_SHARED)
 255                 return true;
 256
 257         /*
 258          * This is not a COW mapping. Check the sequence number of the data fork
 259          * because concurrent changes could have invalidated the extent. Check
 260          * the COW fork because concurrent changes since the last time we
 261          * checked (and found nothing at this offset) could have added
 262          * overlapping blocks.
 263          */
 264         if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
 265                 trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
 266                                 XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
 267                 return false;
 268         }
 269         if (xfs_inode_has_cow_data(ip) &&
 270             XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
 271                 trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
 272                                 XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
 273                 return false;
 274         }
 275         return true;
 276 }
 277
 278 static int
 279 xfs_map_blocks(
 280         struct iomap_writepage_ctx *wpc,
 281         struct inode            *inode,
 282         loff_t                  offset,
 283         unsigned int            len)
 284 {
 285         struct xfs_inode        *ip = XFS_I(inode);
 286         struct xfs_mount        *mp = ip->i_mount;
 287         ssize_t                 count = i_blocksize(inode);
 288         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
 289         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
 290         xfs_fileoff_t           cow_fsb;
 291         int                     whichfork;
 292         struct xfs_bmbt_irec    imap;
 293         struct xfs_iext_cursor  icur;
 294         int                     retries = 0;
 295         int                     error = 0;
 296         unsigned int            *seq;
 297
 298         if (xfs_is_shutdown(mp))
 299                 return -EIO;
 300
 301         XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
 302
 303         /*
 304          * COW fork blocks can overlap data fork blocks even if the blocks
 305          * aren't shared.  COW I/O always takes precedent, so we must always
 306          * check for overlap on reflink inodes unless the mapping is already a
 307          * COW one, or the COW fork hasn't changed from the last time we looked
 308          * at it.
 309          *
 310          * It's safe to check the COW fork if_seq here without the ILOCK because
 311          * we've indirectly protected against concurrent updates: writeback has
 312          * the page locked, which prevents concurrent invalidations by reflink
 313          * and directio and prevents concurrent buffered writes to the same
 314          * page.  Changes to if_seq always happen under i_lock, which protects
 315          * against concurrent updates and provides a memory barrier on the way
 316          * out that ensures that we always see the current value.
 317          */
 318         if (xfs_imap_valid(wpc, ip, offset))
 319                 return 0;
 320
 321         /*
 322          * If we don't have a valid map, now it's time to get a new one for this
 323          * offset.  This will convert delayed allocations (including COW ones)
 324          * into real extents.  If we return without a valid map, it means we
 325          * landed in a hole and we skip the block.
 326          */
 327 retry:
 328         cow_fsb = NULLFILEOFF;
 329         whichfork = XFS_DATA_FORK;
 330         xfs_ilock(ip, XFS_ILOCK_SHARED);
 331         ASSERT(!xfs_need_iread_extents(&ip->i_df));
 332
 333         /*
 334          * Check if this is offset is covered by a COW extents, and if yes use
 335          * it directly instead of looking up anything in the data fork.
 336          */
 337         if (xfs_inode_has_cow_data(ip) &&
 338             xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
 339                 cow_fsb = imap.br_startoff;
 340         if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
 341                 XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
 342                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 343
 344                 whichfork = XFS_COW_FORK;
 345                 goto allocate_blocks;
 346         }
 347
 348         /*
 349          * No COW extent overlap. Revalidate now that we may have updated
 350          * ->cow_seq. If the data mapping is still valid, we're done.
 351          */
 352         if (xfs_imap_valid(wpc, ip, offset)) {
 353                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
 354                 return 0;
 355         }
 356
 357         /*
 358          * If we don't have a valid map, now it's time to get a new one for this
 359          * offset.  This will convert delayed allocations (including COW ones)
 360          * into real extents.
 361          */
 362         if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
 363                 imap.br_startoff = end_fsb;     /* fake a hole past EOF */
 364         XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
 365         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 366
 367         /* landed in a hole or beyond EOF? */
 368         if (imap.br_startoff > offset_fsb) {
 369                 imap.br_blockcount = imap.br_startoff - offset_fsb;
 370                 imap.br_startoff = offset_fsb;
 371                 imap.br_startblock = HOLESTARTBLOCK;
 372                 imap.br_state = XFS_EXT_NORM;
 373         }
 374
 375         /*
 376          * Truncate to the next COW extent if there is one.  This is the only
 377          * opportunity to do this because we can skip COW fork lookups for the
 378          * subsequent blocks in the mapping; however, the requirement to treat
 379          * the COW range separately remains.
 380          */
 381         if (cow_fsb != NULLFILEOFF &&
 382             cow_fsb < imap.br_startoff + imap.br_blockcount)
 383                 imap.br_blockcount = cow_fsb - imap.br_startoff;
 384
 385         /* got a delalloc extent? */
 386         if (imap.br_startblock != HOLESTARTBLOCK &&
 387             isnullstartblock(imap.br_startblock))
 388                 goto allocate_blocks;
 389
 390         xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
 391         trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 392         return 0;
 393 allocate_blocks:
 394         /*
 395          * Convert a dellalloc extent to a real one. The current page is held
 396          * locked so nothing could have removed the block backing offset_fsb,
 397          * although it could have moved from the COW to the data fork by another
 398          * thread.
 399          */
 400         if (whichfork == XFS_COW_FORK)
 401                 seq = &XFS_WPC(wpc)->cow_seq;
 402         else
 403                 seq = &XFS_WPC(wpc)->data_seq;
 404
 405         error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
 406                                 &wpc->iomap, seq);
 407         if (error) {
 408                 /*
 409                  * If we failed to find the extent in the COW fork we might have
 410                  * raced with a COW to data fork conversion or truncate.
 411                  * Restart the lookup to catch the extent in the data fork for
 412                  * the former case, but prevent additional retries to avoid
 413                  * looping forever for the latter case.
 414                  */
 415                 if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
 416                         goto retry;
 417                 ASSERT(error != -EAGAIN);
 418                 return error;
 419         }
 420
 421         /*
 422          * Due to merging the return real extent might be larger than the
 423          * original delalloc one.  Trim the return extent to the next COW
 424          * boundary again to force a re-lookup.
 425          */
 426         if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
 427                 loff_t          cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
 428
 429                 if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
 430                         wpc->iomap.length = cow_offset - wpc->iomap.offset;
 431         }
 432
 433         ASSERT(wpc->iomap.offset <= offset);
 434         ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
 435         trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
 436         return 0;
 437 }
 438
 439 static bool
 440 xfs_ioend_needs_wq_completion(
 441         struct iomap_ioend      *ioend)
 442 {
 443         /* Changing inode size requires a transaction. */
 444         if (xfs_ioend_is_append(ioend))
 445                 return true;
 446
 447         /* Extent manipulation requires a transaction. */
 448         if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
 449                 return true;
 450
 451         /* Page cache invalidation cannot be done in irq context. */
 452         if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
 453                 return true;
 454
 455         return false;
 456 }
 457
 458 static int
 459 xfs_submit_ioend(
 460         struct iomap_writepage_ctx *wpc,
 461         int                     status)
 462 {
 463         struct iomap_ioend      *ioend = wpc->ioend;
 464         unsigned int            nofs_flag;
 465
 466         /*
 467          * We can allocate memory here while doing writeback on behalf of
 468          * memory reclaim.  To avoid memory allocation deadlocks set the
 469          * task-wide nofs context for the following operations.
 470          */
 471         nofs_flag = memalloc_nofs_save();
 472
 473         /* Convert CoW extents to regular */
 474         if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
 475                 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 476                                 ioend->io_offset, ioend->io_size);
 477         }
 478
 479         memalloc_nofs_restore(nofs_flag);
 480
 481         /* send ioends that might require a transaction to the completion wq */
 482         if (xfs_ioend_needs_wq_completion(ioend))
 483                 ioend->io_bio.bi_end_io = xfs_end_bio;
 484
 485         if (status)
 486                 return status;
 487         submit_bio(&ioend->io_bio);
 488         return 0;
 489 }
 490
 491 /*
 492  * If the folio has delalloc blocks on it, the caller is asking us to punch them
 493  * out. If we don't, we can leave a stale delalloc mapping covered by a clean
 494  * page that needs to be dirtied again before the delalloc mapping can be
 495  * converted. This stale delalloc mapping can trip up a later direct I/O read
 496  * operation on the same region.
 497  *
 498  * We prevent this by truncating away the delalloc regions on the folio. Because
 499  * they are delalloc, we can do this without needing a transaction. Indeed - if
 500  * we get ENOSPC errors, we have to be able to do this truncation without a
 501  * transaction as there is no space left for block reservation (typically why
 502  * we see a ENOSPC in writeback).
 503  */
 504 static void
 505 xfs_discard_folio(
 506         struct folio            *folio,
 507         loff_t                  pos)
 508 {
 509         struct xfs_inode        *ip = XFS_I(folio->mapping->host);
 510         struct xfs_mount        *mp = ip->i_mount;
 511
 512         if (xfs_is_shutdown(mp))
 513                 return;
 514
 515         xfs_alert_ratelimited(mp,
 516                 "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
 517                         folio, ip->i_ino, pos);
 518
 519         /*
 520          * The end of the punch range is always the offset of the first
 521          * byte of the next folio. Hence the end offset is only dependent on the
 522          * folio itself and not the start offset that is passed in.
 523          */
 524         xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
 525                                 folio_pos(folio) + folio_size(folio), NULL);
 526 }
 527
 528 static const struct iomap_writeback_ops xfs_writeback_ops = {
 529         .map_blocks             = xfs_map_blocks,
 530         .submit_ioend           = xfs_submit_ioend,
 531         .discard_folio          = xfs_discard_folio,
 532 };
 533
 534 struct xfs_zoned_writepage_ctx {
 535         struct iomap_writepage_ctx      ctx;
 536         struct xfs_open_zone            *open_zone;
 537 };
 538
 539 static inline struct xfs_zoned_writepage_ctx *
 540 XFS_ZWPC(struct iomap_writepage_ctx *ctx)
 541 {
 542         return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
 543 }
 544
 545 static int
 546 xfs_zoned_map_blocks(
 547         struct iomap_writepage_ctx *wpc,
 548         struct inode            *inode,
 549         loff_t                  offset,
 550         unsigned int            len)
 551 {
 552         struct xfs_inode        *ip = XFS_I(inode);
 553         struct xfs_mount        *mp = ip->i_mount;
 554         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
 555         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + len);
 556         xfs_filblks_t           count_fsb;
 557         struct xfs_bmbt_irec    imap, del;
 558         struct xfs_iext_cursor  icur;
 559
 560         if (xfs_is_shutdown(mp))
 561                 return -EIO;
 562
 563         XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
 564
 565         /*
 566          * All dirty data must be covered by delalloc extents.  But truncate can
 567          * remove delalloc extents underneath us or reduce their size.
 568          * Returning a hole tells iomap to not write back any data from this
 569          * range, which is the right thing to do in that case.
 570          *
 571          * Otherwise just tell iomap to treat ranges previously covered by a
 572          * delalloc extent as mapped.  The actual block allocation will be done
 573          * just before submitting the bio.
 574          *
 575          * This implies we never map outside folios that are locked or marked
 576          * as under writeback, and thus there is no need check the fork sequence
 577          * count here.
 578          */
 579         xfs_ilock(ip, XFS_ILOCK_EXCL);
 580         if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
 581                 imap.br_startoff = end_fsb;     /* fake a hole past EOF */
 582         if (imap.br_startoff > offset_fsb) {
 583                 imap.br_blockcount = imap.br_startoff - offset_fsb;
 584                 imap.br_startoff = offset_fsb;
 585                 imap.br_startblock = HOLESTARTBLOCK;
 586                 imap.br_state = XFS_EXT_NORM;
 587                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 588                 xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
 589                 return 0;
 590         }
 591         end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
 592         count_fsb = end_fsb - offset_fsb;
 593
 594         del = imap;
 595         xfs_trim_extent(&del, offset_fsb, count_fsb);
 596         xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
 597                         XFS_BMAPI_REMAP);
 598         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 599
 600         wpc->iomap.type = IOMAP_MAPPED;
 601         wpc->iomap.flags = IOMAP_F_DIRTY;
 602         wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
 603         wpc->iomap.offset = offset;
 604         wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
 605         wpc->iomap.flags = IOMAP_F_ANON_WRITE;
 606
 607         trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
 608         return 0;
 609 }
 610
 611 static int
 612 xfs_zoned_submit_ioend(
 613         struct iomap_writepage_ctx *wpc,
 614         int                     status)
 615 {
 616         wpc->ioend->io_bio.bi_end_io = xfs_end_bio;
 617         if (status)
 618                 return status;
 619         xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone);
 620         return 0;
 621 }
 622
 623 static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
 624         .map_blocks             = xfs_zoned_map_blocks,
 625         .submit_ioend           = xfs_zoned_submit_ioend,
 626         .discard_folio          = xfs_discard_folio,
 627 };
 628
 629 STATIC int
 630 xfs_vm_writepages(
 631         struct address_space    *mapping,
 632         struct writeback_control *wbc)
 633 {
 634         struct xfs_inode        *ip = XFS_I(mapping->host);
 635
 636         xfs_iflags_clear(ip, XFS_ITRUNCATED);
 637
 638         if (xfs_is_zoned_inode(ip)) {
 639                 struct xfs_zoned_writepage_ctx  xc = { };
 640                 int                             error;
 641
 642                 error = iomap_writepages(mapping, wbc, &xc.ctx,
 643                                          &xfs_zoned_writeback_ops);
 644                 if (xc.open_zone)
 645                         xfs_open_zone_put(xc.open_zone);
 646                 return error;
 647         } else {
 648                 struct xfs_writepage_ctx        wpc = { };
 649
 650                 return iomap_writepages(mapping, wbc, &wpc.ctx,
 651                                 &xfs_writeback_ops);
 652         }
 653 }
 654
 655 STATIC int
 656 xfs_dax_writepages(
 657         struct address_space    *mapping,
 658         struct writeback_control *wbc)
 659 {
 660         struct xfs_inode        *ip = XFS_I(mapping->host);
 661
 662         xfs_iflags_clear(ip, XFS_ITRUNCATED);
 663         return dax_writeback_mapping_range(mapping,
 664                         xfs_inode_buftarg(ip)->bt_daxdev, wbc);
 665 }
 666
 667 STATIC sector_t
 668 xfs_vm_bmap(
 669         struct address_space    *mapping,
 670         sector_t                block)
 671 {
 672         struct xfs_inode        *ip = XFS_I(mapping->host);
 673
 674         trace_xfs_vm_bmap(ip);
 675
 676         /*
 677          * The swap code (ab-)uses ->bmap to get a block mapping and then
 678          * bypasses the file system for actual I/O.  We really can't allow
 679          * that on reflinks inodes, so we have to skip out here.  And yes,
 680          * 0 is the magic code for a bmap error.
 681          *
 682          * Since we don't pass back blockdev info, we can't return bmap
 683          * information for rt files either.
 684          */
 685         if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 686                 return 0;
 687         return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
 688 }
 689
 690 STATIC int
 691 xfs_vm_read_folio(
 692         struct file             *unused,
 693         struct folio            *folio)
 694 {
 695         return iomap_read_folio(folio, &xfs_read_iomap_ops);
 696 }
 697
 698 STATIC void
 699 xfs_vm_readahead(
 700         struct readahead_control        *rac)
 701 {
 702         iomap_readahead(rac, &xfs_read_iomap_ops);
 703 }
 704
 705 static int
 706 xfs_vm_swap_activate(
 707         struct swap_info_struct         *sis,
 708         struct file                     *swap_file,
 709         sector_t                        *span)
 710 {
 711         struct xfs_inode                *ip = XFS_I(file_inode(swap_file));
 712
 713         /*
 714          * Swap file activation can race against concurrent shared extent
 715          * removal in files that have been cloned.  If this happens,
 716          * iomap_swapfile_iter() can fail because it encountered a shared
 717          * extent even though an operation is in progress to remove those
 718          * shared extents.
 719          *
 720          * This race becomes problematic when we defer extent removal
 721          * operations beyond the end of a syscall (i.e. use async background
 722          * processing algorithms).  Users think the extents are no longer
 723          * shared, but iomap_swapfile_iter() still sees them as shared
 724          * because the refcountbt entries for the extents being removed have
 725          * not yet been updated.  Hence the swapon call fails unexpectedly.
 726          *
 727          * The race condition is currently most obvious from the unlink()
 728          * operation as extent removal is deferred until after the last
 729          * reference to the inode goes away.  We then process the extent
 730          * removal asynchronously, hence triggers the "syscall completed but
 731          * work not done" condition mentioned above.  To close this race
 732          * window, we need to flush any pending inodegc operations to ensure
 733          * they have updated the refcountbt records before we try to map the
 734          * swapfile.
 735          */
 736         xfs_inodegc_flush(ip->i_mount);
 737
 738         /*
 739          * Direct the swap code to the correct block device when this file
 740          * sits on the RT device.
 741          */
 742         sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
 743
 744         return iomap_swapfile_activate(sis, swap_file, span,
 745                         &xfs_read_iomap_ops);
 746 }
 747
 748 const struct address_space_operations xfs_address_space_operations = {
 749         .read_folio             = xfs_vm_read_folio,
 750         .readahead              = xfs_vm_readahead,
 751         .writepages             = xfs_vm_writepages,
 752         .dirty_folio            = iomap_dirty_folio,
 753         .release_folio          = iomap_release_folio,
 754         .invalidate_folio       = iomap_invalidate_folio,
 755         .bmap                   = xfs_vm_bmap,
 756         .migrate_folio          = filemap_migrate_folio,
 757         .is_partially_uptodate  = iomap_is_partially_uptodate,
 758         .error_remove_folio     = generic_error_remove_folio,
 759         .swap_activate          = xfs_vm_swap_activate,
 760 };
 761
 762 const struct address_space_operations xfs_dax_aops = {
 763         .writepages             = xfs_dax_writepages,
 764         .dirty_folio            = noop_dirty_folio,
 765         .swap_activate          = xfs_vm_swap_activate,
 766 };