xfs: commit CoW-based atomic writes atomically
author John Garry <john.g.garry@oracle.com>
Wed, 7 May 2025 21:18:31 +0000 (14:18 -0700)
committer Darrick J. Wong <djwong@kernel.org>
Wed, 7 May 2025 21:25:32 +0000 (14:25 -0700)
When completing a CoW-based write, each extent range mapping update is
covered by a separate transaction.

For a CoW-based atomic write, all mappings must be changed at once, so
change to use a single transaction.

Note that there is a limit on the number of log intent items that can
fit into a single transaction, but this is being ignored for now since
the item count for a typical atomic write would be much lower than what
is typically supported. A typical atomic write would be expected to be
64KB or less, which means at most 16 extent unmaps (assuming 4KB
filesystem blocks), which is quite small.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: add tr_atomic_ioend]
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
fs/xfs/libxfs/xfs_log_rlimit.c
fs/xfs/libxfs/xfs_trans_resv.c
fs/xfs/libxfs/xfs_trans_resv.h
fs/xfs/xfs_file.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h

index d3bd6a86c8fe9b4188a244c8da73cede09135db6..34bba96d30ca7ec2e49228eeaa58a769a6d24002 100644 (file)
@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
         */
        if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
                xfs_trans_resv_calc(mp, resv);
+               resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
                return;
        }
 
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
 
        xfs_trans_resv_calc(mp, resv);
 
+       /* Copy the dynamic transaction reservation types from the running fs */
+       resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
+
        if (xfs_has_reflink(mp)) {
                /*
                 * In the early days of reflink, typical log operation counts
index 580d00ae28573dcf27181db55999d8ac30b72f5c..a841432abf834c271b6021626a9509ef06a3f882 100644 (file)
@@ -1284,6 +1284,15 @@ xfs_calc_namespace_reservations(
        resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 }
 
+STATIC void
+xfs_calc_default_atomic_ioend_reservation(
+       struct xfs_mount        *mp,
+       struct xfs_trans_resv   *resp)
+{
+       /* Start from tr_itruncate so the default scales with the log size. */
+       resp->tr_atomic_ioend = resp->tr_itruncate;
+}
+
 void
 xfs_trans_resv_calc(
        struct xfs_mount        *mp,
@@ -1378,4 +1387,10 @@ xfs_trans_resv_calc(
        resp->tr_itruncate.tr_logcount += logcount_adj;
        resp->tr_write.tr_logcount += logcount_adj;
        resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+       /*
+        * Now that we've finished computing the static reservations, we can
+        * compute the dynamic reservation for atomic writes.
+        */
+       xfs_calc_default_atomic_ioend_reservation(mp, resp);
 }
index d9d0032cbbc5d4a8a3e1ccdfaa289b1304a4112a..670045d417a65fd6ac04a0fb449351d30ba2e648 100644 (file)
@@ -48,6 +48,7 @@ struct xfs_trans_resv {
        struct xfs_trans_res    tr_qm_dqalloc;  /* allocate quota on disk */
        struct xfs_trans_res    tr_sb;          /* modify superblock */
        struct xfs_trans_res    tr_fsyncts;     /* update timestamps on fsync */
+       struct xfs_trans_res    tr_atomic_ioend; /* untorn write completion */
 };
 
 /* shorthand way of accessing reservation structure */
index e8acd6ca8f278656ca688ee3dc50572f1ee8a87a..32883ec8ca2e0e452a821e5885df86bb56c02f18 100644 (file)
@@ -576,7 +576,10 @@ xfs_dio_write_end_io(
        nofs_flag = memalloc_nofs_save();
 
        if (flags & IOMAP_DIO_COW) {
-               error = xfs_reflink_end_cow(ip, offset, size);
+               if (iocb->ki_flags & IOCB_ATOMIC)
+                       error = xfs_reflink_end_atomic_cow(ip, offset, size);
+               else
+                       error = xfs_reflink_end_cow(ip, offset, size);
                if (error)
                        goto out;
        }
index f5d3389160984f316cf0dc1aadb16f40dc1d1d7f..218dee76768b71968eea38673e64654362b9b71d 100644 (file)
@@ -984,6 +984,62 @@ xfs_reflink_end_cow(
        return error;
 }
 
+/*
+ * Fully remap the given byte range of the file's data fork at once, i.e.
+ * within a single transaction, which is the critical part in achieving atomic
+ * behaviour.  The regular CoW end path does not use this function, so as to
+ * keep the block reservation per transaction as low as possible.
+ */
+int
+xfs_reflink_end_atomic_cow(
+       struct xfs_inode                *ip,
+       xfs_off_t                       offset,
+       xfs_off_t                       count)
+{
+       xfs_fileoff_t                   offset_fsb;
+       xfs_fileoff_t                   end_fsb;
+       int                             error = 0;
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_trans                *tp;
+       unsigned int                    resblks;
+
+       trace_xfs_reflink_end_cow(ip, offset, count);
+
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+       /*
+        * Each remapping operation could cause a btree split, so in the worst
+        * case that's one for each block.
+        */
+       resblks = (end_fsb - offset_fsb) *
+                       XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0,
+                       XFS_TRANS_RESERVE, &tp);
+       if (error)
+               return error;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip, 0);
+
+       while (end_fsb > offset_fsb && !error) {
+               error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+                               end_fsb);
+       }
+       if (error) {
+               trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+               goto out_cancel;
+       }
+       error = xfs_trans_commit(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+out_cancel:
+       xfs_trans_cancel(tp);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+}
+
 /*
  * Free all CoW staging blocks that are still referenced by the ondisk refcount
  * metadata.  The ondisk metadata does not track which inode created the
index 379619f2424784bb8820c664754e8320878519ea..412e9b6f2082fc4d24118365158a9bd3c91e5921 100644 (file)
@@ -45,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count, bool cancel_real);
 extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
                xfs_off_t count);
+int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+               xfs_off_t count);
 extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
 extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
                struct file *file_out, loff_t pos_out, loff_t len,