xfs: allow sysadmins to specify a maximum atomic write limit at mount time
author Darrick J. Wong <djwong@kernel.org>
Wed, 7 May 2025 21:18:34 +0000 (14:18 -0700)
committer Darrick J. Wong <djwong@kernel.org>
Wed, 7 May 2025 21:25:33 +0000 (14:25 -0700)
Introduce a mount option to allow sysadmins to specify the maximum size
of an atomic write.  If the filesystem can work with the supplied value,
that becomes the new guaranteed maximum.

The value mustn't be too big for the existing filesystem geometry (max
write size, max AG/rtgroup size).  We dynamically recompute the
tr_atomic_write transaction reservation based on the given block size,
check that the current log size isn't less than the new minimum log size
constraints, and set a new maximum.

The actual software atomic write max is still computed based off of
tr_atomic_ioend the same way it has for the past few commits.  Note also
that xfs_calc_atomic_write_log_geometry is non-static because mkfs will
need that.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Documentation/admin-guide/xfs.rst
fs/xfs/libxfs/xfs_trans_resv.c
fs/xfs/libxfs/xfs_trans_resv.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_super.c
fs/xfs/xfs_trace.h

index 5becb441c3cba0fe8d1e9a4f0f1bc16bd713810d..a18328a5fb93be33b943b9941bfba6fa09c745cd 100644 (file)
@@ -151,6 +151,17 @@ When mounting an XFS filesystem, the following options are accepted.
        optional, and the log section can be separate from the data
        section or contained within it.
 
+  max_atomic_write=value
+       Set the maximum size of an atomic write.  The size may be
+       specified in bytes, in kilobytes with a "k" suffix, in megabytes
+       with a "m" suffix, or in gigabytes with a "g" suffix.  The size
+       must be a power of two and a multiple of the filesystem block
+       size.  The size cannot be larger than the maximum write size,
+       larger than the size of any allocation group, or larger than the
+       size of a remapping operation that the log can complete
+       atomically.
+
+       If this option is not specified, the default is to set the
+       maximum I/O completion size to allow each CPU to handle one at
+       a time.
+
   max_open_zones=value
        Specify the max number of zones to keep open for writing on a
        zoned rt device. Many open zones aids file data separation
index e73c09fbd24c30323f29a91d870e30b92e1dcdf7..86a111d0f2fc7cfa59c50da2420f023a6fe813b8 100644 (file)
@@ -1488,3 +1488,72 @@ xfs_calc_max_atomic_write_fsblocks(
 
        return ret;
 }
+
+/*
+ * Work out the log geometry for an atomic write of @blockcount blocks,
+ * assuming the worst case in which every block needs separate handling.
+ *
+ * The candidate tr_atomic_ioend log reservation is handed back through
+ * @new_logres and the result of xfs_log_calc_minimum_size() for that
+ * reservation is returned.  A return value of 0 means something went wrong,
+ * in which case @new_logres is not written.
+ */
+xfs_extlen_t
+xfs_calc_atomic_write_log_geometry(
+       struct xfs_mount        *mp,
+       xfs_extlen_t            blockcount,
+       unsigned int            *new_logres)
+{
+       struct xfs_trans_res    *ioend_res = &M_RES(mp)->tr_atomic_ioend;
+       uint                    saved_logres = ioend_res->tr_logres;
+       unsigned int            intent_size, step_size;
+       unsigned int            total;
+       xfs_extlen_t            min_logblocks;
+
+       ASSERT(blockcount > 0);
+
+       xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+
+       intent_size = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
+
+       /* total = blockcount * intent_size + step_size; give up on overflow */
+       if (check_mul_overflow(blockcount, intent_size, &total))
+               return 0;
+       if (check_add_overflow(total, step_size, &total))
+               return 0;
+
+       /*
+        * Install the candidate reservation only for as long as it takes to
+        * compute the minimum log size, then put the saved value back.
+        */
+       ioend_res->tr_logres = total;
+       min_logblocks = xfs_log_calc_minimum_size(mp);
+       ioend_res->tr_logres = saved_logres;
+
+       trace_xfs_calc_max_atomic_write_log_geometry(mp, intent_size,
+                       step_size, blockcount, min_logblocks, total);
+
+       *new_logres = total;
+       return min_logblocks;
+}
+
+/*
+ * Set up the tr_atomic_ioend reservation for out of place atomic writes of
+ * @blockcount blocks.  Returns 0 on success, or -EINVAL if the geometry
+ * could not be computed or needs more log blocks than sb_logblocks provides.
+ */
+int
+xfs_calc_atomic_write_reservation(
+       struct xfs_mount        *mp,
+       xfs_extlen_t            blockcount)
+{
+       unsigned int            logres;
+       xfs_extlen_t            needed_logblocks;
+
+       /* No specific atomic write size requested; fall back to defaults. */
+       if (!blockcount) {
+               xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+               return 0;
+       }
+
+       needed_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
+                       &logres);
+       if (needed_logblocks == 0)
+               return -EINVAL;
+       if (needed_logblocks > mp->m_sb.sb_logblocks)
+               return -EINVAL;
+
+       M_RES(mp)->tr_atomic_ioend.tr_logres = logres;
+       return 0;
+}
index a6d303b836883feead0db7536006d327eb9218e8..336279e0fc61371ea469e8b66b0143af3362d18c 100644 (file)
@@ -122,5 +122,9 @@ unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
 unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
 
 xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
+/*
+ * Compute the log blocks and tr_atomic_ioend log reservation needed for an
+ * atomic write of @blockcount blocks; returns 0 if something went wrong.
+ */
+xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
+               xfs_extlen_t blockcount, unsigned int *new_logres);
+/*
+ * Set the tr_atomic_ioend log reservation for atomic writes of @blockcount
+ * blocks, or the default reservation if @blockcount is zero.  Returns 0 or
+ * -EINVAL.
+ */
+int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
+               xfs_extlen_t blockcount);
 
 #endif /* __XFS_TRANS_RESV_H__ */
index 86089e27b8e765a40688e546e4471734dd8713d8..29276fe60df9c6d21e3ad0a3de551c92a46ef239 100644 (file)
@@ -742,6 +742,82 @@ xfs_calc_atomic_write_unit_max(
                        max_agsize, max_rgsize);
 }
 
+/*
+ * Validate a max_atomic_write value (in bytes) supplied by userspace via
+ * mount option and, if the filesystem can support it, make it the new atomic
+ * write maximum.  A value of zero skips validation and selects the default
+ * reservation.  Returns 0 on success or a negative errno.
+ */
+int
+xfs_set_max_atomic_write_opt(
+       struct xfs_mount        *mp,
+       unsigned long long      new_max_bytes)
+{
+       const xfs_filblks_t     req_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
+       const xfs_extlen_t      write_cap = xfs_calc_atomic_write_max(mp);
+       const xfs_extlen_t      group_cap =
+               max(mp->m_groups[XG_TYPE_AG].blocks,
+                   mp->m_groups[XG_TYPE_RTG].blocks);
+       const xfs_extlen_t      group_write_cap =
+               max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp));
+       const unsigned long long req_kb = new_max_bytes >> 10;
+       int                     ret;
+
+       /* Nothing to validate if no explicit limit was requested. */
+       if (new_max_bytes == 0)
+               goto apply;
+
+       ASSERT(write_cap <= U32_MAX);
+
+       /* generic_atomic_write_valid enforces power of two length */
+       if (!is_power_of_2(new_max_bytes)) {
+               xfs_warn(mp,
+ "max atomic write size of %llu bytes is not a power of 2",
+                               new_max_bytes);
+               return -EINVAL;
+       }
+
+       /* The limit must cover a whole number of filesystem blocks. */
+       if (new_max_bytes & mp->m_blockmask) {
+               xfs_warn(mp,
+ "max atomic write size of %llu bytes not aligned with fsblock",
+                               new_max_bytes);
+               return -EINVAL;
+       }
+
+       if (req_fsbs > write_cap) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max write size %lluk",
+                               req_kb,
+                               XFS_FSB_TO_B(mp, write_cap) >> 10);
+               return -EINVAL;
+       }
+
+       if (req_fsbs > group_cap) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than allocation group size %lluk",
+                               req_kb,
+                               XFS_FSB_TO_B(mp, group_cap) >> 10);
+               return -EINVAL;
+       }
+
+       if (req_fsbs > group_write_cap) {
+               xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
+                               req_kb,
+                               XFS_FSB_TO_B(mp, group_write_cap) >> 10);
+               return -EINVAL;
+       }
+
+apply:
+       /* The reservation code decides whether the log can support this. */
+       ret = xfs_calc_atomic_write_reservation(mp, req_fsbs);
+       if (ret) {
+               xfs_warn(mp,
+ "cannot support completing atomic writes of %lluk",
+                               req_kb);
+               return ret;
+       }
+
+       xfs_calc_atomic_write_unit_max(mp);
+       mp->m_awu_max_bytes = new_max_bytes;
+       return 0;
+}
+
 /* Compute maximum possible height for realtime btree types for this fs. */
 static inline void
 xfs_rtbtree_compute_maxlevels(
@@ -1163,7 +1239,9 @@ xfs_mountfs(
         * derived from transaction reservations, so we must do this after the
         * log is fully initialized.
         */
-       xfs_calc_atomic_write_unit_max(mp);
+       error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
+       if (error)
+               goto out_agresv;
 
        return 0;
 
index e2abf31438e0e2d1060f16714f30b4b6ee717b51..5b5df70570c0db88c1a404df28118bace5a12d59 100644 (file)
@@ -237,6 +237,9 @@ typedef struct xfs_mount {
        unsigned int            m_max_open_zones;
        unsigned int            m_zonegc_low_space;
 
+       /* max_atomic_write mount option value */
+       unsigned long long      m_awu_max_bytes;
+
        /*
         * Bitsets of per-fs metadata that have been checked and/or are sick.
         * Callers must hold m_sb_lock to access these two fields.
@@ -804,4 +807,7 @@ static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
        percpu_counter_add(&mp->m_delalloc_blks, delta);
 }
 
+/*
+ * Validate and apply a max_atomic_write mount option value, given in bytes.
+ * Returns 0 on success or a negative errno if the value cannot be supported.
+ */
+int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
+               unsigned long long new_max_bytes);
+
 #endif /* __XFS_MOUNT_H__ */
index 77a3c003fc4ff7c6f1d04fe1df04ca4ea95166e1..8e3ae1749855adcbb0a1c04de9a0aeb15a1ae135 100644 (file)
@@ -111,7 +111,7 @@ enum {
        Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
        Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
        Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
-       Opt_lifetime, Opt_nolifetime,
+       Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
 };
 
 static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -159,6 +159,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
        fsparam_u32("max_open_zones",   Opt_max_open_zones),
        fsparam_flag("lifetime",        Opt_lifetime),
        fsparam_flag("nolifetime",      Opt_nolifetime),
+       fsparam_string("max_atomic_write",      Opt_max_atomic_write),
        {}
 };
 
@@ -241,6 +242,9 @@ xfs_fs_show_options(
 
        if (mp->m_max_open_zones)
                seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+       if (mp->m_awu_max_bytes)
+               seq_printf(m, ",max_atomic_write=%lluk",
+                               mp->m_awu_max_bytes >> 10);
 
        return 0;
 }
@@ -1343,6 +1347,42 @@ suffix_kstrtoint(
        return ret;
 }
 
+/*
+ * Parse an unsigned integer that may carry a binary size suffix.
+ *
+ * @s is parsed in the given @base.  A single trailing 'k', 'm' or 'g' (in
+ * either case) scales the value by 2^10, 2^20 or 2^30 respectively.  On
+ * success the scaled value is stored in @res and 0 is returned.
+ *
+ * Returns -ENOMEM if the working copy of the string cannot be allocated, or
+ * -EINVAL if the string is empty, is not a valid number, or the scaled value
+ * does not fit in an unsigned long long.  @res is left untouched on failure.
+ */
+static int
+suffix_kstrtoull(
+       const char              *s,
+       unsigned int            base,
+       unsigned long long      *res)
+{
+       unsigned int            shift_left_factor = 0;
+       unsigned long long      _res;
+       size_t                  len;
+       char                    *value;
+       int                     ret = 0;
+
+       value = kstrdup(s, GFP_KERNEL);
+       if (!value)
+               return -ENOMEM;
+
+       /* An empty string has no last character to inspect. */
+       len = strlen(value);
+       if (len == 0) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       switch (value[len - 1]) {
+       case 'K':
+       case 'k':
+               shift_left_factor = 10;
+               break;
+       case 'M':
+       case 'm':
+               shift_left_factor = 20;
+               break;
+       case 'G':
+       case 'g':
+               shift_left_factor = 30;
+               break;
+       default:
+               break;
+       }
+
+       /* Strip the suffix so that kstrtoull only sees the digits. */
+       if (shift_left_factor)
+               value[len - 1] = '\0';
+
+       if (kstrtoull(value, base, &_res)) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       /* Reject values that would wrap around once the suffix is applied. */
+       if (_res > (ULLONG_MAX >> shift_left_factor)) {
+               ret = -EINVAL;
+               goto out_free;
+       }
+
+       *res = _res << shift_left_factor;
+
+out_free:
+       kfree(value);
+       return ret;
+}
+
 static inline void
 xfs_fs_warn_deprecated(
        struct fs_context       *fc,
@@ -1527,6 +1567,14 @@ xfs_fs_parse_param(
        case Opt_nolifetime:
                parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
                return 0;
+       case Opt_max_atomic_write:
+               if (suffix_kstrtoull(param->string, 10,
+                                    &parsing_mp->m_awu_max_bytes)) {
+                       xfs_warn(parsing_mp,
+ "max atomic write size must be positive integer");
+                       return -EINVAL;
+               }
+               return 0;
        default:
                xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
                return -EINVAL;
@@ -2137,6 +2185,14 @@ xfs_fs_reconfigure(
        if (error)
                return error;
 
+       /* Validate new max_atomic_write option before making other changes */
+       if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+               error = xfs_set_max_atomic_write_opt(mp,
+                               new_mp->m_awu_max_bytes);
+               if (error)
+                       return error;
+       }
+
        /* inode32 -> inode64 */
        if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
                mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
index d5ae00f8e04cf43f94540a7afa3f4ad1b6487daf..01d284a1c75961a528dd4386a1c4ac9c005b535d 100644 (file)
@@ -230,6 +230,39 @@ TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
                  __entry->blockcount)
 );
 
+/*
+ * Emitted by xfs_calc_atomic_write_log_geometry with the inputs and results
+ * of its computation.  The current sb_logblocks is recorded alongside and
+ * printed as "logblocks".
+ */
+TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry,
+       TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+                unsigned int step_size, unsigned int blockcount,
+                unsigned int min_logblocks, unsigned int logres),
+       TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned int, per_intent)
+               __field(unsigned int, step_size)
+               __field(unsigned int, blockcount)
+               __field(unsigned int, min_logblocks)
+               __field(unsigned int, cur_logblocks)
+               __field(unsigned int, logres)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->per_intent = per_intent;
+               __entry->step_size = step_size;
+               __entry->blockcount = blockcount;
+               __entry->min_logblocks = min_logblocks;
+               __entry->cur_logblocks = mp->m_sb.sb_logblocks;
+               __entry->logres = logres;
+       ),
+       TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->per_intent,
+                 __entry->step_size,
+                 __entry->blockcount,
+                 __entry->min_logblocks,
+                 __entry->cur_logblocks,
+                 __entry->logres)
+);
+
 TRACE_EVENT(xlog_intent_recovery_failed,
        TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
                 int error),