xfs: add support for zoned space reservations
author Christoph Hellwig <hch@lst.de>
Thu, 13 Feb 2025 08:16:06 +0000 (09:16 +0100)
committer Christoph Hellwig <hch@lst.de>
Mon, 3 Mar 2025 15:17:07 +0000 (08:17 -0700)
For zoned file systems, garbage collection (GC) has to take the iolock
and mmaplock after moving data to a new place to synchronize with
readers.  This means that waiting for garbage collection while holding
the iolock can deadlock.

To avoid this, the worst-case number of required blocks has to be
reserved before taking the iolock.  This is done using a new RTAVAILABLE
counter that tracks blocks that are free to write into and don't require
garbage collection.  The new helpers try to take these available blocks,
and if there aren't enough available, wake GC and wait for it to make
progress.  This is done using a list of on-stack reservations to ensure
fairness.
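
A minimal sketch of how a write path might use the new helpers
(hypothetical call site, not part of this patch; ip and count_fsb stand
in for the caller's inode and requested block count):

	struct xfs_zone_alloc_ctx ac = { };
	int error;

	/*
	 * Reserve the worst case block count before taking the iolock,
	 * waiting for GC to free up space if needed.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* ... allocate and write up to ac.reserved_blocks blocks ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	xfs_zoned_space_unreserve(ip, &ac);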

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
fs/xfs/Makefile
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_types.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_alloc.h
fs/xfs/xfs_zone_priv.h
fs/xfs/xfs_zone_space_resv.c [new file with mode: 0644]

index 28bd2627e9ef56db74d69a4efe9310283df04b36..bdedf4bdb1db6c1ee2e1415e21f94218943e54f9 100644 (file)
@@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
 
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
-                                  xfs_zone_alloc.o
+                                  xfs_zone_alloc.o \
+                                  xfs_zone_space_resv.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
index 522c126e52fb6a5e9f549fdd8140193f7ae09d42..63255820b58ac868f3deb9d0dfeb6ae1032f9591 100644 (file)
@@ -40,6 +40,7 @@
 #include "xfs_symlink_remote.h"
 #include "xfs_inode_util.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache              *xfs_bmap_intent_cache;
 
@@ -4788,12 +4789,18 @@ xfs_bmap_del_extent_delay(
        da_diff = da_old - da_new;
        fdblocks = da_diff;
 
-       if (bflags & XFS_BMAPI_REMAP)
+       if (bflags & XFS_BMAPI_REMAP) {
                ;
-       else if (isrt)
-               xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
-       else
+       } else if (isrt) {
+               xfs_rtbxlen_t   rtxlen;
+
+               rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+               if (xfs_is_zoned_inode(ip))
+                       xfs_zoned_add_available(mp, rtxlen);
+               xfs_add_frextents(mp, rtxlen);
+       } else {
                fdblocks += del->br_blockcount;
+       }
 
        xfs_add_fdblocks(mp, fdblocks);
        xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
index dc1db15f0be521873cb05ae74e330382f9f667bf..f6f4f2d4b5dbf50012d0041cc6685421a3e2e07b 100644 (file)
@@ -244,12 +244,22 @@ enum xfs_free_counter {
         */
        XC_FREE_RTEXTENTS,
 
+       /*
+        * Number of RT extents available for use.
+        *
+        * This counter only exists for zoned RT devices and indicates the
+        * number of RT extents that can be directly used by writes.
+        * XC_FREE_RTEXTENTS also includes blocks that have been written
+        * previously and freed, but sit in a rtgroup that still needs a
+        * zone reset.
+        */
+       XC_FREE_RTAVAILABLE,
        XC_FREE_NR,
 };
 
 #define XFS_FREECOUNTER_STR \
        { XC_FREE_BLOCKS,               "blocks" }, \
-       { XC_FREE_RTEXTENTS,            "rtextents" }
+       { XC_FREE_RTEXTENTS,            "rtextents" }, \
+       { XC_FREE_RTAVAILABLE,          "rtavailable" }
 
 /*
  * Type verifier functions
index 24c43f22d088c30cac832aa9f4602d100a8c8950..066805e7205498cdeb55c5c7016ca9c3154e5599 100644 (file)
@@ -465,6 +465,7 @@ xfs_mount_reset_sbqflags(
 static const char *const xfs_free_pool_name[] = {
        [XC_FREE_BLOCKS]        = "free blocks",
        [XC_FREE_RTEXTENTS]     = "free rt extents",
+       [XC_FREE_RTAVAILABLE]   = "available rt extents",
 };
 
 uint64_t
@@ -472,22 +473,27 @@ xfs_default_resblks(
        struct xfs_mount        *mp,
        enum xfs_free_counter   ctr)
 {
-       uint64_t resblks;
-
-       if (ctr == XC_FREE_RTEXTENTS)
+       switch (ctr) {
+       case XC_FREE_BLOCKS:
+               /*
+                * Default to 5% or 8192 FSBs of space reserved, whichever is
+                * smaller.
+                *
+                * This is intended to cover concurrent allocation transactions
+                * when we initially hit ENOSPC.  These each require a 4 block
+                * reservation. Hence by default we cover roughly 2000
+                * concurrent allocation reservations.
+                */
+               return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+       case XC_FREE_RTEXTENTS:
+       case XC_FREE_RTAVAILABLE:
+               if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+                       return xfs_zoned_default_resblks(mp, ctr);
                return 0;
-
-       /*
-        * We default to 5% or 8192 fsbs of space reserved, whichever is
-        * smaller.  This is intended to cover concurrent allocation
-        * transactions when we initially hit enospc. These each require a 4
-        * block reservation. Hence by default we cover roughly 2000 concurrent
-        * allocation reservations.
-        */
-       resblks = mp->m_sb.sb_dblocks;
-       do_div(resblks, 20);
-       resblks = min_t(uint64_t, resblks, 8192);
-       return resblks;
+       default:
+               ASSERT(0);
+               return 0;
+       }
 }
 
 /* Ensure the summary counts are correct. */
index 27ba3013f21b224a97b0b76c9a34197eaf233226..6d6099ef50af12a20465f008654bbff527c65c60 100644 (file)
@@ -363,6 +363,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name,                   \
        TP_ARGS(oz, rgbno, len))
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+
+TRACE_EVENT(xfs_zones_mount,
+       TP_PROTO(struct xfs_mount *mp),
+       TP_ARGS(mp),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgcount)
+               __field(uint32_t, blocks)
+               __field(unsigned int, max_open_zones)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->rgcount = mp->m_sb.sb_rgcount;
+               __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+               __entry->max_open_zones = mp->m_max_open_zones;
+       ),
+       TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+               __entry->rgcount,
+               __entry->blocks,
+               __entry->max_open_zones)
+);
 #endif /* CONFIG_XFS_RT */
 
 TRACE_EVENT(xfs_inodegc_worker,
@@ -5767,6 +5789,7 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
 
 TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
 TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
 
 DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
        TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
index 21734a2d0336a019411c4d227723df1e915bb4f2..3d3f7589bf637d510fb47547ca20d90281e7b7dc 100644 (file)
@@ -922,6 +922,7 @@ xfs_mount_zones(
        xfs_info(mp, "%u zones of %u blocks size (%u max open)",
                 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
                 mp->m_max_open_zones);
+       trace_xfs_zones_mount(mp);
 
        if (bdev_is_zoned(bt->bt_bdev)) {
                error = blkdev_report_zones(bt->bt_bdev,
@@ -939,6 +940,7 @@ xfs_mount_zones(
                }
        }
 
+       xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
        xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
                        iz.available + iz.reclaimable);
        return 0;
index 78cd7bfc6ac8ea7a19cc6b385ce3e62f5d2fce42..28c9cffb72d50245a0d61074548d30d2abd1ce4a 100644 (file)
@@ -5,6 +5,30 @@
 struct iomap_ioend;
 struct xfs_open_zone;
 
+struct xfs_zone_alloc_ctx {
+       struct xfs_open_zone    *open_zone;
+       xfs_filblks_t           reserved_blocks;
+};
+
+/*
+ * Grab any available space, even if it is less than what the caller asked for.
+ */
+#define XFS_ZR_GREEDY          (1U << 0)
+/*
+ * Only grab instantly available space, don't wait or GC.
+ */
+#define XFS_ZR_NOWAIT          (1U << 1)
+/*
+ * Dip into the reserved pool.
+ */
+#define XFS_ZR_RESERVED                (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+               unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+               struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+
 void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
                struct xfs_open_zone **oz);
 int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
@@ -18,6 +42,9 @@ void xfs_zoned_wake_all(struct xfs_mount *mp);
 bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
 void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
 
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+               enum xfs_free_counter ctr);
+
 #ifdef CONFIG_XFS_RT
 int xfs_mount_zones(struct xfs_mount *mp);
 void xfs_unmount_zones(struct xfs_mount *mp);
index 23d2fd6088aefb8b8cbd0f1b7b96b3dae1e9caff..5283d77482d46f1161e55a55870186a0b58fcbb7 100644 (file)
@@ -86,4 +86,6 @@ struct xfs_zone_info {
 
 struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
 
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
 #endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644 (file)
index 0000000..eff9be0
--- /dev/null
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support a rtextsize > 1, so this code
+ * and the allocator itself use file system blocks interchangeably with
+ * realtime extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and add it to the per-mount zi_reclaim_reservations list.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+       struct list_head        entry;
+       struct task_struct      *task;
+       xfs_filblks_t           count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user-available capacity up to which the
+ * file system can be filled, while XC_FREE_RTAVAILABLE counts the blocks
+ * instantly available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE, only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks.  This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+       struct xfs_mount        *mp,
+       enum xfs_free_counter   ctr)
+{
+       switch (ctr) {
+       case XC_FREE_RTEXTENTS:
+               return (uint64_t)XFS_RESERVED_ZONES *
+                       mp->m_groups[XG_TYPE_RTG].blocks +
+                       mp->m_sb.sb_rtreserved;
+       case XC_FREE_RTAVAILABLE:
+               return (uint64_t)XFS_GC_ZONES *
+                       mp->m_groups[XG_TYPE_RTG].blocks;
+       default:
+               ASSERT(0);
+               return 0;
+       }
+}
+
+void
+xfs_zoned_resv_wake_all(
+       struct xfs_mount                *mp)
+{
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     *reservation;
+
+       spin_lock(&zi->zi_reservation_lock);
+       list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
+       spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
+{
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     *reservation;
+
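+       /*
+        * Fast path: if no one is waiting for space, just put the blocks
+        * back into the available counter without taking the lock.
+        */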
+       if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+               xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+               return;
+       }
+
+       spin_lock(&zi->zi_reservation_lock);
+       xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+       count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
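+       /*
+        * Wake waiters in FIFO order as long as their reservations fit into
+        * the now available space.
+        */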
+       list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+               if (reservation->count_fsb > count_fsb)
+                       break;
+               wake_up_process(reservation->task);
+               count_fsb -= reservation->count_fsb;
+       }
+       spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+       struct xfs_mount                *mp)
+{
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+       if (fatal_signal_pending(current))
+               return -EINTR;
+       return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .count_fsb      = count_fsb,
+       };
+       int                             error;
+
+       /*
+        * If there are no waiters, try to directly grab the available blocks
+        * from the percpu counter.
+        *
+        * If the caller wants to dip into the reserved pool, also bypass the
+        * wait list.  This relies on the fact that we have a very generously
+        * sized reserved pool that always has enough space.  If the reserved
+        * allocations fail, we're in trouble.
+        */
+       if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+           (flags & XFS_ZR_RESERVED))) {
+               error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       return error;
+       }
+
+       if (flags & XFS_ZR_NOWAIT)
+               return -EAGAIN;
+
+       spin_lock(&zi->zi_reservation_lock);
+       list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+       while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
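+               /*
+                * Set the task state before trying to grab the blocks so
+                * that a wakeup from xfs_zoned_add_available() between a
+                * failed attempt and schedule() is not lost.
+                */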
+               set_current_state(TASK_KILLABLE);
+
+               error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       break;
+
+               spin_unlock(&zi->zi_reservation_lock);
+               schedule();
+               spin_lock(&zi->zi_reservation_lock);
+       }
+       list_del(&reservation.entry);
+       spin_unlock(&zi->zi_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   *count_fsb,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       s64                             len = *count_fsb;
+       int                             error = -ENOSPC;
+
+       spin_lock(&zi->zi_reservation_lock);
+       len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+       if (len > 0) {
+               *count_fsb = len;
+               error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+                               flags & XFS_ZR_RESERVED);
+       }
+       spin_unlock(&zi->zi_reservation_lock);
+       return error;
+}
+
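+/*
+ * Reserve both the worst case user-visible capacity (XC_FREE_RTEXTENTS) and
+ * the instantly writable blocks (XC_FREE_RTAVAILABLE) for a write, waiting
+ * for GC to make space available if necessary.
+ */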
+int
+xfs_zoned_space_reserve(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       int                             error;
+
+       ASSERT(ac->reserved_blocks == 0);
+       ASSERT(ac->open_zone == NULL);
+
+       error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+                       flags & XFS_ZR_RESERVED);
+       if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+               error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+       if (error)
+               return error;
+
+       error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+       if (error) {
+               xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+               return error;
+       }
+       ac->reserved_blocks = count_fsb;
+       return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+       struct xfs_inode                *ip,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
+
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+       }
+       if (ac->open_zone)
+               xfs_open_zone_put(ac->open_zone);
+}