xfs: add support for zoned space reservations
author Christoph Hellwig <hch@lst.de>
Thu, 13 Feb 2025 08:16:06 +0000 (09:16 +0100)
committer Christoph Hellwig <hch@lst.de>
Mon, 3 Mar 2025 15:17:07 +0000 (08:17 -0700)
For zoned file systems, garbage collection (GC) has to take the iolock
and mmaplock after moving data to a new place to synchronize with
readers.  This means that waiting for garbage collection while holding
the iolock can deadlock.

To avoid this, the worst-case number of required blocks has to be
reserved before taking the iolock.  This is done using a new RTAVAILABLE
counter that tracks blocks that are free to write into and don't require
garbage collection.  The new helpers try to take these available blocks,
and if there aren't enough available, wake GC and wait for it to make
progress.  This is done using a list of on-stack reservations to ensure
fairness.
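
A minimal sketch of how a write path might use the new helpers
(hypothetical call site, not part of this patch; ip and count_fsb stand
in for the caller's inode and requested block count):

	struct xfs_zone_alloc_ctx ac = { };
	int error;

	/*
	 * Reserve the worst case block count before taking the iolock,
	 * waiting for GC to free up space if needed.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, 0, &ac);
	if (error)
		return error;

	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	/* ... allocate and write up to ac.reserved_blocks blocks ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	xfs_zoned_space_unreserve(ip, &ac);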

Co-developed-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Hans Holmberg <hans.holmberg@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
fs/xfs/Makefile
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_types.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_trace.h
fs/xfs/xfs_zone_alloc.c
fs/xfs/xfs_zone_alloc.h
fs/xfs/xfs_zone_priv.h
fs/xfs/xfs_zone_space_resv.c [new file with mode: 0644]

index 28bd2627e9ef56db74d69a4efe9310283df04b36..bdedf4bdb1db6c1ee2e1415e21f94218943e54f9 100644 (file)
@@ -138,7 +138,8 @@ xfs-$(CONFIG_XFS_QUOTA)             += xfs_dquot.o \
 
 # xfs_rtbitmap is shared with libxfs
 xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o \
-                                  xfs_zone_alloc.o
+                                  xfs_zone_alloc.o \
+                                  xfs_zone_space_resv.o
 
 xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
 xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
index 522c126e52fb6a5e9f549fdd8140193f7ae09d42..63255820b58ac868f3deb9d0dfeb6ae1032f9591 100644 (file)
@@ -40,6 +40,7 @@
 #include "xfs_symlink_remote.h"
 #include "xfs_inode_util.h"
 #include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
 
 struct kmem_cache              *xfs_bmap_intent_cache;
 
@@ -4788,12 +4789,18 @@ xfs_bmap_del_extent_delay(
        da_diff = da_old - da_new;
        fdblocks = da_diff;
 
-       if (bflags & XFS_BMAPI_REMAP)
+       if (bflags & XFS_BMAPI_REMAP) {
                ;
-       else if (isrt)
-               xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
-       else
+       } else if (isrt) {
+               xfs_rtbxlen_t   rtxlen;
+
+               rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+               if (xfs_is_zoned_inode(ip))
+                       xfs_zoned_add_available(mp, rtxlen);
+               xfs_add_frextents(mp, rtxlen);
+       } else {
                fdblocks += del->br_blockcount;
+       }
 
        xfs_add_fdblocks(mp, fdblocks);
        xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
index dc1db15f0be521873cb05ae74e330382f9f667bf..f6f4f2d4b5dbf50012d0041cc6685421a3e2e07b 100644 (file)
@@ -244,12 +244,22 @@ enum xfs_free_counter {
         */
        XC_FREE_RTEXTENTS,
 
+       /*
+        * Number of RT extents available for use.
+        *
+        * This counter only exists for zoned RT devices and indicates the
+        * number of RT extents that can be directly used by writes.
+        * XC_FREE_RTEXTENTS also includes blocks that have been written
+        * previously and freed, but sit in a rtgroup that still needs a
+        * zone reset.
+        */
+       XC_FREE_RTAVAILABLE,
        XC_FREE_NR,
 };
 
 #define XFS_FREECOUNTER_STR \
        { XC_FREE_BLOCKS,               "blocks" }, \
-       { XC_FREE_RTEXTENTS,            "rtextents" }
+       { XC_FREE_RTEXTENTS,            "rtextents" }, \
+       { XC_FREE_RTAVAILABLE,          "rtavailable" }
 
 /*
  * Type verifier functions
index 24c43f22d088c30cac832aa9f4602d100a8c8950..066805e7205498cdeb55c5c7016ca9c3154e5599 100644 (file)
@@ -465,6 +465,7 @@ xfs_mount_reset_sbqflags(
 static const char *const xfs_free_pool_name[] = {
        [XC_FREE_BLOCKS]        = "free blocks",
        [XC_FREE_RTEXTENTS]     = "free rt extents",
+       [XC_FREE_RTAVAILABLE]   = "available rt extents",
 };
 
 uint64_t
@@ -472,22 +473,27 @@ xfs_default_resblks(
        struct xfs_mount        *mp,
        enum xfs_free_counter   ctr)
 {
-       uint64_t resblks;
-
-       if (ctr == XC_FREE_RTEXTENTS)
+       switch (ctr) {
+       case XC_FREE_BLOCKS:
+               /*
+                * Default to 5% or 8192 FSBs of space reserved, whichever is
+                * smaller.
+                *
+                * This is intended to cover concurrent allocation transactions
+                * when we initially hit ENOSPC.  These each require a 4 block
+                * reservation. Hence by default we cover roughly 2000
+                * concurrent allocation reservations.
+                */
+               return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+       case XC_FREE_RTEXTENTS:
+       case XC_FREE_RTAVAILABLE:
+               if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+                       return xfs_zoned_default_resblks(mp, ctr);
                return 0;
-
-       /*
-        * We default to 5% or 8192 fsbs of space reserved, whichever is
-        * smaller.  This is intended to cover concurrent allocation
-        * transactions when we initially hit enospc. These each require a 4
-        * block reservation. Hence by default we cover roughly 2000 concurrent
-        * allocation reservations.
-        */
-       resblks = mp->m_sb.sb_dblocks;
-       do_div(resblks, 20);
-       resblks = min_t(uint64_t, resblks, 8192);
-       return resblks;
+       default:
+               ASSERT(0);
+               return 0;
+       }
 }
 
 /* Ensure the summary counts are correct. */
index 27ba3013f21b224a97b0b76c9a34197eaf233226..6d6099ef50af12a20465f008654bbff527c65c60 100644 (file)
@@ -363,6 +363,28 @@ DEFINE_EVENT(xfs_zone_alloc_class, name,                   \
        TP_ARGS(oz, rgbno, len))
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
 DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+
+TRACE_EVENT(xfs_zones_mount,
+       TP_PROTO(struct xfs_mount *mp),
+       TP_ARGS(mp),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_rgnumber_t, rgcount)
+               __field(uint32_t, blocks)
+               __field(unsigned int, max_open_zones)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->rgcount = mp->m_sb.sb_rgcount;
+               __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+               __entry->max_open_zones = mp->m_max_open_zones;
+       ),
+       TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+               __entry->rgcount,
+               __entry->blocks,
+               __entry->max_open_zones)
+);
 #endif /* CONFIG_XFS_RT */
 
 TRACE_EVENT(xfs_inodegc_worker,
@@ -5767,6 +5789,7 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
 
 TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
 TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
 
 DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
        TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
index 21734a2d0336a019411c4d227723df1e915bb4f2..3d3f7589bf637d510fb47547ca20d90281e7b7dc 100644 (file)
@@ -922,6 +922,7 @@ xfs_mount_zones(
        xfs_info(mp, "%u zones of %u blocks size (%u max open)",
                 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
                 mp->m_max_open_zones);
+       trace_xfs_zones_mount(mp);
 
        if (bdev_is_zoned(bt->bt_bdev)) {
                error = blkdev_report_zones(bt->bt_bdev,
@@ -939,6 +940,7 @@ xfs_mount_zones(
                }
        }
 
+       xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
        xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
                        iz.available + iz.reclaimable);
        return 0;
index 78cd7bfc6ac8ea7a19cc6b385ce3e62f5d2fce42..28c9cffb72d50245a0d61074548d30d2abd1ce4a 100644 (file)
@@ -5,6 +5,30 @@
 struct iomap_ioend;
 struct xfs_open_zone;
 
+struct xfs_zone_alloc_ctx {
+       struct xfs_open_zone    *open_zone;
+       xfs_filblks_t           reserved_blocks;
+};
+
+/*
+ * Grab any available space, even if it is less than what the caller asked for.
+ */
+#define XFS_ZR_GREEDY          (1U << 0)
+/*
+ * Only grab instantly available space, don't wait or GC.
+ */
+#define XFS_ZR_NOWAIT          (1U << 1)
+/*
+ * Dip into the reserved pool.
+ */
+#define XFS_ZR_RESERVED                (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb,
+               unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_inode *ip,
+               struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+
 void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
                struct xfs_open_zone **oz);
 int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
@@ -18,6 +42,9 @@ void xfs_zoned_wake_all(struct xfs_mount *mp);
 bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
 void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
 
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+               enum xfs_free_counter ctr);
+
 #ifdef CONFIG_XFS_RT
 int xfs_mount_zones(struct xfs_mount *mp);
 void xfs_unmount_zones(struct xfs_mount *mp);
index 23d2fd6088aefb8b8cbd0f1b7b96b3dae1e9caff..5283d77482d46f1161e55a55870186a0b58fcbb7 100644 (file)
@@ -86,4 +86,6 @@ struct xfs_zone_info {
 
 struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, bool is_gc);
 
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
 #endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644 (file)
index 0000000..eff9be0
--- /dev/null
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support a rtextsize > 1, so this code
+ * and the allocator itself use file system blocks interchangeably with
+ * realtime extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and add it to the per-mount zi_reclaim_reservations list.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+       struct list_head        entry;
+       struct task_struct      *task;
+       xfs_filblks_t           count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user-available capacity up to which the
+ * file system can be filled, while XC_FREE_RTAVAILABLE counts the blocks
+ * instantly available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE, only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks.  This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+       struct xfs_mount        *mp,
+       enum xfs_free_counter   ctr)
+{
+       switch (ctr) {
+       case XC_FREE_RTEXTENTS:
+               return (uint64_t)XFS_RESERVED_ZONES *
+                       mp->m_groups[XG_TYPE_RTG].blocks +
+                       mp->m_sb.sb_rtreserved;
+       case XC_FREE_RTAVAILABLE:
+               return (uint64_t)XFS_GC_ZONES *
+                       mp->m_groups[XG_TYPE_RTG].blocks;
+       default:
+               ASSERT(0);
+               return 0;
+       }
+}
+
+void
+xfs_zoned_resv_wake_all(
+       struct xfs_mount                *mp)
+{
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     *reservation;
+
+       spin_lock(&zi->zi_reservation_lock);
+       list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+               wake_up_process(reservation->task);
+       spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+       struct xfs_mount                *mp,
+       xfs_filblks_t                   count_fsb)
+{
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     *reservation;
+
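+       /*
+        * Fast path: if no one is waiting for space, just put the blocks
+        * back into the available counter without taking the lock.
+        */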
+       if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+               xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+               return;
+       }
+
+       spin_lock(&zi->zi_reservation_lock);
+       xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+       count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
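+       /*
+        * Wake waiters in FIFO order as long as their reservations fit into
+        * the now available space.
+        */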
+       list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+               if (reservation->count_fsb > count_fsb)
+                       break;
+               wake_up_process(reservation->task);
+               count_fsb -= reservation->count_fsb;
+       }
+       spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+       struct xfs_mount                *mp)
+{
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+       if (fatal_signal_pending(current))
+               return -EINTR;
+       return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       struct xfs_zone_reservation     reservation = {
+               .task           = current,
+               .count_fsb      = count_fsb,
+       };
+       int                             error;
+
+       /*
+        * If there are no waiters, try to directly grab the available blocks
+        * from the percpu counter.
+        *
+        * If the caller wants to dip into the reserved pool, also bypass the
+        * wait list.  This relies on the fact that we have a very generously
+        * sized reserved pool that always has enough space.  If the reserved
+        * allocations fail, we're in trouble.
+        */
+       if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+           (flags & XFS_ZR_RESERVED))) {
+               error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       return error;
+       }
+
+       if (flags & XFS_ZR_NOWAIT)
+               return -EAGAIN;
+
+       spin_lock(&zi->zi_reservation_lock);
+       list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+       while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
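+               /*
+                * Set the task state before trying to grab the blocks so
+                * that a wakeup from xfs_zoned_add_available() between a
+                * failed attempt and schedule() is not lost.
+                */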
+               set_current_state(TASK_KILLABLE);
+
+               error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+                               flags & XFS_ZR_RESERVED);
+               if (error != -ENOSPC)
+                       break;
+
+               spin_unlock(&zi->zi_reservation_lock);
+               schedule();
+               spin_lock(&zi->zi_reservation_lock);
+       }
+       list_del(&reservation.entry);
+       spin_unlock(&zi->zi_reservation_lock);
+
+       __set_current_state(TASK_RUNNING);
+       return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   *count_fsb,
+       unsigned int                    flags)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       struct xfs_zone_info            *zi = mp->m_zone_info;
+       s64                             len = *count_fsb;
+       int                             error = -ENOSPC;
+
+       spin_lock(&zi->zi_reservation_lock);
+       len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+       if (len > 0) {
+               *count_fsb = len;
+               error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+                               flags & XFS_ZR_RESERVED);
+       }
+       spin_unlock(&zi->zi_reservation_lock);
+       return error;
+}
+
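+/*
+ * Reserve both the worst case user-visible capacity (XC_FREE_RTEXTENTS) and
+ * the instantly writable blocks (XC_FREE_RTAVAILABLE) for a write, waiting
+ * for GC to make space available if necessary.
+ */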
+int
+xfs_zoned_space_reserve(
+       struct xfs_inode                *ip,
+       xfs_filblks_t                   count_fsb,
+       unsigned int                    flags,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       struct xfs_mount                *mp = ip->i_mount;
+       int                             error;
+
+       ASSERT(ac->reserved_blocks == 0);
+       ASSERT(ac->open_zone == NULL);
+
+       error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+                       flags & XFS_ZR_RESERVED);
+       if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+               error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
+       if (error)
+               return error;
+
+       error = xfs_zoned_reserve_available(ip, count_fsb, flags);
+       if (error) {
+               xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+               return error;
+       }
+       ac->reserved_blocks = count_fsb;
+       return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+       struct xfs_inode                *ip,
+       struct xfs_zone_alloc_ctx       *ac)
+{
+       if (ac->reserved_blocks > 0) {
+               struct xfs_mount        *mp = ip->i_mount;
+
+               xfs_zoned_add_available(mp, ac->reserved_blocks);
+               xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+       }
+       if (ac->open_zone)
+               xfs_open_zone_put(ac->open_zone);
+}