io_uring: add support for IORING_SETUP_CQE_MIXED

author Jens Axboe <axboe@kernel.dk>
Thu, 7 Aug 2025 20:14:41 +0000 (14:14 -0600)
committer Jens Axboe <axboe@kernel.dk>
Sat, 9 Aug 2025 14:38:04 +0000 (08:38 -0600)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h

index 69337eb1db334510e5e8db669bfb6559dc1fdb84..9396afb01dc80fb1efd455a769e22bac274ec1cf 100644 (file)
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -225,6 +225,12 @@ enum io_uring_sqe_flags_bit {
  /* Use hybrid poll in iopoll process */
  #define IORING_SETUP_HYBRID_IOPOLL     (1U << 17)
  
+/*
+ * Allow both 16-byte and 32-byte CQEs. If a 32-byte CQE is posted, it will
+ * have IORING_CQE_F_32 set in cqe->flags.
+ */
+#define IORING_SETUP_CQE_MIXED         (1U << 18)
+
  enum io_uring_op {
         IORING_OP_NOP,
         IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c

index 4ef69dd58734aee28ed03a5d9c2be5dcedfb12ee..afdb5de0707d8873ac65dc7d2209c616aed242b0 100644 (file)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -598,27 +598,29 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx)
  
  static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
  {
-       size_t cqe_size = sizeof(struct io_uring_cqe);
-
         lockdep_assert_held(&ctx->uring_lock);
  
         /* don't abort if we're dying, entries must get freed */
         if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
                 return;
  
-       if (ctx->flags & IORING_SETUP_CQE32)
-               cqe_size <<= 1;
-
         io_cq_lock(ctx);
         while (!list_empty(&ctx->cq_overflow_list)) {
+               size_t cqe_size = sizeof(struct io_uring_cqe);
                 struct io_uring_cqe *cqe;
                 struct io_overflow_cqe *ocqe;
+               bool is_cqe32 = false;
  
                 ocqe = list_first_entry(&ctx->cq_overflow_list,
                                         struct io_overflow_cqe, list);
+               if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+                   ctx->flags & IORING_SETUP_CQE32) {
+                       is_cqe32 = true;
+                       cqe_size <<= 1;
+               }
  
                 if (!dying) {
-                       if (!io_get_cqe_overflow(ctx, &cqe, true))
+                       if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
                                 break;
                         memcpy(cqe, &ocqe->cqe, cqe_size);
                 }
@@ -730,10 +732,12 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
  {
         struct io_overflow_cqe *ocqe;
         size_t ocq_size = sizeof(struct io_overflow_cqe);
-       bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+       bool is_cqe32 = false;
  
-       if (is_cqe32)
-               ocq_size += sizeof(struct io_uring_cqe);
+       if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+               is_cqe32 = true;
+               ocq_size <<= 1;
+       }
  
         ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
         trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
@@ -751,12 +755,29 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
         return ocqe;
  }
  
+/*
+ * Fill an empty dummy CQE when a 32-byte CQE cannot be posted contiguously
+ * because the ring is a single 16-byte entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+       if (__io_cqring_events(ctx) < ctx->cq_entries) {
+               struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+               memset(cqe, 0, sizeof(*cqe));
+               cqe->flags = IORING_CQE_F_SKIP;
+               ctx->cached_cq_tail++;
+               return true;
+       }
+       return false;
+}
+
  /*
   * writes to the cq entry need to come after reading head; the
   * control dependency is enough as we're using WRITE_ONCE to
   * fill the cq entry
   */
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
  {
         struct io_rings *rings = ctx->rings;
         unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
@@ -770,12 +791,22 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
         if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
                 return false;
  
+       /*
+        * Post dummy CQE if a 32b CQE is needed and there's only room for a
+        * 16b CQE before the ring wraps.
+        */
+       if (cqe32 && ctx->cq_entries - off == 1) {
+               if (!io_fill_nop_cqe(ctx, off))
+                       return false;
+               off = 0;
+       }
+
         /* userspace may cheat modifying the tail, be safe and do min */
         queued = min(__io_cqring_events(ctx), ctx->cq_entries);
         free = ctx->cq_entries - queued;
         /* we need a contiguous range, limit based on the current array offset */
         len = min(free, ctx->cq_entries - off);
-       if (!len)
+       if (len < (cqe32 + 1))
                 return false;
  
         if (ctx->flags & IORING_SETUP_CQE32) {
@@ -793,9 +824,9 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
  {
         struct io_uring_cqe *cqe;
  
-       if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+       if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
                 return false;
-       if (unlikely(!io_get_cqe(ctx, &cqe)))
+       if (unlikely(!io_get_cqe(ctx, &cqe, true)))
                 return false;
  
         memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
@@ -806,14 +837,15 @@ static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx,
  static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
                               u32 cflags)
  {
+       bool cqe32 = cflags & IORING_CQE_F_32;
         struct io_uring_cqe *cqe;
  
-       if (likely(io_get_cqe(ctx, &cqe))) {
+       if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
                 WRITE_ONCE(cqe->user_data, user_data);
                 WRITE_ONCE(cqe->res, res);
                 WRITE_ONCE(cqe->flags, cflags);
  
-               if (ctx->flags & IORING_SETUP_CQE32) {
+               if (cqe32) {
                         WRITE_ONCE(cqe->big_cqe[0], 0);
                         WRITE_ONCE(cqe->big_cqe[1], 0);
                 }
@@ -2735,6 +2767,10 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
                 if (check_shl_overflow(off, 1, &off))
                         return SIZE_MAX;
         }
+       if (flags & IORING_SETUP_CQE_MIXED) {
+               if (cq_entries < 2)
+                       return SIZE_MAX;
+       }
  
  #ifdef CONFIG_SMP
         off = ALIGN(off, SMP_CACHE_BYTES);
@@ -3658,6 +3694,14 @@ static int io_uring_sanitise_params(struct io_uring_params *p)
             !(flags & IORING_SETUP_SINGLE_ISSUER))
                 return -EINVAL;
  
+       /*
+        * Asking for both CQE32 and mixed CQE support is nonsensical: posting
+        * 16-byte CQEs is not supported on a ring set up with CQE32.
+        */
+       if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+           (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+               return -EINVAL;
+
         return 0;
  }
  
@@ -3884,7 +3928,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
                         IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
                         IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
                         IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-                       IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+                       IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+                       IORING_SETUP_CQE_MIXED))
                 return -EINVAL;
  
         return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h

index abc6de227f74d23bbca84b761324072135932da7..2e4f7223a7679a039e6eb244cbdc8e5e24ff5562 100644 (file)
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -75,7 +75,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq)
  unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
                          unsigned int cq_entries, size_t *sq_offset);
  int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
  int io_run_task_work_sig(struct io_ring_ctx *ctx);
  void io_req_defer_failed(struct io_kiocb *req, s32 res);
  bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
@@ -169,25 +169,31 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
  
  static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
                                         struct io_uring_cqe **ret,
-                                       bool overflow)
+                                       bool overflow, bool cqe32)
  {
         io_lockdep_assert_cq_locked(ctx);
  
-       if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
-               if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+       if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+               if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
                         return false;
         }
         *ret = ctx->cqe_cached;
         ctx->cached_cq_tail++;
         ctx->cqe_cached++;
-       if (ctx->flags & IORING_SETUP_CQE32)
+       if (ctx->flags & IORING_SETUP_CQE32) {
+               ctx->cqe_cached++;
+       } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
                 ctx->cqe_cached++;
+               ctx->cached_cq_tail++;
+       }
+       WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
         return true;
  }
  
-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+                               bool cqe32)
  {
-       return io_get_cqe_overflow(ctx, ret, false);
+       return io_get_cqe_overflow(ctx, ret, false, cqe32);
  }
  
  static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
@@ -196,25 +202,24 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
         io_lockdep_assert_cq_locked(ctx);
  
         ctx->submit_state.cq_flush = true;
-       return io_get_cqe(ctx, cqe_ret);
+       return io_get_cqe(ctx, cqe_ret, false);
  }
  
  static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
                                             struct io_kiocb *req)
  {
+       bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
         struct io_uring_cqe *cqe;
  
         /*
-        * If we can't get a cq entry, userspace overflowed the
-        * submission (by quite a lot). Increment the overflow count in
-        * the ring.
+        * If we can't get a cq entry, userspace overflowed the submission
+        * (by quite a lot).
          */
-       if (unlikely(!io_get_cqe(ctx, &cqe)))
+       if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
                 return false;
  
-
         memcpy(cqe, &req->cqe, sizeof(*cqe));
-       if (ctx->flags & IORING_SETUP_CQE32) {
+       if (is_cqe32) {
                 memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
                 memset(&req->big_cqe, 0, sizeof(req->big_cqe));
         }
@@ -239,6 +244,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags)
         req->cqe.flags = cflags;
  }
  
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+       if (ctx->flags & IORING_SETUP_CQE_MIXED)
+               return IORING_CQE_F_32;
+       return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+                                   __u64 extra1, __u64 extra2)
+{
+       req->cqe.res = res;
+       req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+       req->big_cqe.extra1 = extra1;
+       req->big_cqe.extra2 = extra2;
+}
+
  static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
                                               struct io_kiocb *req)
  {
diff --git a/io_uring/register.c b/io_uring/register.c

index a59589249fce7acddebd9f1c2e80ffd2f879abd3..a1a9b2884eae5e76e03c4535e8c935788d9bca3e 100644 (file)
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -396,7 +396,8 @@ static void io_register_free_rings(struct io_ring_ctx *ctx,
  
  #define RESIZE_FLAGS   (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
  #define COPY_FLAGS     (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
-                        IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+                        IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \
+                        IORING_SETUP_CQE_MIXED)
  
  static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
  {
author	Jens Axboe <axboe@kernel.dk>
	Thu, 7 Aug 2025 20:14:41 +0000 (14:14 -0600)
committer	Jens Axboe <axboe@kernel.dk>
	Sat, 9 Aug 2025 14:38:04 +0000 (08:38 -0600)
include/uapi/linux/io_uring.h		patch \| blob \| blame \| history
io_uring/io_uring.c		patch \| blob \| blame \| history
io_uring/io_uring.h		patch \| blob \| blame \| history
io_uring/register.c		patch \| blob \| blame \| history