static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
- size_t cqe_size = sizeof(struct io_uring_cqe);
-
lockdep_assert_held(&ctx->uring_lock);
/* don't abort if we're dying, entries must get freed */
if (!dying && __io_cqring_events(ctx) == ctx->cq_entries)
return;
- if (ctx->flags & IORING_SETUP_CQE32)
- cqe_size <<= 1;
-
io_cq_lock(ctx);
while (!list_empty(&ctx->cq_overflow_list)) {
+ size_t cqe_size = sizeof(struct io_uring_cqe);
struct io_uring_cqe *cqe;
struct io_overflow_cqe *ocqe;
+ bool is_cqe32 = false;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
+ if (ocqe->cqe.flags & IORING_CQE_F_32 ||
+ ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
+ cqe_size <<= 1;
+ }
if (!dying) {
- if (!io_get_cqe_overflow(ctx, &cqe, true))
+ if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32))
break;
memcpy(cqe, &ocqe->cqe, cqe_size);
}
{
struct io_overflow_cqe *ocqe;
size_t ocq_size = sizeof(struct io_overflow_cqe);
- bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32);
+ bool is_cqe32 = false;
- if (is_cqe32)
- ocq_size += sizeof(struct io_uring_cqe);
+ if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) {
+ is_cqe32 = true;
+ ocq_size += sizeof(struct io_uring_cqe);
+ }
ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
return ocqe;
}
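For reference, the size bump covers a second io_uring_cqe appended past the embedded one; a quick sketch of the arithmetic, assuming the upstream layout of struct io_overflow_cqe (a list head followed by the inline CQE):

    struct io_overflow_cqe {
        struct list_head list;
        struct io_uring_cqe cqe;    /* 16b base CQE */
    };

    /*
     * 16b completion:  ocq_size = sizeof(struct io_overflow_cqe)
     * 32b completion:  ocq_size = sizeof(struct io_overflow_cqe) +
     *                             sizeof(struct io_uring_cqe)
     */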
+/*
+ * Fill an empty dummy CQE for the case where a 32b CQE needs posting, but
+ * the ring is only a single 16b entry away from wrapping.
+ */
+static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off)
+{
+ if (__io_cqring_events(ctx) < ctx->cq_entries) {
+ struct io_uring_cqe *cqe = &ctx->rings->cqes[off];
+
+ memset(cqe, 0, sizeof(*cqe));
+ cqe->flags = IORING_CQE_F_SKIP;
+ ctx->cached_cq_tail++;
+ return true;
+ }
+ return false;
+}
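For context, a ring created with IORING_SETUP_CQE_MIXED hands userspace a CQ of 16b slots in which 32b CQEs span two slots and the dummy entries above must be skipped. A hedged reaping sketch against the raw ring; struct mixed_cq and handle_cqe() are illustrative, not an existing liburing API, and memory barriers on head/tail are elided:

    #include <linux/io_uring.h>    /* assumes a uapi header with IORING_CQE_F_32/F_SKIP */

    struct mixed_cq {
        unsigned *khead;                /* shared CQ head */
        unsigned *ktail;                /* shared CQ tail */
        unsigned ring_mask;
        struct io_uring_cqe *cqes;      /* array of 16b slots */
    };

    static void reap_mixed_cq(struct mixed_cq *cq,
                              void (*handle_cqe)(struct io_uring_cqe *))
    {
        unsigned head = *cq->khead;
        unsigned tail = *cq->ktail;     /* load-acquire in real code */

        while (head != tail) {
            struct io_uring_cqe *cqe = &cq->cqes[head & cq->ring_mask];

            /* dummy pad entry posted just before a wrap, nothing to do */
            if (!(cqe->flags & IORING_CQE_F_SKIP))
                handle_cqe(cqe);
            /* a 32b CQE occupies two consecutive 16b slots */
            head += (cqe->flags & IORING_CQE_F_32) ? 2 : 1;
        }
        *cq->khead = head;              /* store-release in real code */
    }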
+
/*
* writes to the cq entry need to come after reading head; the
* control dependency is enough as we're using WRITE_ONCE to
* fill the cq entry
*/
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32)
{
struct io_rings *rings = ctx->rings;
unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1);
if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)))
return false;
+ /*
+ * Post dummy CQE if a 32b CQE is needed and there's only room for a
+ * 16b CQE before the ring wraps.
+ */
+ if (cqe32 && ctx->cq_entries - off == 1) {
+ if (!io_fill_nop_cqe(ctx, off))
+ return false;
+ off = 0;
+ }
+
/* userspace may cheat modifying the tail, be safe and do min */
queued = min(__io_cqring_events(ctx), ctx->cq_entries);
free = ctx->cq_entries - queued;
/* we need a contiguous range, limit based on the current array offset */
len = min(free, ctx->cq_entries - off);
- if (!len)
+ if (len < (cqe32 + 1))
return false;
if (ctx->flags & IORING_SETUP_CQE32) {
{
struct io_uring_cqe *cqe;
- if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32)))
+ if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))))
return false;
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, true)))
return false;
memcpy(cqe, src_cqe, 2 * sizeof(*cqe));
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
+ bool cqe32 = cflags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
- if (likely(io_get_cqe(ctx, &cqe))) {
+ if (likely(io_get_cqe(ctx, &cqe, cqe32))) {
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (cqe32) {
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
if (check_shl_overflow(off, 1, &off))
return SIZE_MAX;
}
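+ /*
+  * A 32b CQE spans two 16b slots on a mixed ring, so the CQ must be able
+  * to hold at least two entries.
+  */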
+ if (flags & IORING_SETUP_CQE_MIXED) {
+ if (cq_entries < 2)
+ return SIZE_MAX;
+ }
#ifdef CONFIG_SMP
off = ALIGN(off, SMP_CACHE_BYTES);
!(flags & IORING_SETUP_SINGLE_ISSUER))
return -EINVAL;
+ /*
+ * It's nonsensical to ask for both CQE32 and mixed CQE support, as posting
+ * 16b CQEs isn't supported on a ring set up with CQE32.
+ */
+ if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) ==
+ (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))
+ return -EINVAL;
+
return 0;
}
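A hedged setup sketch showing the flag rules enforced above: IORING_SETUP_CQE_MIXED on its own is accepted, while combining it with IORING_SETUP_CQE32 fails with -EINVAL. Assumes a uapi header that already defines IORING_SETUP_CQE_MIXED; error handling trimmed:

    #include <linux/io_uring.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    static int setup_mixed_ring(unsigned entries)
    {
        struct io_uring_params p;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_CQE_MIXED;
        /* adding IORING_SETUP_CQE32 here would get -EINVAL */
        return syscall(__NR_io_uring_setup, entries, &p);
    }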
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
- IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
+ IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL |
+ IORING_SETUP_CQE_MIXED))
return -EINVAL;
return io_uring_create(entries, &p, params);
unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset);
int io_uring_fill_params(unsigned entries, struct io_uring_params *p);
-bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
+bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
struct io_uring_cqe **ret,
- bool overflow)
+ bool overflow, bool cqe32)
{
io_lockdep_assert_cq_locked(ctx);
- if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
- if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
+ if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) {
+ if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32)))
return false;
}
*ret = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
- if (ctx->flags & IORING_SETUP_CQE32)
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ ctx->cqe_cached++;
+ } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) {
ctx->cqe_cached++;
+ ctx->cached_cq_tail++;
+ }
+ WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel);
return true;
}
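For clarity, the accounting above works out as follows (cached_cq_tail counts ring slots, cqe_cached walks 16b io_uring_cqe units):

    ring setup     CQE posted    cached_cq_tail    cqe_cached
    CQE_MIXED      16b           +1                +1
    CQE_MIXED      32b           +2                +2
    CQE32          any           +1                +2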
-static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
+static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret,
+ bool cqe32)
{
- return io_get_cqe_overflow(ctx, ret, false);
+ return io_get_cqe_overflow(ctx, ret, false, cqe32);
}
static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
io_lockdep_assert_cq_locked(ctx);
ctx->submit_state.cq_flush = true;
- return io_get_cqe(ctx, cqe_ret);
+ return io_get_cqe(ctx, cqe_ret, false);
}
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
+ bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32;
struct io_uring_cqe *cqe;
/*
- * If we can't get a cq entry, userspace overflowed the
- * submission (by quite a lot). Increment the overflow count in
- * the ring.
+ * If we can't get a cq entry, userspace overflowed the submission
+ * (by quite a lot).
*/
- if (unlikely(!io_get_cqe(ctx, &cqe)))
+ if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32)))
return false;
-
memcpy(cqe, &req->cqe, sizeof(*cqe));
- if (ctx->flags & IORING_SETUP_CQE32) {
+ if (is_cqe32) {
memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
req->cqe.flags = cflags;
}
+static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx)
+{
+ if (ctx->flags & IORING_SETUP_CQE_MIXED)
+ return IORING_CQE_F_32;
+ return 0;
+}
+
+static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags,
+ __u64 extra1, __u64 extra2)
+{
+ req->cqe.res = res;
+ req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx);
+ req->big_cqe.extra1 = extra1;
+ req->big_cqe.extra2 = extra2;
+}
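A hedged sketch of how an opcode handler would use the helper above to post a 32b completion that works on both CQE32 and CQE_MIXED rings; io_foo_complete() and its payload are illustrative, not an existing op, and the IOU_COMPLETE return assumes the current completion convention:

    /*
     * Completes the request with two extra u64s of payload. On a mixed
     * ring, ctx_cqe32_flags() tags the CQE with IORING_CQE_F_32 so the
     * posting path reserves two 16b slots for it.
     */
    static int io_foo_complete(struct io_kiocb *req, unsigned int issue_flags)
    {
        u64 extra1 = 0xdeadbeefULL, extra2 = 0xcafef00dULL;  /* example payload */

        io_req_set_res32(req, 0, 0, extra1, extra2);
        return IOU_COMPLETE;
    }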
+
static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache,
struct io_kiocb *req)
{