atomic_t cq_wait_nr;
atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
+
+ /*
+ * If registered with IORING_REGISTER_CQWAIT_REG, a single
+ * page holds N entries, mapped in cq_wait_arg. cq_wait_index
+ * is the maximum allowable index.
+ */
+ struct io_uring_reg_wait *cq_wait_arg;
+ unsigned char cq_wait_index;
} ____cacheline_aligned_in_smp;
/* timeouts */
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
+
+ struct page **cq_wait_page;
};
struct io_tw_state {
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
+#define IORING_ENTER_EXT_ARG_REG (1U << 6)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
/* resize CQ ring */
IORING_REGISTER_RESIZE_RINGS = 33,
+ /* register fixed io_uring_reg_wait arguments */
+ IORING_REGISTER_CQWAIT_REG = 34,
+
/* this goes last */
IORING_REGISTER_LAST,
IORING_RESTRICTION_LAST
};
+enum {
+ IORING_REG_WAIT_TS = (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait entries that can be indexed when io_uring_enter(2)
+ * is called, rather than passing in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+ __u32 flags;
+ __u32 struct_size;
+ __u32 nr_entries;
+ __u32 pad;
+ __u64 user_addr;
+ __u64 pad2[3];
+};
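For context, and not part of this patch: a minimal userspace sketch of registering a wait region, assuming the updated <linux/io_uring.h>, a 4 KiB page size, an already created ring file descriptor ring_fd, and a raw syscall(2) rather than a library helper. The register_wait_region() name is hypothetical.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    /*
     * Register 'nr' wait entries; 'nr' must be at least 1 and small enough
     * that nr * sizeof(struct io_uring_reg_wait) fits in a single page.
     */
    static struct io_uring_reg_wait *register_wait_region(int ring_fd,
                                                          unsigned int nr)
    {
        struct io_uring_cqwait_reg_arg arg = { 0 };
        struct io_uring_reg_wait *region;

        /* page aligned, so the entries cannot straddle a page boundary */
        region = aligned_alloc(4096, 4096);
        if (!region)
            return NULL;
        memset(region, 0, 4096);

        arg.struct_size = sizeof(struct io_uring_reg_wait);
        arg.nr_entries = nr;
        arg.user_addr = (uint64_t)(uintptr_t)region;

        if (syscall(__NR_io_uring_register, ring_fd,
                    IORING_REGISTER_CQWAIT_REG, &arg, 1) < 0) {
            free(region);
            return NULL;
        }
        return region;
    }

Only one region can be registered per ring (a second registration fails with -EBUSY), and it is released when the ring is torn down.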
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG | IORING_ENTER_EXT_ARG_REG set,
+ * where the actual argument is an index into a previously registered fixed
+ * wait region, made up of entries of the structure below.
+ */
+struct io_uring_reg_wait {
+ struct __kernel_timespec ts;
+ __u32 min_wait_usec;
+ __u32 flags;
+ __u64 sigmask;
+ __u32 sigmask_sz;
+ __u32 pad[3];
+ __u64 pad2[2];
+};
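Again for context and not part of this patch: a sketch of waiting on entry 0 of a previously registered region, under the same assumptions as above. The wait_one_cqe() name is hypothetical.

    static int wait_one_cqe(int ring_fd, struct io_uring_reg_wait *region)
    {
        struct io_uring_reg_wait *w = &region[0];
        unsigned int index = 0;

        /* entry 0: wait for up to one second, no signal mask */
        memset(w, 0, sizeof(*w));
        w->ts.tv_sec = 1;
        w->flags = IORING_REG_WAIT_TS;

        /*
         * With IORING_ENTER_EXT_ARG_REG, the argument pointer carries the
         * entry index and the size argument must be
         * sizeof(struct io_uring_reg_wait).
         */
        return syscall(__NR_io_uring_enter, ring_fd, 0, 1,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
                       IORING_ENTER_EXT_ARG_REG,
                       (void *)(uintptr_t)index,
                       sizeof(struct io_uring_reg_wait));
    }

Because the page stays mapped in the kernel, the entry can be updated in place between calls, avoiding the per-call copies of the getevents argument and timespec.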
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
struct io_uring_getevents_arg {
__u64 sigmask;
__u32 sigmask_sz;
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
+ io_unregister_cqwait_reg(ctx);
mutex_unlock(&ctx->uring_lock);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
io_uring_cancel_generic(cancel_all, NULL);
}
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+ const struct io_uring_getevents_arg __user *uarg)
{
- if (flags & IORING_ENTER_EXT_ARG) {
- struct io_uring_getevents_arg arg;
+ struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
- if (argsz != sizeof(arg))
+ if (arg) {
+ unsigned int index = (unsigned int) (uintptr_t) uarg;
+
+ if (index <= ctx->cq_wait_index)
+ return arg + index;
+ }
+
+ return ERR_PTR(-EFAULT);
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+ const void __user *argp, size_t argsz)
+{
+ struct io_uring_getevents_arg arg;
+
+ if (!(flags & IORING_ENTER_EXT_ARG))
+ return 0;
+
+ if (flags & IORING_ENTER_EXT_ARG_REG) {
+ if (argsz != sizeof(struct io_uring_reg_wait))
return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
- return -EFAULT;
+		return PTR_ERR_OR_ZERO(io_get_ext_arg_reg(ctx, argp));
}
+ if (argsz != sizeof(arg))
+ return -EINVAL;
+ if (copy_from_user(&arg, argp, sizeof(arg)))
+ return -EFAULT;
return 0;
}
-static int io_get_ext_arg(unsigned flags, const void __user *argp,
- struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+ const void __user *argp, struct ext_arg *ext_arg)
{
const struct io_uring_getevents_arg __user *uarg = argp;
struct io_uring_getevents_arg arg;
return 0;
}
+ if (flags & IORING_ENTER_EXT_ARG_REG) {
+ struct io_uring_reg_wait *w;
+
+ if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+ return -EINVAL;
+ w = io_get_ext_arg_reg(ctx, argp);
+ if (IS_ERR(w))
+ return PTR_ERR(w);
+
+ if (w->flags & ~IORING_REG_WAIT_TS)
+ return -EINVAL;
+ ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+ ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+ ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+ if (w->flags & IORING_REG_WAIT_TS) {
+ ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+ ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+ ext_arg->ts_set = true;
+ }
+ return 0;
+ }
+
/*
* EXT_ARG is set - ensure we agree on the size of it and copy in our
* timespec and sigset_t pointers if good.
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
IORING_ENTER_REGISTERED_RING |
- IORING_ENTER_ABS_TIMER)))
+ IORING_ENTER_ABS_TIMER |
+ IORING_ENTER_EXT_ARG_REG)))
return -EINVAL;
/*
*/
mutex_lock(&ctx->uring_lock);
iopoll_locked:
- ret2 = io_validate_ext_arg(flags, argp, argsz);
+ ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
} else {
struct ext_arg ext_arg = { .argsz = argsz };
- ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+ ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
if (likely(!ret2)) {
min_complete = min(min_complete,
ctx->cq_entries);
return ret;
}
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+ unsigned short npages = 1;
+
+ if (!ctx->cq_wait_page)
+ return;
+
+ io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+ ctx->cq_wait_arg = NULL;
+ if (ctx->user)
+ __io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_ENTER_EXT_ARG_REG is set.
+ * If that is set along with IORING_ENTER_EXT_ARG, then instead of passing
+ * in a pointer to a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two copies (arg + timeout) per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+ struct io_uring_cqwait_reg_arg arg;
+ struct io_uring_reg_wait *reg;
+ struct page **pages;
+ unsigned long len;
+ int nr_pages, poff;
+ int ret;
+
+ if (ctx->cq_wait_page || ctx->cq_wait_arg)
+ return -EBUSY;
+ if (copy_from_user(&arg, uarg, sizeof(arg)))
+ return -EFAULT;
+ if (!arg.nr_entries || arg.flags)
+ return -EINVAL;
+ if (arg.struct_size != sizeof(*reg))
+ return -EINVAL;
+ if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+ return -EOVERFLOW;
+ if (len > PAGE_SIZE)
+ return -EINVAL;
+	/* offset + len must fit within a page, and the offset must be reg_wait aligned */
+ poff = arg.user_addr & ~PAGE_MASK;
+ if (len + poff > PAGE_SIZE)
+ return -EINVAL;
+ if (poff % arg.struct_size)
+ return -EINVAL;
+
+ pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = -EINVAL;
+ if (nr_pages != 1)
+ goto out_free;
+ if (ctx->user) {
+ ret = __io_account_mem(ctx->user, 1);
+ if (ret)
+ goto out_free;
+ }
+
+ reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+ if (reg) {
+ ctx->cq_wait_index = arg.nr_entries - 1;
+ WRITE_ONCE(ctx->cq_wait_page, pages);
+ WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+ return 0;
+ }
+ ret = -ENOMEM;
+ if (ctx->user)
+ __io_unaccount_mem(ctx->user, 1);
+out_free:
+ io_pages_free(&pages, nr_pages);
+ return ret;
+}
+
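As a usage note, not part of this patch: an application would typically pre-populate the registered entries once with the wait policies it switches between, then pass the appropriate index to each io_uring_enter(2) call. A short sketch under the same assumptions as the earlier examples; setup_wait_policies() is a hypothetical name.

    /* entry 0: short 10ms wait, entry 1: longer 1s wait */
    static void setup_wait_policies(struct io_uring_reg_wait *region)
    {
        memset(region, 0, 2 * sizeof(*region));

        region[0].ts.tv_nsec = 10 * 1000 * 1000;
        region[0].flags = IORING_REG_WAIT_TS;

        region[1].ts.tv_sec = 1;
        region[1].flags = IORING_REG_WAIT_TS;
    }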
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
break;
ret = io_register_resize_rings(ctx, arg);
break;
+ case IORING_REGISTER_CQWAIT_REG:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_cqwait_reg(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
struct file *io_uring_register_get_file(unsigned int fd, bool registered);
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
#endif