// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)
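/*
 * Register an eventfd to be signalled when completions are posted to the
 * CQ ring. Only one eventfd may be registered per ring at a time; a
 * second registration attempt fails with -EBUSY until the current one is
 * unregistered.
 */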
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
			       unsigned int eventfd_async)
{
	struct io_ev_fd *ev_fd;
	__s32 __user *fds = arg;
	int fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd)
		return -EBUSY;

	if (copy_from_user(&fd, fds, sizeof(*fds)))
		return -EFAULT;

	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
	if (!ev_fd)
		return -ENOMEM;

	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
	if (IS_ERR(ev_fd->cq_ev_fd)) {
		int ret = PTR_ERR(ev_fd->cq_ev_fd);

		kfree(ev_fd);
		return ret;
	}

	spin_lock(&ctx->completion_lock);
	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
	spin_unlock(&ctx->completion_lock);

	ev_fd->eventfd_async = eventfd_async;
	ctx->has_evfd = true;
	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
	atomic_set(&ev_fd->refs, 1);
	atomic_set(&ev_fd->ops, 0);
	return 0;
}
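/*
 * Drop the ring's registered eventfd. The io_ev_fd struct is freed via
 * call_rcu() rather than immediately, since the completion path may still
 * be dereferencing ctx->io_ev_fd under rcu_read_lock().
 */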
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
	struct io_ev_fd *ev_fd;

	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
					lockdep_is_held(&ctx->uring_lock));
	if (ev_fd) {
		ctx->has_evfd = false;
		rcu_assign_pointer(ctx->io_ev_fd, NULL);
		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
			call_rcu(&ev_fd->rcu, io_eventfd_ops);
		return 0;
	}

	return -ENXIO;
}
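/*
 * IORING_REGISTER_PROBE: report which opcodes this kernel supports. The
 * caller passes a zeroed struct io_uring_probe with room for nr_args ops
 * entries (the memchr_inv() check below rejects non-zeroed input); we
 * fill in last_op and set IO_URING_OP_SUPPORTED per implemented opcode.
 *
 * Illustrative userspace sketch, not kernel code (liburing provides
 * io_uring_get_probe() as a wrapper; "ring_fd" stands for an io_uring fd
 * set up elsewhere, error handling omitted):
 *
 *	struct io_uring_probe *p = calloc(1, sizeof(*p) +
 *					  256 * sizeof(p->ops[0]));
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_PROBE, p, 256);
 *	if (p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED)
 *		...; // opcode is usable on this kernel
 */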
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	size = struct_size(p, ops, nr_args);
	if (size == SIZE_MAX)
		return -EOVERFLOW;
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;
	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (!io_issue_defs[i].not_supported)
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}
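/*
 * Personalities: a registered personality is a snapshot of the caller's
 * credentials, kept in ctx->personalities and keyed by a u16 id. A
 * request can run under those creds by setting sqe->personality to the
 * id that io_register_personality() returned.
 */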
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}
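/*
 * Illustrative sketch of how restrictions are meant to be used, not
 * kernel code ("ring_fd" is assumed to be a ring created with
 * IORING_SETUP_R_DISABLED, error handling omitted): a supervising
 * process limits the allowed SQE opcodes, then enables the ring before
 * handing it to a less-trusted component.
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READ },
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_WRITE },
 *	};
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */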
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = 0;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].register_op,
				  ctx->restrictions.register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST) {
				ret = -EINVAL;
				goto out;
			}

			__set_bit(res[i].sqe_op, ctx->restrictions.sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			ctx->restrictions.sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			ret = -EINVAL;
			goto out;
		}
	}

out:
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;

	kfree(res);
	return ret;
}
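/*
 * IORING_REGISTER_ENABLE_RINGS: flip a ring created with
 * IORING_SETUP_R_DISABLED into the enabled state. Any registered
 * restrictions take effect from this point, and a waiting SQPOLL thread
 * is woken up.
 */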
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}
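/*
 * Apply an io-wq CPU affinity mask. For SQPOLL rings the mask belongs to
 * the SQPOLL thread's io-wq, and uring_lock is dropped around the call so
 * that the sqd->lock -> ctx->uring_lock ordering is preserved.
 */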
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

	if (in_compat_syscall()) {
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	} else {
		ret = copy_from_user(new_mask, arg, len);
	}

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}
static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}
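/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: new_count[0] caps bounded io-wq
 * workers and new_count[1] caps unbounded ones. A zero entry leaves that
 * limit untouched, and the previous limits are copied back to userspace,
 * so {0, 0} acts as a pure query.
 */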
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			if (sqd->thread)
				tctx = sqd->thread->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		struct io_uring_task *tctx = node->task->io_uring;

		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
	}
	return ret;
}
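/*
 * Central dispatcher for io_uring_register(2) opcodes. Called with
 * uring_lock held; individual handlers may temporarily drop and reacquire
 * it, which is what the __releases/__acquires annotations document for
 * sparse. With restrictions in effect, opcodes not on the register_op
 * allowlist fail with -EACCES.
 */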
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
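/*
 * Illustrative userspace sketch, not kernel code: registering an eventfd
 * via the raw syscall ("ring_fd" stands for an io_uring fd set up
 * elsewhere); liburing wraps this as io_uring_register_eventfd().
 *
 *	int evfd = eventfd(0, EFD_CLOEXEC);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_EVENTFD, &evfd, 1);
 *
 * If IORING_REGISTER_USE_REGISTERED_RING is OR'ed into the opcode, fd is
 * interpreted as an index into the task's registered-ring array rather
 * than as a normal file descriptor.
 */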
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (use_registered_ring) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return -EINVAL;
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (unlikely(!file))
			return -EBADF;
	} else {
		file = fget(fd);
		if (unlikely(!file))
			return -EBADF;
		ret = -EOPNOTSUPP;
		if (!io_is_uring_fops(file))
			goto out_fput;
	}

	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);
	mutex_unlock(&ctx->uring_lock);
	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
	if (!use_registered_ring)
		fput(file);
	return ret;
}