From: Linus Torvalds
Date: Tue, 21 Jan 2025 04:27:33 +0000 (-0800)
Subject: Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux
X-Git-Tag: io_uring-6.14-20240131~24
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=a312e1706ce6c124f04ec85ddece240f3bb2a696;p=linux-block.git

Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Not a lot in terms of features this time around, mostly just cleanups
  and code consolidation:

   - Support for PI metadata read/write via io_uring, with NVMe and
     SCSI covered

   - Cleanup of the per-op structure caching, making it consistent
     across various command types

   - Consolidation of the various user-mapped features into a concept
     called regions, making the various users of that consistent

   - Various cleanups and fixes"

* tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits)
  io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname
  io_uring: reuse io_should_terminate_tw() for cmds
  io_uring: Factor out a function to parse restrictions
  io_uring/rsrc: require cloned buffers to share accounting contexts
  io_uring: simplify the SQPOLL thread check when cancelling requests
  io_uring: expose read/write attribute capability
  io_uring/rw: don't gate retry on completion context
  io_uring/rw: handle -EAGAIN retry at IO completion time
  io_uring/rw: use io_rw_recycle() from cleanup path
  io_uring/rsrc: simplify the bvec iter count calculation
  io_uring: ensure io_queue_deferred() is out-of-line
  io_uring/rw: always clear ->bytes_done on io_async_rw setup
  io_uring/rw: use NULL for rw->free_iovec assignment
  io_uring/rw: don't mask in f_iocb_flags
  io_uring/msg_ring: Drop custom destructor
  io_uring: Move old async data allocation helper to header
  io_uring/rw: Allocate async data through helper
  io_uring/net: Allocate msghdr async data through helper
  io_uring/uring_cmd: Allocate async data through generic helper
  io_uring/poll: Allocate apoll with generic alloc_cache helper
  ...
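As a rough illustration of the first item in the pull message above (PI metadata attached to reads/writes), a userspace submission might look like the sketch below. It uses liburing for the boilerplate; the attribute plumbing (struct io_uring_attr_pi, the sqe attr_ptr/attr_type_mask fields, IORING_RW_ATTR_FLAG_PI) is written from the new uapi as understood here and should be treated as an assumption rather than a definitive reference, and 'fd' would have to be a device that actually carries protection information (e.g. an NVMe namespace formatted with PI).

/*
 * Sketch: one write with protection information attached via the new
 * read/write attributes. Assumes headers new enough to define the
 * attribute uapi named above; error handling trimmed for brevity.
 */
#include <liburing.h>
#include <stdint.h>

int write_with_pi(int fd, void *data, unsigned data_len,
		  void *pi_buf, unsigned pi_len)
{
	struct io_uring_attr_pi pi = {0};
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret)
		return ret;

	/* describe the out-of-band PI buffer for this request */
	pi.addr = (__u64)(uintptr_t)pi_buf;
	pi.len = pi_len;
	pi.flags = 0;		/* guard/app/ref tag check flags, if desired */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_write(sqe, fd, data, data_len, 0);
	/* point the sqe at the attribute and say which attribute it is */
	sqe->attr_ptr = (__u64)(uintptr_t)&pi;
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}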
---

a312e1706ce6c124f04ec85ddece240f3bb2a696
diff --cc io_uring/register.c
index 371aec87e078,4d507a0390e8..05025047d1da
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@@ -403,11 -396,11 +396,11 @@@ static void io_register_free_rings(stru
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_uring_region_desc rd;
 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
 	size_t size, sq_array_offset;
+	unsigned i, tail, old_head;
 	struct io_uring_params p;
-	void *ptr;
-	unsigned i, tail;
 	int ret;
 
 	/* for single issuer, must be owner resizing */
@@@ -441,29 -434,26 +434,34 @@@
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
-	else
-		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
-					 p.cq_off.user_addr, size);
-	if (IS_ERR(n.rings))
-		return PTR_ERR(n.rings);
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (p.flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p.cq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+	if (ret) {
+		io_register_free_rings(ctx, &p, &n);
+		return ret;
+	}
+	n.rings = io_region_get_ptr(&n.ring_region);
 
-	n.rings->sq_ring_mask = p.sq_entries - 1;
-	n.rings->cq_ring_mask = p.cq_entries - 1;
-	n.rings->sq_ring_entries = p.sq_entries;
-	n.rings->cq_ring_entries = p.cq_entries;
+	/*
+	 * At this point n.rings is shared with userspace, just like o.rings
+	 * is as well. While we don't expect userspace to modify it while
+	 * a resize is in progress, and it's most likely that userspace will
+	 * shoot itself in the foot if it does, we can't always assume good
+	 * intent... Use read/write once helpers from here on to indicate the
+	 * shared nature of it.
+	 */
+	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
+	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
+	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
+	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
 
 	if (copy_to_user(arg, &p, sizeof(p))) {
-		io_register_free_rings(&p, &n);
+		io_register_free_rings(ctx, &p, &n);
 		return -EFAULT;
 	}
 
@@@ -516,23 -508,20 +516,22 @@@
 	 * Now copy SQ and CQ entries, if any. If either of the destination
 	 * rings can't hold what is already there, then fail the operation.
 	 */
-	n.sq_sqes = ptr;
-	tail = o.rings->sq.tail;
-	if (tail - o.rings->sq.head > p.sq_entries)
+	tail = READ_ONCE(o.rings->sq.tail);
+	old_head = READ_ONCE(o.rings->sq.head);
+	if (tail - old_head > p.sq_entries)
 		goto overflow;
-	for (i = o.rings->sq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->sq_entries - 1);
-		unsigned dst_head = i & n.rings->sq_ring_mask;
+		unsigned dst_head = i & (p.sq_entries - 1);
 
 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
 	}
-	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
-	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
-	n.rings->sq.head = o.rings->sq.head;
-	n.rings->sq.tail = o.rings->sq.tail;
++	WRITE_ONCE(n.rings->sq.head, old_head);
++	WRITE_ONCE(n.rings->sq.tail, tail);
 
-	tail = o.rings->cq.tail;
-	if (tail - o.rings->cq.head > p.cq_entries) {
+	tail = READ_ONCE(o.rings->cq.tail);
+	old_head = READ_ONCE(o.rings->cq.head);
+	if (tail - old_head > p.cq_entries) {
 overflow:
 		/* restore old rings, and return -EOVERFLOW via cleanup path */
 		ctx->rings = o.rings;
@@@ -547,8 -536,8 +546,8 @@@
 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
 	}
-	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
-	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
-	n.rings->cq.head = o.rings->cq.head;
-	n.rings->cq.tail = o.rings->cq.tail;
++	WRITE_ONCE(n.rings->cq.head, old_head);
++	WRITE_ONCE(n.rings->cq.tail, tail);
 
 	/* invalidate cached cqe refill */
 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
diff --cc io_uring/uring_cmd.c
index ce7726a04883,d235043db21e..fc94c465a985
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@@ -188,14 -163,14 +168,22 @@@ void io_uring_cmd_done(struct io_uring_
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
++static void io_uring_cmd_init_once(void *obj)
++{
++	struct io_uring_cmd_data *data = obj;
++
++	data->op_data = NULL;
++}
++
 static int io_uring_cmd_prep_setup(struct io_kiocb *req,
 				   const struct io_uring_sqe *sqe)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
-	cache = io_uring_async_get(req);
-	if (unlikely(!cache))
-	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL);
++	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
++					  io_uring_cmd_init_once);
+	if (!cache)
 		return -ENOMEM;
 
 	if (!(req->flags & REQ_F_FORCE_ASYNC)) {
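The uring_cmd hunk above is one instance of the per-op cache consolidation: instead of a bespoke io_uring_async_get(), the command now obtains its async data through the generic io_uring_alloc_async_data() helper, passing io_uring_cmd_init_once() as the one-time initializer so a freshly allocated io_uring_cmd_data starts with op_data cleared. Below is a minimal, standalone sketch of that allocation pattern; the demo_* names are invented for illustration and plain malloc/free stands in for the kernel's kmalloc and alloc_cache, so the shape of the flow is the point, not the exact kernel helper.

/*
 * Sketch of a cache-backed "async data" allocator with a one-time
 * initializer, mirroring the pattern the hunk above switches to.
 */
#include <stdlib.h>

struct demo_alloc_cache {
	void		**entries;	/* recycled objects, used LIFO */
	unsigned int	nr;		/* how many are currently cached */
	unsigned int	max;		/* cache capacity */
	size_t		obj_size;	/* size of one object */
};

/*
 * Hand out a recycled object if one is cached, otherwise allocate a fresh
 * one and run the one-time initializer on it. Cached objects skip init_once
 * because callers are expected to return them in a reusable state.
 */
static void *demo_alloc_async(struct demo_alloc_cache *cache,
			      void (*init_once)(void *obj))
{
	void *obj;

	if (cache->nr)
		return cache->entries[--cache->nr];

	obj = malloc(cache->obj_size);
	if (obj && init_once)
		init_once(obj);
	return obj;
}

/* Put an object back for reuse, or free it if the cache is already full. */
static void demo_free_async(struct demo_alloc_cache *cache, void *obj)
{
	if (cache->nr < cache->max)
		cache->entries[cache->nr++] = obj;
	else
		free(obj);
}

Running the initializer only on a cache miss keeps the recycle path to a single pointer pop, which is what makes it attractive to route rw, net, uring_cmd and poll through the same helper, as the shortlog entries above indicate.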