From: Linus Torvalds
Date: Tue, 21 Jan 2025 04:27:33 +0000 (-0800)
Subject: Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux
X-Git-Tag: io_uring-6.14-20240131~24
X-Git-Url: https://git.kernel.dk/?a=commitdiff_plain;h=a312e1706ce6c124f04ec85ddece240f3bb2a696;p=linux-block.git

Merge tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Not a lot in terms of features this time around, mostly just cleanups
  and code consolidation:

   - Support for PI metadata read/write via io_uring, with NVMe and
     SCSI covered

   - Cleanup of the per-op structure caching, making it consistent
     across various command types

   - Consolidation of the various user-mapped features into a concept
     called regions, making the various users of that consistent

   - Various cleanups and fixes"

* tag 'for-6.14/io_uring-20250119' of git://git.kernel.dk/linux: (56 commits)
  io_uring/fdinfo: fix io_uring_show_fdinfo() misuse of ->d_iname
  io_uring: reuse io_should_terminate_tw() for cmds
  io_uring: Factor out a function to parse restrictions
  io_uring/rsrc: require cloned buffers to share accounting contexts
  io_uring: simplify the SQPOLL thread check when cancelling requests
  io_uring: expose read/write attribute capability
  io_uring/rw: don't gate retry on completion context
  io_uring/rw: handle -EAGAIN retry at IO completion time
  io_uring/rw: use io_rw_recycle() from cleanup path
  io_uring/rsrc: simplify the bvec iter count calculation
  io_uring: ensure io_queue_deferred() is out-of-line
  io_uring/rw: always clear ->bytes_done on io_async_rw setup
  io_uring/rw: use NULL for rw->free_iovec assignment
  io_uring/rw: don't mask in f_iocb_flags
  io_uring/msg_ring: Drop custom destructor
  io_uring: Move old async data allocation helper to header
  io_uring/rw: Allocate async data through helper
  io_uring/net: Allocate msghdr async data through helper
  io_uring/uring_cmd: Allocate async data through generic helper
  io_uring/poll: Allocate apoll with generic alloc_cache helper
  ...
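As a rough illustration of the first item in the pull message above (PI metadata attached to reads/writes), a userspace submission might look like the sketch below. It uses liburing for the boilerplate; the attribute plumbing (struct io_uring_attr_pi, the sqe attr_ptr/attr_type_mask fields, IORING_RW_ATTR_FLAG_PI) is written from the new uapi as understood here and should be treated as an assumption rather than a definitive reference, and 'fd' would have to be a device that actually carries protection information (e.g. an NVMe namespace formatted with PI).

/*
 * Sketch: one write with protection information attached via the new
 * read/write attributes. Assumes headers new enough to define the
 * attribute uapi named above; error handling trimmed for brevity.
 */
#include <liburing.h>
#include <stdint.h>

int write_with_pi(int fd, void *data, unsigned data_len,
		  void *pi_buf, unsigned pi_len)
{
	struct io_uring_attr_pi pi = {0};
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret)
		return ret;

	/* describe the out-of-band PI buffer for this request */
	pi.addr = (__u64)(uintptr_t)pi_buf;
	pi.len = pi_len;
	pi.flags = 0;		/* guard/app/ref tag check flags, if desired */

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_write(sqe, fd, data, data_len, 0);
	/* point the sqe at the attribute and say which attribute it is */
	sqe->attr_ptr = (__u64)(uintptr_t)&pi;
	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;

	io_uring_submit(&ring);
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}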
---

a312e1706ce6c124f04ec85ddece240f3bb2a696
diff --cc io_uring/register.c
index 371aec87e078,4d507a0390e8..05025047d1da
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@@ -403,11 -396,11 +396,11 @@@ static void io_register_free_rings(stru
 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
 {
+	struct io_uring_region_desc rd;
 	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
 	size_t size, sq_array_offset;
+	unsigned i, tail, old_head;
 	struct io_uring_params p;
-	void *ptr;
-	unsigned i, tail;
 	int ret;
 
 	/* for single issuer, must be owner resizing */
@@@ -441,29 -434,26 +434,34 @@@
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
-	if (!(p.flags & IORING_SETUP_NO_MMAP))
-		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
-	else
-		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
-					 p.cq_off.user_addr, size);
-	if (IS_ERR(n.rings))
-		return PTR_ERR(n.rings);
+	memset(&rd, 0, sizeof(rd));
+	rd.size = PAGE_ALIGN(size);
+	if (p.flags & IORING_SETUP_NO_MMAP) {
+		rd.user_addr = p.cq_off.user_addr;
+		rd.flags |= IORING_MEM_REGION_TYPE_USER;
+	}
+	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
+	if (ret) {
+		io_register_free_rings(ctx, &p, &n);
+		return ret;
+	}
+	n.rings = io_region_get_ptr(&n.ring_region);
 
-	n.rings->sq_ring_mask = p.sq_entries - 1;
-	n.rings->cq_ring_mask = p.cq_entries - 1;
-	n.rings->sq_ring_entries = p.sq_entries;
-	n.rings->cq_ring_entries = p.cq_entries;
+	/*
+	 * At this point n.rings is shared with userspace, just like o.rings
+	 * is as well. While we don't expect userspace to modify it while
+	 * a resize is in progress, and it's most likely that userspace will
+	 * shoot itself in the foot if it does, we can't always assume good
+	 * intent... Use read/write once helpers from here on to indicate the
+	 * shared nature of it.
+	 */
+	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
+	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
+	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
+	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);
 
 	if (copy_to_user(arg, &p, sizeof(p))) {
-		io_register_free_rings(&p, &n);
+		io_register_free_rings(ctx, &p, &n);
 		return -EFAULT;
 	}
 
@@@ -516,23 -508,20 +516,22 @@@
 	 * Now copy SQ and CQ entries, if any. If either of the destination
 	 * rings can't hold what is already there, then fail the operation.
 	 */
-	n.sq_sqes = ptr;
-	tail = o.rings->sq.tail;
-	if (tail - o.rings->sq.head > p.sq_entries)
+	tail = READ_ONCE(o.rings->sq.tail);
+	old_head = READ_ONCE(o.rings->sq.head);
+	if (tail - old_head > p.sq_entries)
 		goto overflow;
-	for (i = o.rings->sq.head; i < tail; i++) {
+	for (i = old_head; i < tail; i++) {
 		unsigned src_head = i & (ctx->sq_entries - 1);
-		unsigned dst_head = i & n.rings->sq_ring_mask;
+		unsigned dst_head = i & (p.sq_entries - 1);
 
 		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
 	}
-	WRITE_ONCE(n.rings->sq.head, READ_ONCE(o.rings->sq.head));
-	WRITE_ONCE(n.rings->sq.tail, READ_ONCE(o.rings->sq.tail));
-	n.rings->sq.head = o.rings->sq.head;
-	n.rings->sq.tail = o.rings->sq.tail;
++	WRITE_ONCE(n.rings->sq.head, old_head);
++	WRITE_ONCE(n.rings->sq.tail, tail);
 
-	tail = o.rings->cq.tail;
-	if (tail - o.rings->cq.head > p.cq_entries) {
+	tail = READ_ONCE(o.rings->cq.tail);
+	old_head = READ_ONCE(o.rings->cq.head);
+	if (tail - old_head > p.cq_entries) {
 overflow:
 		/* restore old rings, and return -EOVERFLOW via cleanup path */
 		ctx->rings = o.rings;
@@@ -547,8 -536,8 +546,8 @@@
 		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
 	}
-	WRITE_ONCE(n.rings->cq.head, READ_ONCE(o.rings->cq.head));
-	WRITE_ONCE(n.rings->cq.tail, READ_ONCE(o.rings->cq.tail));
-	n.rings->cq.head = o.rings->cq.head;
-	n.rings->cq.tail = o.rings->cq.tail;
++	WRITE_ONCE(n.rings->cq.head, old_head);
++	WRITE_ONCE(n.rings->cq.tail, tail);
 
 	/* invalidate cached cqe refill */
 	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
diff --cc io_uring/uring_cmd.c
index ce7726a04883,d235043db21e..fc94c465a985
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@@ -188,14 -163,14 +168,22 @@@ void io_uring_cmd_done(struct io_uring_
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
++static void io_uring_cmd_init_once(void *obj)
++{
++	struct io_uring_cmd_data *data = obj;
++
++	data->op_data = NULL;
++}
++
 static int io_uring_cmd_prep_setup(struct io_kiocb *req,
 				   const struct io_uring_sqe *sqe)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	struct uring_cache *cache;
+	struct io_uring_cmd_data *cache;
 
-	cache = io_uring_async_get(req);
-	if (unlikely(!cache))
-	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL);
++	cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req,
++					  io_uring_cmd_init_once);
+	if (!cache)
 		return -ENOMEM;
 
 	if (!(req->flags & REQ_F_FORCE_ASYNC)) {
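The uring_cmd hunk above is one instance of the per-op cache consolidation: instead of a bespoke io_uring_async_get(), the command now obtains its async data through the generic io_uring_alloc_async_data() helper, passing io_uring_cmd_init_once() as the one-time initializer so a freshly allocated io_uring_cmd_data starts with op_data cleared. Below is a minimal, standalone sketch of that allocation pattern; the demo_* names are invented for illustration and plain malloc/free stands in for the kernel's kmalloc and alloc_cache, so the shape of the flow is the point, not the exact kernel helper.

/*
 * Sketch of a cache-backed "async data" allocator with a one-time
 * initializer, mirroring the pattern the hunk above switches to.
 */
#include <stdlib.h>

struct demo_alloc_cache {
	void		**entries;	/* recycled objects, used LIFO */
	unsigned int	nr;		/* how many are currently cached */
	unsigned int	max;		/* cache capacity */
	size_t		obj_size;	/* size of one object */
};

/*
 * Hand out a recycled object if one is cached, otherwise allocate a fresh
 * one and run the one-time initializer on it. Cached objects skip init_once
 * because callers are expected to return them in a reusable state.
 */
static void *demo_alloc_async(struct demo_alloc_cache *cache,
			      void (*init_once)(void *obj))
{
	void *obj;

	if (cache->nr)
		return cache->entries[--cache->nr];

	obj = malloc(cache->obj_size);
	if (obj && init_once)
		init_once(obj);
	return obj;
}

/* Put an object back for reuse, or free it if the cache is already full. */
static void demo_free_async(struct demo_alloc_cache *cache, void *obj)
{
	if (cache->nr < cache->max)
		cache->entries[cache->nr++] = obj;
	else
		free(obj);
}

Running the initializer only on a cache miss keeps the recycle path to a single pointer pop, which is what makes it attractive to route rw, net, uring_cmd and poll through the same helper, as the shortlog entries above indicate.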