author     Jens Axboe <axboe@kernel.dk>  2021-11-05 17:20:54 -0600
committer  Jens Axboe <axboe@kernel.dk>  2022-05-13 09:51:02 -0600
commit     0e842735612d316bdfc32ae0b4ad56602bda0644 (patch)
tree       d9169874c2ddc5a10ecf0fe72060c7c188f3a684
parent     68d3d76273191ed1b0eb38dcabdb11086238f411 (diff)
io_uring: support for user allocated memory for rings/sqes  (for-5.19/io_uring-huge)
Currently io_uring applications must call mmap(2) twice to map the rings
themselves and the sqes array. This works fine, but it does not support
using huge pages to back the rings/sqes. Provide a way for the application
to pass in pre-allocated memory for the rings/sqes, which can then suitably
be allocated from shmfs or via mmap to get huge page support. Particularly
for larger rings, this reduces the number of TLB entries needed.

If an application wishes to take advantage of that, it must pre-allocate
the memory needed for the sq/cq ring and the sqes. The former must be
passed in via the io_uring_params->cq_off.user_addr field, while the
latter is passed in via the io_uring_params->sq_off.user_addr field. Then
it must set IORING_SETUP_NO_MMAP in the io_uring_params->flags field, and
io_uring will map the existing memory into the kernel for shared use. The
application must not call mmap(2) to map the rings as it otherwise would
have; that will now fail with -EINVAL if this setup flag was used.

The pages used for the rings and sqes must be contiguous. The intent here
is clearly that huge pages should be used, otherwise the normal setup
procedure works fine as-is. The application may use one huge page for
both the rings and sqes.

Outside of those initialization changes, everything works like it did
before.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
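For illustration, a minimal sketch of the userspace side follows; it is not
part of this patch. It assumes a single 2MB huge page is large enough for
both the rings and the sqes, splits that page in half arbitrarily, and
invokes the raw io_uring_setup(2) syscall (there is no libc wrapper). The
helper name setup_no_mmap_ring and the half-and-half split are illustrative
choices; a real application would size the regions for its ring depth.

/*
 * Illustrative only: pre-allocate huge page memory and hand it to the
 * kernel via IORING_SETUP_NO_MMAP instead of letting the kernel allocate
 * the rings. Requires a kernel and uapi headers with this patch applied,
 * and a configured hugetlb pool for MAP_HUGETLB to succeed.
 */
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_no_mmap_ring(unsigned entries, struct io_uring_params *p,
			      void **ring_mem, void **sqe_mem)
{
	size_t huge_sz = 2UL * 1024 * 1024;	/* assume one 2MB huge page suffices */
	void *mem;

	/* one huge page backs both regions; each region must be page aligned */
	mem = mmap(NULL, huge_sz, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (mem == MAP_FAILED)
		return -1;

	memset(p, 0, sizeof(*p));
	p->flags = IORING_SETUP_NO_MMAP;
	/*
	 * Rings at the start of the region, sqes in the second half. This is
	 * an arbitrary split, assuming the rings fit in the first 1MB.
	 */
	p->cq_off.user_addr = (uint64_t)(uintptr_t) mem;
	p->sq_off.user_addr = (uint64_t)(uintptr_t) mem + huge_sz / 2;

	*ring_mem = mem;
	*sqe_mem = (char *) mem + huge_sz / 2;

	/* no glibc wrapper for io_uring_setup(2), call the raw syscall */
	return syscall(__NR_io_uring_setup, entries, p);
}

Since mmap(2) on the returned ring fd now fails with -EINVAL, the
application derives its ring pointers from the memory it passed in plus the
offsets the kernel fills in on return: for example the SQ tail lives at
(char *)*ring_mem + p->sq_off.tail and the CQE array at
(char *)*ring_mem + p->cq_off.cqes, while the sqe array is simply the
second region it provided.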
-rw-r--r--  fs/io_uring.c                  118
-rw-r--r--  include/uapi/linux/io_uring.h   10
2 files changed, 116 insertions, 12 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a590e6a778c9..b3f937f3cfab 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -366,6 +366,16 @@ struct io_ring_ctx {
struct percpu_ref refs;
struct io_rings *rings;
+
+ /*
+ * If IORING_SETUP_NO_MMAP is used, then the below holds
+ * the gup'ed pages for the two rings, and the sqes.
+ */
+ struct page **ring_pages;
+ struct page **sqe_pages;
+ int n_ring_pages;
+ int n_sqe_pages;
+
unsigned int flags;
enum task_work_notify_mode notify_method;
unsigned int compat: 1;
@@ -9589,12 +9599,78 @@ static void io_mem_free(void *ptr)
free_compound_page(page);
}
+static void io_pages_free(struct page ***pages, int npages)
+{
+ struct page **page_array;
+ int i;
+
+ if (!pages)
+ return;
+ page_array = *pages;
+ for (i = 0; i < npages; i++)
+ unpin_user_page(page_array[i]);
+ kvfree(page_array);
+ *pages = NULL;
+}
+
+static void *__io_uaddr_map(struct page ***pages, int *npages,
+ unsigned long uaddr, size_t size)
+{
+ struct page **page_array;
+ int ret;
+
+ if (uaddr & (PAGE_SIZE - 1) || !size)
+ return ERR_PTR(-EINVAL);
+
+ *npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ page_array = kvmalloc_array(*npages, sizeof(struct page *), GFP_KERNEL);
+ if (!page_array) {
+ *npages = 0;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = pin_user_pages_fast(uaddr, *npages, FOLL_WRITE | FOLL_LONGTERM,
+ page_array);
+ if (ret != *npages) {
+err:
+ if (ret > 0)
+ io_pages_free(&page_array, ret);
+ *npages = 0;
+ return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+ }
+ /* pages must be contig */
+ ret--;
+ if (page_array[0] + ret != page_array[ret])
+ goto err;
+ *pages = page_array;
+ return page_to_virt(page_array[0]);
+}
+
+static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+ size_t size)
+{
+ return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
+ size);
+}
+
+static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
+ size_t size)
+{
+ return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
+ size);
+}
+
static void io_rings_free(struct io_ring_ctx *ctx)
{
- io_mem_free(ctx->rings);
- io_mem_free(ctx->sq_sqes);
- ctx->rings = NULL;
- ctx->sq_sqes = NULL;
+ if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
+ io_mem_free(ctx->rings);
+ io_mem_free(ctx->sq_sqes);
+ ctx->rings = NULL;
+ ctx->sq_sqes = NULL;
+ } else {
+ io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
+ io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
+ }
}
static void *io_mem_alloc(size_t size)
@@ -10872,10 +10948,15 @@ static void *io_uring_validate_mmap_request(struct file *file,
static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start;
unsigned long pfn;
void *ptr;
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return -EINVAL;
+
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
@@ -10888,6 +10969,12 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct io_ring_ctx *ctx = file->private_data;
+
+ /* Don't allow mmap if the ring was setup without it */
+ if (ctx->flags & IORING_SETUP_NO_MMAP)
+ return -EINVAL;
+
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
}
@@ -11302,7 +11389,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
- rings = io_mem_alloc(size);
+ if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+ rings = io_mem_alloc(size);
+ else
+ rings = io_rings_map(ctx, p->cq_off.user_addr, size);
+
if (IS_ERR(rings))
return PTR_ERR(rings);
@@ -11319,13 +11410,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
}
- ptr = io_mem_alloc(size);
+ if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+ ptr = io_mem_alloc(size);
+ else
+ ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
+
if (IS_ERR(ptr)) {
io_rings_free(ctx);
return PTR_ERR(ptr);
}
- ctx->sq_sqes = io_mem_alloc(size);
+ ctx->sq_sqes = ptr;
return 0;
}
@@ -11488,7 +11583,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->sq_off.dropped = offsetof(struct io_rings, sq_dropped);
p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings;
p->sq_off.resv1 = 0;
- p->sq_off.resv2 = 0;
+ if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+ p->sq_off.user_addr = 0;
p->cq_off.head = offsetof(struct io_rings, cq.head);
p->cq_off.tail = offsetof(struct io_rings, cq.tail);
@@ -11498,7 +11594,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
p->cq_off.cqes = offsetof(struct io_rings, cqes);
p->cq_off.flags = offsetof(struct io_rings, cq_flags);
p->cq_off.resv1 = 0;
- p->cq_off.resv2 = 0;
+ if (!(ctx->flags & IORING_SETUP_NO_MMAP))
+ p->cq_off.user_addr = 0;
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
@@ -11558,7 +11655,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
- IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG))
+ IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
+ IORING_SETUP_NO_MMAP))
return -EINVAL;
return io_uring_create(entries, &p, params);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 36ec43dc7bf9..6fc63bc8cb3e 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -126,6 +126,12 @@ enum {
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
+/*
+ * Rather than have the kernel allocate ring memory and then have the app
+ * mmap it, have the application pass in the memory for the backing of them.
+ */
+#define IORING_SETUP_NO_MMAP (1U << 12)
+
enum {
IORING_OP_NOP,
IORING_OP_READV,
@@ -273,7 +279,7 @@ struct io_sqring_offsets {
__u32 dropped;
__u32 array;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*
@@ -292,7 +298,7 @@ struct io_cqring_offsets {
__u32 cqes;
__u32 flags;
__u32 resv1;
- __u64 resv2;
+ __u64 user_addr;
};
/*