io_uring/zcrx: add interface queue and refill queue
author David Wei <dw@davidwei.uk>
Sat, 15 Feb 2025 00:09:36 +0000 (16:09 -0800)
committer Jens Axboe <axboe@kernel.dk>
Mon, 17 Feb 2025 12:41:03 +0000 (05:41 -0700)
Add a new object called an interface queue (ifq) that represents a net
rx queue that has been configured for zero copy. Each ifq is registered
using a new registration opcode IORING_REGISTER_ZCRX_IFQ.

The refill queue lives in an io_uring mapped region that is created at
registration time from a userspace-supplied region descriptor and is
mapped by userspace using a new offset IORING_MAP_OFF_ZCRX_REGION, in a
similar fashion to the main SQ/CQ. It is used by userspace to return
buffers that it is done with, which will then be re-used by the netdev
again.

The main CQ ring is used to notify userspace of received data by using
the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each
entry contains the offset + len to the data.

For now, each io_uring instance only has a single ifq.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: David Wei <dw@davidwei.uk>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/r/20250215000947.789731-2-dw@davidwei.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Kconfig
include/linux/io_uring_types.h
include/uapi/linux/io_uring.h
io_uring/KConfig [new file with mode: 0644]
io_uring/Makefile
io_uring/io_uring.c
io_uring/memmap.h
io_uring/register.c
io_uring/zcrx.c [new file with mode: 0644]
io_uring/zcrx.h [new file with mode: 0644]

diff --git a/Kconfig b/Kconfig
index 745bc773f567067a85ce6574fb41ce80833247d9..529ea7694ba984b631e3fd964109545f370aaa1d 100644 (file)
--- a/Kconfig
+++ b/Kconfig
@@ -30,3 +30,5 @@ source "lib/Kconfig"
 source "lib/Kconfig.debug"
 
 source "Documentation/Kconfig"
+
+source "io_uring/KConfig"
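diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 123e693687305e07c69d4e6807840866faace46d..c0fe8a00fe53af18bfcbeaff44a158d4fc6425b4 100644 (file)
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h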
@@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
        IO_URING_F_TASK_DEAD            = (1 << 13),
 };
 
+struct io_zcrx_ifq;
+
 struct io_wq_work_node {
        struct io_wq_work_node *next;
 };
@@ -382,6 +384,8 @@ struct io_ring_ctx {
        struct wait_queue_head          poll_wq;
        struct io_restriction           restrictions;
 
+       struct io_zcrx_ifq              *ifq;
+
        u32                     pers_next;
        struct xarray           personalities;
 
@@ -434,6 +438,8 @@ struct io_ring_ctx {
        struct io_mapped_region         ring_region;
        /* used for optimised request parameter and wait argument passing  */
        struct io_mapped_region         param_region;
+       /* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
+       struct io_mapped_region         zcrx_region;
 };
 
 /*
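diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e11c826385277c11bf8c90cd94c4af5f9e6c1105..6a1632d0fba1648967bc2360dcfccf8c675c74af 100644 (file)
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h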
@@ -639,7 +639,8 @@ enum io_uring_register_op {
        /* send MSG_RING without having a ring */
        IORING_REGISTER_SEND_MSG_RING           = 31,
 
-       /* 32 reserved for zc rx */
+       /* register a netdev hw rx queue for zerocopy */
+       IORING_REGISTER_ZCRX_IFQ                = 32,
 
        /* resize CQ ring */
        IORING_REGISTER_RESIZE_RINGS            = 33,
@@ -956,6 +957,46 @@ enum io_uring_socket_op {
        SOCKET_URING_OP_SETSOCKOPT,
 };
 
+/*
+ * Zero copy receive refill queue entry. Userspace posts these to the refill
+ * queue to return buffers it is done with so that they can be re-used.
+ */
+struct io_uring_zcrx_rqe {
+       /*
+        * Location of the buffer being returned. Presumably carries the area
+        * id in its top bits (see IORING_ZCRX_AREA_SHIFT) - TODO confirm.
+        */
+       __u64   off;
+       __u32   len;
+       /* padding; brings the entry to 16 bytes */
+       __u32   __pad;
+};
+
+/*
+ * Zero copy receive completion payload. It is delivered on the main CQ ring
+ * in the upper 16 bytes of a big CQE, which is why registration requires a
+ * ring created with IORING_SETUP_CQE32.
+ */
+struct io_uring_zcrx_cqe {
+       /* offset to the received data */
+       __u64   off;
+       __u64   __pad;
+};
+
+/*
+ * The bit from which area id is encoded into offsets.
+ * IORING_ZCRX_AREA_MASK selects bits 48..63 of a 64-bit offset, i.e. the
+ * area id part; the low 48 bits are left for the offset proper.
+ */
+#define IORING_ZCRX_AREA_SHIFT 48
+#define IORING_ZCRX_AREA_MASK  (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
+
+/*
+ * Layout of the refill queue region, filled in by the kernel when
+ * IORING_REGISTER_ZCRX_IFQ succeeds. Each value is a byte offset from the
+ * start of the region.
+ */
+struct io_uring_zcrx_offsets {
+       /* offset of the ring head index */
+       __u32   head;
+       /* offset of the ring tail index */
+       __u32   tail;
+       /* offset of the first struct io_uring_zcrx_rqe */
+       __u32   rqes;
+       __u32   __resv2;
+       __u64   __resv[2];
+};
+
+/*
+ * Argument for IORING_REGISTER_ZCRX_IFQ
+ *
+ * The structure is read from userspace, validated, and on success written
+ * back with the final rq_entries value and the offsets filled in.
+ */
+struct io_uring_zcrx_ifq_reg {
+       /* presumably the netdev ifindex; not read by the code in this patch */
+       __u32   if_idx;
+       /* rx queue to register; the all-ones value (-1) is rejected */
+       __u32   if_rxq;
+       /*
+        * Requested refill queue size. Must be non-zero. It is rounded up to
+        * a power of two; requests above the kernel limit fail with -EINVAL
+        * unless the ring was created with IORING_SETUP_CLAMP, in which case
+        * they are clamped to the limit.
+        */
+       __u32   rq_entries;
+       /* no flags are defined yet, must be 0 */
+       __u32   flags;
+
+       /* must be non-zero, otherwise registration fails with -EFAULT */
+       __u64   area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
+       /* descriptor of the region backing the refill queue; written back */
+       __u64   region_ptr; /* struct io_uring_region_desc * */
+
+       /* output only, see struct io_uring_zcrx_offsets */
+       struct io_uring_zcrx_offsets offsets;
+       /* must be zero */
+       __u64   __resv[4];
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/io_uring/KConfig b/io_uring/KConfig
new file mode 100644 (file)
index 0000000..9e2a4be
--- /dev/null
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# io_uring configuration
+#
+
+# Zero copy receive support (io_uring/zcrx.c). There is no prompt: the
+# option is switched on automatically whenever every dependency below is
+# satisfied.
+config IO_URING_ZCRX
+       def_bool y
+       depends on PAGE_POOL
+       depends on INET
+       depends on NET_RX_BUSY_POLL
diff --git a/io_uring/Makefile b/io_uring/Makefile
index d695b60dba4f0410abc80acf9466a4cd166ad912..98e48339d84dc4ae1de0e163e16482af794db50f 100644 (file)
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING)                += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
                                        epoll.o statx.o timeout.o fdinfo.o \
                                        cancel.o waitid.o register.o \
                                        truncate.o memmap.o alloc_cache.o
+obj-$(CONFIG_IO_URING_ZCRX)    += zcrx.o
 obj-$(CONFIG_IO_WQ)            += io-wq.o
 obj-$(CONFIG_FUTEX)            += futex.o
 obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
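diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b688953d1de8e52d3a44d6f63bd06034956ec0d1..58528bf61638ebcdeb69f2b740d5ca13ec3ed176 100644 (file)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c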
@@ -97,6 +97,7 @@
 #include "uring_cmd.h"
 #include "msg_ring.h"
 #include "memmap.h"
+#include "zcrx.h"
 
 #include "timeout.h"
 #include "poll.h"
@@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
        mutex_lock(&ctx->uring_lock);
        io_sqe_buffers_unregister(ctx);
        io_sqe_files_unregister(ctx);
+       io_unregister_zcrx_ifqs(ctx);
        io_cqring_overflow_kill(ctx);
        io_eventfd_unregister(ctx);
        io_free_alloc_caches(ctx);
@@ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
                        io_cqring_overflow_kill(ctx);
                        mutex_unlock(&ctx->uring_lock);
                }
+               if (ctx->ifq) {
+                       mutex_lock(&ctx->uring_lock);
+                       io_shutdown_zcrx_ifqs(ctx);
+                       mutex_unlock(&ctx->uring_lock);
+               }
 
                if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
                        io_move_task_work_from_local(ctx);
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index c898dcba2b4ecceaec95b52c84481be6589e3016..dad0aa5b1b457475665ed392c89b158f5e5e50b4 100644 (file)
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -2,6 +2,7 @@
 #define IO_URING_MEMMAP_H
 
 #define IORING_MAP_OFF_PARAM_REGION            0x20000000ULL
+#define IORING_MAP_OFF_ZCRX_REGION             0x30000000ULL
 
 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
 
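diff --git a/io_uring/register.c b/io_uring/register.c
index 9a4d2fbce4aec6ae470bfc717261d1cdfb61479c..cc23a4c205cd43250f1fc47849052746e5d521e7 100644 (file)
--- a/io_uring/register.c
+++ b/io_uring/register.c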
@@ -30,6 +30,7 @@
 #include "eventfd.h"
 #include "msg_ring.h"
 #include "memmap.h"
+#include "zcrx.h"
 
 #define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                        break;
                ret = io_register_clone_buffers(ctx, arg);
                break;
+       case IORING_REGISTER_ZCRX_IFQ:
+               ret = -EINVAL;
+               if (!arg || nr_args != 1)
+                       break;
+               ret = io_register_zcrx_ifq(ctx, arg);
+               break;
        case IORING_REGISTER_RESIZE_RINGS:
                ret = -EINVAL;
                if (!arg || nr_args != 1)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
new file mode 100644 (file)
index 0000000..f3ace7e
--- /dev/null
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "memmap.h"
+#include "zcrx.h"
+
+/*
+ * Largest refill queue that may be registered. Bigger requests are refused
+ * with -EINVAL, or clamped to this value on rings created with
+ * IORING_SETUP_CLAMP.
+ */
+#define IO_RQ_MAX_ENTRIES              32768
+
+/*
+ * Create the mapped region that backs the refill queue and point @ifq at it.
+ *
+ * The region starts with a struct io_uring ring header, immediately followed
+ * by reg->rq_entries refill queue entries. Returns -EINVAL when the region
+ * described by @rd is too small to hold both; a failure to create the region
+ * is passed through unchanged. Returns 0 on success.
+ */
+static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+                                struct io_uring_zcrx_ifq_reg *reg,
+                                struct io_uring_region_desc *rd)
+{
+       struct io_ring_ctx *ctx = ifq->ctx;
+       const size_t rqes_off = sizeof(struct io_uring);
+       size_t needed;
+       void *base;
+       int err;
+
+       needed = rqes_off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+       if (rd->size < needed)
+               return -EINVAL;
+
+       err = io_create_region_mmap_safe(ctx, &ctx->zcrx_region, rd,
+                                        IORING_MAP_OFF_ZCRX_REGION);
+       if (err < 0)
+               return err;
+
+       /* header first, entry array right behind it */
+       base = io_region_get_ptr(&ctx->zcrx_region);
+       ifq->rq_ring = base;
+       ifq->rqes = base + rqes_off;
+       return 0;
+}
+
+/*
+ * Release the refill queue region and clear @ifq's pointers into it so that
+ * nothing stale is left behind.
+ */
+static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
+{
+       struct io_ring_ctx *ctx = ifq->ctx;
+
+       io_free_region(ctx, &ctx->zcrx_region);
+       ifq->rqes = NULL;
+       ifq->rq_ring = NULL;
+}
+
+/*
+ * Allocate a zero-initialised interface queue that belongs to @ctx.
+ * if_rxq starts out as -1, a value io_register_zcrx_ifq() never accepts as
+ * a queue index. Returns NULL if the allocation fails.
+ */
+static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
+{
+       struct io_zcrx_ifq *ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
+
+       if (ifq) {
+               ifq->ctx = ctx;
+               ifq->if_rxq = -1;
+       }
+       return ifq;
+}
+
+/* Tear down @ifq: release its refill queue region, then free the object. */
+static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
+{
+       io_free_rbuf_ring(ifq);
+       kfree(ifq);
+}
+
+/*
+ * Handle IORING_REGISTER_ZCRX_IFQ: validate the request in @arg, set up the
+ * refill queue and attach the resulting interface queue to @ctx.
+ *
+ * On success the registration struct (with the final rq_entries and the
+ * refill queue offsets) and the region descriptor are copied back to
+ * userspace and 0 is returned. Otherwise:
+ *   -EPERM   caller lacks CAP_NET_ADMIN
+ *   -EINVAL  ring lacks DEFER_TASKRUN or CQE32, reserved fields or flags are
+ *            set, if_rxq is -1, rq_entries is 0 or too large without
+ *            IORING_SETUP_CLAMP, or the region is too small
+ *   -EBUSY   the ring already has an interface queue
+ *   -EFAULT  a user copy failed or area_ptr is 0
+ *   -ENOMEM  allocation failure
+ */
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+                         struct io_uring_zcrx_ifq_reg __user *arg)
+{
+       const unsigned int required = IORING_SETUP_DEFER_TASKRUN |
+                                     IORING_SETUP_CQE32;
+       struct io_uring_zcrx_ifq_reg reg;
+       struct io_uring_region_desc rd;
+       struct io_zcrx_ifq *ifq;
+       int ret;
+
+       /*
+        * 1. Interface queue allocation.
+        * 2. It can observe data destined for sockets of other tasks.
+        */
+       if (!capable(CAP_NET_ADMIN))
+               return -EPERM;
+
+       /* mandatory io_uring features for zc rx */
+       if ((ctx->flags & required) != required)
+               return -EINVAL;
+       /* only a single ifq per ring for now */
+       if (ctx->ifq)
+               return -EBUSY;
+
+       if (copy_from_user(&reg, arg, sizeof(reg)))
+               return -EFAULT;
+       if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+               return -EFAULT;
+
+       if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+               return -EINVAL;
+       if (reg.flags || !reg.rq_entries || reg.if_rxq == -1)
+               return -EINVAL;
+       if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
+               if (!(ctx->flags & IORING_SETUP_CLAMP))
+                       return -EINVAL;
+               reg.rq_entries = IO_RQ_MAX_ENTRIES;
+       }
+       reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
+
+       if (!reg.area_ptr)
+               return -EFAULT;
+
+       ifq = io_zcrx_ifq_alloc(ctx);
+       if (!ifq)
+               return -ENOMEM;
+
+       ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
+       if (ret)
+               goto err;
+
+       ifq->if_rxq = reg.if_rxq;
+       ifq->rq_entries = reg.rq_entries;
+
+       /* tell userspace where things live inside the mapped region */
+       reg.offsets.head = offsetof(struct io_uring, head);
+       reg.offsets.tail = offsetof(struct io_uring, tail);
+       reg.offsets.rqes = sizeof(struct io_uring);
+
+       ret = -EFAULT;
+       if (copy_to_user(arg, &reg, sizeof(reg)))
+               goto err;
+       if (copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)))
+               goto err;
+
+       /* publish only once nothing can fail any more */
+       ctx->ifq = ifq;
+       return 0;
+err:
+       io_zcrx_ifq_free(ifq);
+       return ret;
+}
+
+/*
+ * Detach the interface queue from @ctx and free it. Does nothing when no
+ * queue is registered. The caller must hold ctx->uring_lock.
+ */
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+       struct io_zcrx_ifq *ifq;
+
+       lockdep_assert_held(&ctx->uring_lock);
+
+       ifq = ctx->ifq;
+       if (ifq) {
+               ctx->ifq = NULL;
+               io_zcrx_ifq_free(ifq);
+       }
+}
+
+/*
+ * Shutdown hook for the ring's interface queue. There is nothing to shut
+ * down yet, so this only checks that the caller holds ctx->uring_lock.
+ */
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
+{
+       lockdep_assert_held(&ctx->uring_lock);
+}
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
new file mode 100644 (file)
index 0000000..58e4ab6
--- /dev/null
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_ZC_RX_H
+#define IOU_ZC_RX_H
+
+#include <linux/io_uring_types.h>
+
+/*
+ * Interface queue: kernel-side state of one net rx queue registered for
+ * zero copy receive on an io_uring instance.
+ */
+struct io_zcrx_ifq {
+       /* ring this queue is registered with */
+       struct io_ring_ctx              *ctx;
+       /* refill ring header, at the start of ctx->zcrx_region */
+       struct io_uring                 *rq_ring;
+       /* refill queue entries, directly after the ring header */
+       struct io_uring_zcrx_rqe        *rqes;
+       /* number of refill queue entries, a power of two */
+       u32                             rq_entries;
+
+       /* rx queue index given at registration; -1 until then */
+       u32                             if_rxq;
+};
+
+#ifdef CONFIG_IO_URING_ZCRX
+/* Implemented in zcrx.c. */
+int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+                        struct io_uring_zcrx_ifq_reg __user *arg);
+void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
+void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
+#else /* !CONFIG_IO_URING_ZCRX */
+/* Stubs: registration reports -EOPNOTSUPP, the teardown hooks do nothing. */
+static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
+                                       struct io_uring_zcrx_ifq_reg __user *arg)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { }
+static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { }
+#endif /* CONFIG_IO_URING_ZCRX */
+
+#endif