summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2022-05-16 09:34:02 -0600
committerJens Axboe <axboe@kernel.dk>2022-05-16 09:34:02 -0600
commitdb2153d86f04fbfccfb67e52caafdeeb658d15e4 (patch)
treed3290a1d561507311a4014b4d603d3e0cd9e92a4
parent4d6a12e5d6bd7d7260d202cf888c911a039376ea (diff)
parent797439af0962b65451425032c1559f0800fca13e (diff)
downloadliburing-db2153d86f04fbfccfb67e52caafdeeb658d15e4.tar.gz
liburing-db2153d86f04fbfccfb67e52caafdeeb658d15e4.tar.bz2
Merge branch 'big-sqe'

* big-sqe:
  test/nop: make less verbose and don't fail on older kernels
  liburing: Update io_uring.h
  liburing: Test all configurations with NOP test
  liburing: add large CQE tests to nop test
  liburing: index large CQE's correctly
  liburing: return correct ring size for large CQE's
  liburing: increase mmap size for large CQE's
  liburing: Update io_uring.h with large CQE kernel changes
  test/nop: add basic IORING_SETUP_SQE128 tests
  setup: add basic support for SQE128
  io_uring.h: add IORING_SETUP_SQE128
-rw-r--r--src/include/liburing.h31
-rw-r--r--src/include/liburing/io_uring.h25
-rw-r--r--src/queue.c6
-rw-r--r--src/setup.c41
-rw-r--r--test/nop.c79
-rw-r--r--test/test.h35
6 files changed, 192 insertions, 25 deletions
diff --git a/src/include/liburing.h b/src/include/liburing.h
index 89b2e5b..1aedc35 100644
--- a/src/include/liburing.h
+++ b/src/include/liburing.h
@@ -189,6 +189,16 @@ int __io_uring_get_cqe(struct io_uring *ring,
#define LIBURING_UDATA_TIMEOUT ((__u64) -1)
+/*
+ * Calculates the step size for CQE iteration.
+ * For standard CQEs it's 1; for big CQEs it's 2.
+ */
+#define io_uring_cqe_shift(ring) \
+ (!!((ring)->flags & IORING_SETUP_CQE32))
+
+#define io_uring_cqe_index(ring,ptr,mask) \
+ (((ptr) & (mask)) << io_uring_cqe_shift(ring))
+
#define io_uring_for_each_cqe(ring, head, cqe) \
/* \
* io_uring_smp_load_acquire() enforces the order of tail \
@@ -196,7 +206,7 @@ int __io_uring_get_cqe(struct io_uring *ring,
*/ \
for (head = *(ring)->cq.khead; \
(cqe = (head != io_uring_smp_load_acquire((ring)->cq.ktail) ? \
- &(ring)->cq.cqes[head & (*(ring)->cq.kring_mask)] : NULL)); \
+ &(ring)->cq.cqes[io_uring_cqe_index(ring, head, *(ring)->cq.kring_mask)] : NULL)); \
head++) \
/*
@@ -901,6 +911,10 @@ static inline int __io_uring_peek_cqe(struct io_uring *ring,
int err = 0;
unsigned available;
unsigned mask = *ring->cq.kring_mask;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_CQE32)
+ shift = 1;
do {
unsigned tail = io_uring_smp_load_acquire(ring->cq.ktail);
@@ -911,7 +925,7 @@ static inline int __io_uring_peek_cqe(struct io_uring *ring,
if (!available)
break;
- cqe = &ring->cq.cqes[head & mask];
+ cqe = &ring->cq.cqes[(head & mask) << shift];
if (!(ring->features & IORING_FEAT_EXT_ARG) &&
cqe->user_data == LIBURING_UDATA_TIMEOUT) {
if (cqe->res < 0)
@@ -969,13 +983,20 @@ static inline struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
struct io_uring_sq *sq = &ring->sq;
unsigned int head = io_uring_smp_load_acquire(sq->khead);
unsigned int next = sq->sqe_tail + 1;
- struct io_uring_sqe *sqe = NULL;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_SQE128)
+ shift = 1;
if (next - head <= *sq->kring_entries) {
- sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask];
+ struct io_uring_sqe *sqe;
+
+ sqe = &sq->sqes[(sq->sqe_tail & *sq->kring_mask) << shift];
sq->sqe_tail = next;
+ return sqe;
}
- return sqe;
+
+ return NULL;
}
#ifndef LIBURING_INTERNAL
diff --git a/src/include/liburing/io_uring.h b/src/include/liburing/io_uring.h
index 166d8ca..0acf05d 100644
--- a/src/include/liburing/io_uring.h
+++ b/src/include/liburing/io_uring.h
@@ -108,8 +108,25 @@ enum {
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN (1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
-enum {
+#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
+
+enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
IORING_OP_WRITEV,
@@ -226,6 +243,12 @@ struct io_uring_cqe {
__u64 user_data; /* sqe->data submission passed back */
__s32 res; /* result code for this event */
__u32 flags;
+
+ /*
+ * If the ring is initialized with IORING_SETUP_CQE32, then this field
+ * contains 16 bytes of padding, doubling the size of the CQE.
+ */
+ __u64 big_cqe[];
};
/*
diff --git a/src/queue.c b/src/queue.c
index 36b4b29..ce0ecf6 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -133,6 +133,10 @@ unsigned io_uring_peek_batch_cqe(struct io_uring *ring,
{
unsigned ready;
bool overflow_checked = false;
+ int shift = 0;
+
+ if (ring->flags & IORING_SETUP_CQE32)
+ shift = 1;
again:
ready = io_uring_cq_ready(ring);
@@ -145,7 +149,7 @@ again:
count = count > ready ? ready : count;
last = head + count;
for (;head != last; head++, i++)
- cqes[i] = &ring->cq.cqes[head & mask];
+ cqes[i] = &ring->cq.cqes[(head & mask) << shift];
return count;
}
diff --git a/src/setup.c b/src/setup.c
index 35981da..d2adc7f 100644
--- a/src/setup.c
+++ b/src/setup.c
@@ -21,8 +21,12 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
size_t size;
int ret;
+ size = sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ size += sizeof(struct io_uring_cqe);
+
sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
- cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
+ cq->ring_sz = p->cq_off.cqes + p->cq_entries * size;
if (p->features & IORING_FEAT_SINGLE_MMAP) {
if (cq->ring_sz > sq->ring_sz)
@@ -56,8 +60,10 @@ static int io_uring_mmap(int fd, struct io_uring_params *p,
sq->kdropped = sq->ring_ptr + p->sq_off.dropped;
sq->array = sq->ring_ptr + p->sq_off.array;
- size = p->sq_entries * sizeof(struct io_uring_sqe);
- sq->sqes = __sys_mmap(0, size, PROT_READ | PROT_WRITE,
+ size = sizeof(struct io_uring_sqe);
+ if (p->flags & IORING_SETUP_SQE128)
+ size += 64;
+ sq->sqes = __sys_mmap(0, size * p->sq_entries, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
if (IS_ERR(sq->sqes)) {
ret = PTR_ERR(sq->sqes);
@@ -109,7 +115,10 @@ int io_uring_ring_dontfork(struct io_uring *ring)
if (!ring->sq.ring_ptr || !ring->sq.sqes || !ring->cq.ring_ptr)
return -EINVAL;
- len = *ring->sq.kring_entries * sizeof(struct io_uring_sqe);
+ len = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ len += 64;
+ len *= *ring->sq.kring_entries;
ret = __sys_madvise(ring->sq.sqes, len, MADV_DONTFORK);
if (ret < 0)
return ret;
@@ -166,8 +175,12 @@ void io_uring_queue_exit(struct io_uring *ring)
{
struct io_uring_sq *sq = &ring->sq;
struct io_uring_cq *cq = &ring->cq;
+ size_t sqe_size;
- __sys_munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe));
+ sqe_size = sizeof(struct io_uring_sqe);
+ if (ring->flags & IORING_SETUP_SQE128)
+ sqe_size += 64;
+ __sys_munmap(sq->sqes, sqe_size * *sq->kring_entries);
io_uring_unmap_rings(sq, cq);
/*
* Not strictly required, but frees up the slot we used now rather
@@ -239,17 +252,23 @@ static size_t npages(size_t size, unsigned page_size)
#define KRING_SIZE 320
-static size_t rings_size(unsigned entries, unsigned cq_entries,
- unsigned page_size)
+static size_t rings_size(struct io_uring_params *p, unsigned entries,
+ unsigned cq_entries, unsigned page_size)
{
size_t pages, sq_size, cq_size;
- cq_size = KRING_SIZE;
- cq_size += cq_entries * sizeof(struct io_uring_cqe);
+ cq_size = sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32)
+ cq_size += sizeof(struct io_uring_cqe);
+ cq_size *= cq_entries;
+ cq_size += KRING_SIZE;
cq_size = (cq_size + 63) & ~63UL;
pages = (size_t) 1 << npages(cq_size, page_size);
- sq_size = sizeof(struct io_uring_sqe) * entries;
+ sq_size = sizeof(struct io_uring_sqe);
+ if (p->flags & IORING_SETUP_SQE128)
+ sq_size += 64;
+ sq_size *= entries;
pages += (size_t) 1 << npages(sq_size, page_size);
return pages * page_size;
}
@@ -317,7 +336,7 @@ ssize_t io_uring_mlock_size_params(unsigned entries, struct io_uring_params *p)
}
page_size = get_page_size();
- return rings_size(entries, cq_entries, page_size);
+ return rings_size(p, entries, cq_entries, page_size);
}
/*
diff --git a/test/nop.c b/test/nop.c
index 82201bd..ce223b3 100644
--- a/test/nop.c
+++ b/test/nop.c
@@ -11,12 +11,16 @@
#include <fcntl.h>
#include "liburing.h"
+#include "test.h"
+
+static int seq;
static int test_single_nop(struct io_uring *ring)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int ret;
+ bool cqe32 = (ring->flags & IORING_SETUP_CQE32);
sqe = io_uring_get_sqe(ring);
if (!sqe) {
@@ -25,6 +29,11 @@ static int test_single_nop(struct io_uring *ring)
}
io_uring_prep_nop(sqe);
+ if (cqe32) {
+ sqe->addr = 1234;
+ sqe->addr2 = 5678;
+ }
+ sqe->user_data = ++seq;
ret = io_uring_submit(ring);
if (ret <= 0) {
@@ -37,7 +46,21 @@ static int test_single_nop(struct io_uring *ring)
fprintf(stderr, "wait completion %d\n", ret);
goto err;
}
+ if (!cqe->user_data) {
+ fprintf(stderr, "Unexpected 0 user_data\n");
+ goto err;
+ }
+ if (cqe32) {
+ if (cqe->big_cqe[0] != 1234) {
+ fprintf(stderr, "Unexpected extra1\n");
+ goto err;
+ }
+ if (cqe->big_cqe[1] != 5678) {
+ fprintf(stderr, "Unexpected extra2\n");
+ goto err;
+ }
+ }
io_uring_cqe_seen(ring, cqe);
return 0;
err:
@@ -49,6 +72,7 @@ static int test_barrier_nop(struct io_uring *ring)
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
int ret, i;
+ bool cqe32 = (ring->flags & IORING_SETUP_CQE32);
for (i = 0; i < 8; i++) {
sqe = io_uring_get_sqe(ring);
@@ -60,6 +84,11 @@ static int test_barrier_nop(struct io_uring *ring)
io_uring_prep_nop(sqe);
if (i == 4)
sqe->flags = IOSQE_IO_DRAIN;
+ if (cqe32) {
+ sqe->addr = 1234;
+ sqe->addr2 = 5678;
+ }
+ sqe->user_data = ++seq;
}
ret = io_uring_submit(ring);
@@ -77,6 +106,20 @@ static int test_barrier_nop(struct io_uring *ring)
fprintf(stderr, "wait completion %d\n", ret);
goto err;
}
+ if (!cqe->user_data) {
+ fprintf(stderr, "Unexpected 0 user_data\n");
+ goto err;
+ }
+ if (cqe32) {
+ if (cqe->big_cqe[0] != 1234) {
+ fprintf(stderr, "Unexpected extra1\n");
+ goto err;
+ }
+ if (cqe->big_cqe[1] != 5678) {
+ fprintf(stderr, "Unexpected extra2\n");
+ goto err;
+ }
+ }
io_uring_cqe_seen(ring, cqe);
}
@@ -85,16 +128,17 @@ err:
return 1;
}
-int main(int argc, char *argv[])
+static int test_ring(unsigned flags)
{
struct io_uring ring;
+ struct io_uring_params p = { };
int ret;
- if (argc > 1)
- return 0;
-
- ret = io_uring_queue_init(8, &ring, 0);
+ p.flags = flags;
+ ret = io_uring_queue_init_params(8, &ring, &p);
if (ret) {
+ if (ret == -EINVAL)
+ return 0;
fprintf(stderr, "ring setup failed: %d\n", ret);
return 1;
}
@@ -102,13 +146,34 @@ int main(int argc, char *argv[])
ret = test_single_nop(&ring);
if (ret) {
fprintf(stderr, "test_single_nop failed\n");
- return ret;
+ goto err;
}
ret = test_barrier_nop(&ring);
if (ret) {
fprintf(stderr, "test_barrier_nop failed\n");
- return ret;
+ goto err;
+ }
+
+err:
+ io_uring_queue_exit(&ring);
+ return ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (argc > 1)
+ return 0;
+
+ FOR_ALL_TEST_CONFIGS {
+ ret = test_ring(IORING_GET_TEST_CONFIG_FLAGS());
+ if (ret) {
+ fprintf(stderr, "Normal ring test failed: %s\n",
+ IORING_GET_TEST_CONFIG_DESCRIPTION());
+ return ret;
+ }
}
return 0;
diff --git a/test/test.h b/test/test.h
new file mode 100644
index 0000000..3628163
--- /dev/null
+++ b/test/test.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Description: Test configs for tests.
+ */
+#ifndef LIBURING_TEST_H
+#define LIBURING_TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct io_uring_test_config {
+ unsigned int flags;
+ const char *description;
+} io_uring_test_config;
+
+io_uring_test_config io_uring_test_configs[] = {
+ { 0, "default" },
+ { IORING_SETUP_SQE128, "large SQE"},
+ { IORING_SETUP_CQE32, "large CQE"},
+ { IORING_SETUP_SQE128 | IORING_SETUP_CQE32, "large SQE/CQE" },
+};
+
+#define FOR_ALL_TEST_CONFIGS \
+ for (int i = 0; i < sizeof(io_uring_test_configs) / sizeof(io_uring_test_configs[0]); i++)
+
+#define IORING_GET_TEST_CONFIG_FLAGS() (io_uring_test_configs[i].flags)
+#define IORING_GET_TEST_CONFIG_DESCRIPTION() (io_uring_test_configs[i].description)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif