#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
+#include "../lib/fls.h"
#ifdef ARCH_HAVE_AIORING
-#ifndef IOCB_FLAG_HIPRI
-#define IOCB_FLAG_HIPRI (1 << 2)
-#endif
-
/*
- * io_setup2(2) flags
+ * io_uring_setup(2) flags
*/
-#ifndef IOCTX_FLAG_IOPOLL
-#define IOCTX_FLAG_IOPOLL (1 << 0)
-#endif
#ifndef IOCTX_FLAG_SCQRING
-#define IOCTX_FLAG_SCQRING (1 << 1)
+#define IOCTX_FLAG_SCQRING (1 << 0)
+#endif
+#ifndef IOCTX_FLAG_IOPOLL
+#define IOCTX_FLAG_IOPOLL (1 << 1)
#endif
#ifndef IOCTX_FLAG_FIXEDBUFS
#define IOCTX_FLAG_FIXEDBUFS (1 << 2)
#define IOCTX_FLAG_SQPOLL (1 << 5)
#endif
+#define IORING_OFF_SQ_RING 0ULL
+#define IORING_OFF_CQ_RING 0x8000000ULL
+#define IORING_OFF_IOCB 0x10000000ULL
/*
- * io_ring_enter(2) flags
+ * io_uring_enter(2) flags
*/
-#ifndef IORING_FLAG_SUBMIT
-#define IORING_FLAG_SUBMIT (1 << 0)
-#endif
-#ifndef IORING_FLAG_GETEVENTS
-#define IORING_FLAG_GETEVENTS (1 << 1)
+#ifndef IORING_ENTER_GETEVENTS
+#define IORING_ENTER_GETEVENTS (1 << 0)
#endif
typedef uint64_t u64;
#define IOEV_RES2_CACHEHIT (1 << 0)
+struct aio_sqring_offsets {
+ u32 head;
+ u32 tail;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 flags;
+ u32 dropped;
+ u32 array;
+ u32 resv[3];
+};
+
+struct aio_cqring_offsets {
+ u32 head;
+ u32 tail;
+ u32 ring_mask;
+ u32 ring_entries;
+ u32 overflow;
+ u32 events;
+ u32 resv[4];
+};
+
+struct aio_uring_params {
+ u32 sq_entries;
+ u32 cq_entries;
+ u32 flags;
+ u16 sq_thread_cpu;
+ u16 resv[9];
+ struct aio_sqring_offsets sq_off;
+ struct aio_cqring_offsets cq_off;
+};
+
struct aio_sq_ring {
- union {
- struct {
- u32 head;
- u32 tail;
- u32 nr_events;
- u16 sq_thread_cpu;
- u16 kflags;
- u64 iocbs;
- };
- u32 pad[16];
- };
- u32 array[0];
+ u32 *head;
+ u32 *tail;
+ u32 *ring_mask;
+ u32 *ring_entries;
+ u32 *flags;
+ u32 *array;
};
struct aio_cq_ring {
- union {
- struct {
- u32 head;
- u32 tail;
- u32 nr_events;
- };
- struct io_event pad;
- };
- struct io_event events[0];
+ u32 *head;
+ u32 *tail;
+ u32 *ring_mask;
+ u32 *ring_entries;
+ struct io_event *events;
+};
+
+struct aioring_mmap {
+ void *ptr;
+ size_t len;
};
struct aioring_data {
- io_context_t aio_ctx;
+ int ring_fd;
+
struct io_u **io_us;
struct io_u **io_u_index;
- struct aio_sq_ring *sq_ring;
+ struct aio_sq_ring sq_ring;
struct iocb *iocbs;
+ struct iovec *iovecs;
+ unsigned sq_ring_mask;
- struct aio_cq_ring *cq_ring;
+ struct aio_cq_ring cq_ring;
struct io_event *events;
+ unsigned cq_ring_mask;
int queued;
int cq_ring_off;
+ unsigned iodepth;
uint64_t cachehit;
uint64_t cachemiss;
+
+ struct aioring_mmap mmap[3];
};
struct aioring_options {
},
};
-static int io_ring_enter(io_context_t ctx, unsigned int to_submit,
+static int io_uring_enter(struct aioring_data *ld, unsigned int to_submit,
unsigned int min_complete, unsigned int flags)
{
- return syscall(__NR_sys_io_ring_enter, ctx, to_submit, min_complete,
- flags);
+ return syscall(__NR_sys_io_uring_enter, ld->ring_fd, to_submit,
+ min_complete, flags);
}
static int fio_aioring_prep(struct thread_data *td, struct io_u *io_u)
{
struct aioring_data *ld = td->io_ops_data;
struct fio_file *f = io_u->file;
- struct aioring_options *o = td->eo;
struct iocb *iocb;
iocb = &ld->iocbs[io_u->index];
- if (io_u->ddir == DDIR_READ) {
- if (o->fixedbufs) {
- iocb->aio_fildes = f->fd;
+ if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
+ if (io_u->ddir == DDIR_READ)
iocb->aio_lio_opcode = IO_CMD_PREAD;
- iocb->u.c.offset = io_u->offset;
- } else {
- io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
- }
- } else if (io_u->ddir == DDIR_WRITE) {
- if (o->fixedbufs) {
- iocb->aio_fildes = f->fd;
+ else
iocb->aio_lio_opcode = IO_CMD_PWRITE;
- iocb->u.c.offset = io_u->offset;
- } else {
- io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
- }
+ iocb->aio_reqprio = 0;
+ iocb->aio_fildes = f->fd;
+ iocb->u.c.buf = io_u->xfer_buf;
+ iocb->u.c.nbytes = io_u->xfer_buflen;
+ iocb->u.c.offset = io_u->offset;
+ iocb->u.c.flags = 0;
} else if (ddir_sync(io_u->ddir))
io_prep_fsync(iocb, f->fd);
struct aioring_data *ld = td->io_ops_data;
struct io_event *ev;
struct io_u *io_u;
- int index;
+ unsigned index;
- index = event + ld->cq_ring_off;
- if (index >= ld->cq_ring->nr_events)
- index -= ld->cq_ring->nr_events;
+ index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
- ev = &ld->cq_ring->events[index];
+ ev = &ld->cq_ring.events[index];
io_u = ev->data;
if (ev->res != io_u->xfer_buflen) {
unsigned int max)
{
struct aioring_data *ld = td->io_ops_data;
- struct aio_cq_ring *ring = ld->cq_ring;
+ struct aio_cq_ring *ring = &ld->cq_ring;
u32 head, reaped = 0;
- head = ring->head;
+ head = *ring->head;
do {
read_barrier();
- if (head == ring->tail)
+ if (head == *ring->tail)
break;
reaped++;
head++;
- if (head == ring->nr_events)
- head = 0;
} while (reaped + events < max);
- ring->head = head;
+ *ring->head = head;
write_barrier();
return reaped;
}
struct aioring_data *ld = td->io_ops_data;
unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
struct aioring_options *o = td->eo;
- struct aio_cq_ring *ring = ld->cq_ring;
- int r, events = 0;
+ struct aio_cq_ring *ring = &ld->cq_ring;
+ unsigned events = 0;
+ int r;
- ld->cq_ring_off = ring->head;
+ ld->cq_ring_off = *ring->head;
do {
r = fio_aioring_cqring_reap(td, events, max);
if (r) {
}
if (!o->sqthread_poll) {
- r = io_ring_enter(ld->aio_ctx, 0, actual_min,
- IORING_FLAG_GETEVENTS);
+ r = io_uring_enter(ld, 0, actual_min,
+ IORING_ENTER_GETEVENTS);
if (r < 0) {
if (errno == EAGAIN)
continue;
- td_verror(td, errno, "io_ring_enter get");
+ td_verror(td, errno, "io_uring_enter");
break;
}
}
struct io_u *io_u)
{
struct aioring_data *ld = td->io_ops_data;
- struct aio_sq_ring *ring = ld->sq_ring;
+ struct aio_sq_ring *ring = &ld->sq_ring;
unsigned tail, next_tail;
fio_ro_check(td, io_u);
- if (ld->queued == td->o.iodepth)
+ if (ld->queued == ld->iodepth)
return FIO_Q_BUSY;
if (io_u->ddir == DDIR_TRIM) {
return FIO_Q_COMPLETED;
}
- tail = ring->tail;
+ tail = *ring->tail;
next_tail = tail + 1;
- if (next_tail == ring->nr_events)
- next_tail = 0;
read_barrier();
- if (next_tail == ring->head)
+ if (next_tail == *ring->head)
return FIO_Q_BUSY;
- ring->array[tail] = io_u->index;
- ring->tail = next_tail;
+ ring->array[tail & ld->sq_ring_mask] = io_u->index;
+ *ring->tail = next_tail;
write_barrier();
ld->queued++;
fio_gettime(&now, NULL);
while (nr--) {
- int index = ld->sq_ring->array[start];
- struct io_u *io_u = io_u = ld->io_u_index[index];
+ struct aio_sq_ring *ring = &ld->sq_ring;
+ int index = ring->array[start & ld->sq_ring_mask];
+ struct io_u *io_u = ld->io_u_index[index];
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
start++;
- if (start == ld->sq_ring->nr_events)
- start = 0;
}
}
/* Nothing to do */
if (o->sqthread_poll) {
- struct aio_sq_ring *ring = ld->sq_ring;
+ struct aio_sq_ring *ring = &ld->sq_ring;
- if (ring->kflags & IORING_SQ_NEED_WAKEUP)
- io_ring_enter(ld->aio_ctx, ld->queued, 0, IORING_FLAG_SUBMIT);
+ if (*ring->flags & IORING_SQ_NEED_WAKEUP)
+ io_uring_enter(ld, ld->queued, 0, 0);
ld->queued = 0;
return 0;
}
do {
- int start = ld->sq_ring->head;
+ unsigned start = *ld->sq_ring.head;
long nr = ld->queued;
- ret = io_ring_enter(ld->aio_ctx, nr, 0, IORING_FLAG_SUBMIT |
- IORING_FLAG_GETEVENTS);
+ ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
if (ret > 0) {
fio_aioring_queued(td, start, ret);
io_u_mark_submit(td, ret);
usleep(1);
continue;
}
- td_verror(td, errno, "io_ring_enter sumit");
+ td_verror(td, errno, "io_uring_enter submit");
break;
}
} while (ld->queued);
return ret;
}
-static size_t aioring_cq_size(struct thread_data *td)
+static void fio_aioring_unmap(struct aioring_data *ld)
{
- return sizeof(struct aio_cq_ring) + 2 * td->o.iodepth * sizeof(struct io_event);
-}
+ int i;
-static size_t aioring_sq_iocb(struct thread_data *td)
-{
- return sizeof(struct iocb) * td->o.iodepth;
-}
-
-static size_t aioring_sq_size(struct thread_data *td)
-{
- return sizeof(struct aio_sq_ring) + td->o.iodepth * sizeof(u32);
+ for (i = 0; i < ARRAY_SIZE(ld->mmap); i++)
+ munmap(ld->mmap[i].ptr, ld->mmap[i].len);
+ close(ld->ring_fd);
}
static void fio_aioring_cleanup(struct thread_data *td)
td->ts.cachehit += ld->cachehit;
td->ts.cachemiss += ld->cachemiss;
- /* Bump depth to match init depth */
- td->o.iodepth++;
-
/*
* Work-around to avoid huge RCU stalls at exit time. If we
* don't do this here, then it'll be torn down by exit_aio().
* speeding it up a lot.
*/
if (!(td->flags & TD_F_CHILD))
- io_destroy(ld->aio_ctx);
+ fio_aioring_unmap(ld);
+
free(ld->io_u_index);
free(ld->io_us);
- fio_memfree(ld->sq_ring, aioring_sq_size(td), false);
- fio_memfree(ld->iocbs, aioring_sq_iocb(td), false);
- fio_memfree(ld->cq_ring, aioring_cq_size(td), false);
+ free(ld->iovecs);
free(ld);
}
}
+static int fio_aioring_mmap(struct aioring_data *ld, struct aio_uring_params *p)
+{
+ struct aio_sq_ring *sring = &ld->sq_ring;
+ struct aio_cq_ring *cring = &ld->cq_ring;
+ void *ptr;
+
+ ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(u32);
+ ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, ld->ring_fd,
+ IORING_OFF_SQ_RING);
+ ld->mmap[0].ptr = ptr;
+ sring->head = ptr + p->sq_off.head;
+ sring->tail = ptr + p->sq_off.tail;
+ sring->ring_mask = ptr + p->sq_off.ring_mask;
+ sring->ring_entries = ptr + p->sq_off.ring_entries;
+ sring->flags = ptr + p->sq_off.flags;
+ sring->array = ptr + p->sq_off.array;
+ ld->sq_ring_mask = *sring->ring_mask;
+
+ ld->mmap[1].len = p->sq_entries * sizeof(struct iocb);
+ ld->iocbs = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, ld->ring_fd,
+ IORING_OFF_IOCB);
+ ld->mmap[1].ptr = ld->iocbs;
+
+ ld->mmap[2].len = p->cq_off.events +
+ p->cq_entries * sizeof(struct io_event);
+ ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, ld->ring_fd,
+ IORING_OFF_CQ_RING);
+ ld->mmap[2].ptr = ptr;
+ cring->head = ptr + p->cq_off.head;
+ cring->tail = ptr + p->cq_off.tail;
+ cring->ring_mask = ptr + p->cq_off.ring_mask;
+ cring->ring_entries = ptr + p->cq_off.ring_entries;
+ cring->events = ptr + p->cq_off.events;
+ ld->cq_ring_mask = *cring->ring_mask;
+ return 0;
+}
+
static int fio_aioring_queue_init(struct thread_data *td)
{
struct aioring_data *ld = td->io_ops_data;
struct aioring_options *o = td->eo;
- int flags = IOCTX_FLAG_SCQRING;
int depth = td->o.iodepth;
+ struct aio_uring_params p;
+ int ret;
+
+ memset(&p, 0, sizeof(p));
+ p.flags = IOCTX_FLAG_SCQRING;
if (o->hipri)
- flags |= IOCTX_FLAG_IOPOLL;
+ p.flags |= IOCTX_FLAG_IOPOLL;
if (o->sqthread_set) {
- ld->sq_ring->sq_thread_cpu = o->sqthread;
- flags |= IOCTX_FLAG_SQTHREAD;
+ p.sq_thread_cpu = o->sqthread;
+ p.flags |= IOCTX_FLAG_SQTHREAD;
if (o->sqthread_poll)
- flags |= IOCTX_FLAG_SQPOLL;
+ p.flags |= IOCTX_FLAG_SQPOLL;
}
if (o->sqwq)
- flags |= IOCTX_FLAG_SQWQ;
+ p.flags |= IOCTX_FLAG_SQWQ;
if (o->fixedbufs) {
struct rlimit rlim = {
};
setrlimit(RLIMIT_MEMLOCK, &rlim);
- flags |= IOCTX_FLAG_FIXEDBUFS;
+ p.flags |= IOCTX_FLAG_FIXEDBUFS;
}
- return syscall(__NR_sys_io_setup2, depth, flags,
- ld->sq_ring, ld->cq_ring, &ld->aio_ctx);
+ ret = syscall(__NR_sys_io_uring_setup, depth, ld->iovecs, &p);
+ if (ret < 0)
+ return ret;
+
+ ld->ring_fd = ret;
+ return fio_aioring_mmap(ld, &p);
}
static int fio_aioring_post_init(struct thread_data *td)
struct aioring_data *ld = td->io_ops_data;
struct aioring_options *o = td->eo;
struct io_u *io_u;
- struct iocb *iocb;
- int err = 0;
+ int err;
if (o->fixedbufs) {
int i;
for (i = 0; i < td->o.iodepth; i++) {
- io_u = ld->io_u_index[i];
- iocb = &ld->iocbs[i];
- iocb->u.c.buf = io_u->buf;
- iocb->u.c.nbytes = td_max_bs(td);
+ struct iovec *iov = &ld->iovecs[i];
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
+ io_u = ld->io_u_index[i];
+ iov->iov_base = io_u->buf;
+ iov->iov_len = td_max_bs(td);
}
}
err = fio_aioring_queue_init(td);
if (err) {
- td_verror(td, -err, "io_queue_init");
+ td_verror(td, errno, "io_queue_init");
return 1;
}
- /* Adjust depth back again */
- td->o.iodepth--;
return 0;
}
+static unsigned roundup_pow2(unsigned depth)
+{
+ return 1UL << __fls(depth - 1);
+}
+
static int fio_aioring_init(struct thread_data *td)
{
struct aioring_data *ld;
- /* ring needs an extra entry, add one to achieve QD set */
- td->o.iodepth++;
-
ld = calloc(1, sizeof(*ld));
+	/* the kernel requires a power-of-2 ring depth: save the user's depth, then round up */
+ ld->iodepth = td->o.iodepth;
+ td->o.iodepth = roundup_pow2(td->o.iodepth);
+
/* io_u index */
ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
ld->io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
- ld->iocbs = fio_memalign(page_size, aioring_sq_iocb(td), false);
- memset(ld->iocbs, 0, aioring_sq_iocb(td));
-
- ld->sq_ring = fio_memalign(page_size, aioring_sq_size(td), false);
- memset(ld->sq_ring, 0, aioring_sq_size(td));
- ld->sq_ring->nr_events = td->o.iodepth;
- ld->sq_ring->iocbs = (u64) (uintptr_t) ld->iocbs;
-
- ld->cq_ring = fio_memalign(page_size, aioring_cq_size(td), false);
- memset(ld->cq_ring, 0, aioring_cq_size(td));
- ld->cq_ring->nr_events = td->o.iodepth * 2;
+ ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));
td->io_ops_data = ld;
return 0;