#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
+#include "../lib/fls.h"
#ifdef ARCH_HAVE_AIORING
-#ifndef IOCB_FLAG_HIPRI
-#define IOCB_FLAG_HIPRI (1 << 2)
-#endif
-
/*
* io_setup2(2) flags
*/
#ifndef IOCTX_FLAG_SQWQ
#define IOCTX_FLAG_SQWQ (1 << 4)
#endif
+#ifndef IOCTX_FLAG_SQPOLL
+#define IOCTX_FLAG_SQPOLL (1 << 5)
+#endif
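+/* with IOCTX_FLAG_SQTHREAD, ask the kernel SQ thread to busy-poll the SQ ring */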
+
/*
* io_ring_enter(2) flags
*/
-#ifndef IORING_FLAG_SUBMIT
-#define IORING_FLAG_SUBMIT (1 << 0)
-#endif
#ifndef IORING_FLAG_GETEVENTS
-#define IORING_FLAG_GETEVENTS (1 << 1)
+#define IORING_FLAG_GETEVENTS (1 << 0)
#endif
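+/*
+ * Note: IORING_FLAG_SUBMIT is gone in this API revision; submission is now
+ * implied by a non-zero to_submit argument to io_ring_enter(2) (see
+ * fio_aioring_commit() below).
+ */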
typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;
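+
+/*
+ * Set by the kernel in sq_ring->kflags once the SQ poll thread has gone
+ * idle; the submitter must call io_ring_enter(2) to wake it back up.
+ */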
+#define IORING_SQ_NEED_WAKEUP (1 << 0)
+
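+/* set in io_event->res2 when a read was served out of the page cache */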
+#define IOEV_RES2_CACHEHIT (1 << 0)
+
struct aio_sq_ring {
union {
struct {
u32 tail;
u32 nr_events;
u16 sq_thread_cpu;
+ u16 kflags;
u64 iocbs;
};
u32 pad[16];
struct aio_sq_ring *sq_ring;
struct iocb *iocbs;
+ unsigned sq_ring_mask;
struct aio_cq_ring *cq_ring;
struct io_event *events;
+ unsigned cq_ring_mask;
int queued;
int cq_ring_off;
+ unsigned iodepth;
+
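+	/* page cache hit/miss stats, folded into thread stats at cleanup */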
+ uint64_t cachehit;
+ uint64_t cachemiss;
};
struct aioring_options {
unsigned int fixedbufs;
unsigned int sqthread;
unsigned int sqthread_set;
+ unsigned int sqthread_poll;
unsigned int sqwq;
};
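+
+/*
+ * Hypothetical job file usage: "ioengine=aioring, sqthread=0, sqthread_poll=1"
+ * pins a kernel submission thread to CPU 0 (sq_thread_cpu) and lets it
+ * busy-poll the SQ ring for new entries.
+ */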
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_LIBAIO,
},
+ {
+ .name = "sqthread_poll",
+ .lname = "Kernel SQ thread should poll",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct aioring_options, sqthread_poll),
+ .help = "Used with sqthread, enables kernel side polling",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBAIO,
+ },
{
.name = "sqwq",
.lname = "Offload submission to kernel workqueue",
},
};
-static int fio_aioring_commit(struct thread_data *td);
-
static int io_ring_enter(io_context_t ctx, unsigned int to_submit,
unsigned int min_complete, unsigned int flags)
{
{
struct aioring_data *ld = td->io_ops_data;
struct fio_file *f = io_u->file;
- struct aioring_options *o = td->eo;
struct iocb *iocb;
iocb = &ld->iocbs[io_u->index];
- if (io_u->ddir == DDIR_READ) {
- if (o->fixedbufs) {
- iocb->aio_fildes = f->fd;
+ if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
+ if (io_u->ddir == DDIR_READ)
iocb->aio_lio_opcode = IO_CMD_PREAD;
- iocb->u.c.offset = io_u->offset;
- } else {
- io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
- }
- } else if (io_u->ddir == DDIR_WRITE) {
- if (o->fixedbufs) {
- iocb->aio_fildes = f->fd;
+ else
iocb->aio_lio_opcode = IO_CMD_PWRITE;
- iocb->u.c.offset = io_u->offset;
- } else {
- io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
- }
+ iocb->aio_reqprio = 0;
+ iocb->aio_fildes = f->fd;
+ iocb->u.c.buf = io_u->xfer_buf;
+ iocb->u.c.nbytes = io_u->xfer_buflen;
+ iocb->u.c.offset = io_u->offset;
+ iocb->u.c.flags = 0;
} else if (ddir_sync(io_u->ddir))
io_prep_fsync(iocb, f->fd);
struct aioring_data *ld = td->io_ops_data;
struct io_event *ev;
struct io_u *io_u;
- int index;
+ unsigned index;
- index = event + ld->cq_ring_off;
- if (index >= ld->cq_ring->nr_events)
- index -= ld->cq_ring->nr_events;
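+	/* both rings are power-of-2 sized, so masking implements the wrap */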
+ index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
ev = &ld->cq_ring->events[index];
io_u = ev->data;
} else
io_u->error = 0;
+ if (io_u->ddir == DDIR_READ) {
+ if (ev->res2 & IOEV_RES2_CACHEHIT)
+ ld->cachehit++;
+ else
+ ld->cachemiss++;
+ }
+
return io_u;
}
break;
reaped++;
head++;
- if (head == ring->nr_events)
- head = 0;
} while (reaped + events < max);
ring->head = head;
{
struct aioring_data *ld = td->io_ops_data;
unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
+ struct aioring_options *o = td->eo;
struct aio_cq_ring *ring = ld->cq_ring;
- int r, events = 0;
+ unsigned events = 0;
+ int r;
ld->cq_ring_off = ring->head;
do {
continue;
}
- r = io_ring_enter(ld->aio_ctx, 0, actual_min,
- IORING_FLAG_GETEVENTS);
- if (r < 0) {
- if (errno == EAGAIN)
- continue;
- td_verror(td, errno, "io_ring_enter get");
- break;
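+	/*
+	 * With sqthread_poll we never enter the kernel for completions;
+	 * just keep re-checking the CQ ring. Otherwise wait for at least
+	 * actual_min events via io_ring_enter(2).
+	 */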
+ if (!o->sqthread_poll) {
+ r = io_ring_enter(ld->aio_ctx, 0, actual_min,
+ IORING_FLAG_GETEVENTS);
+ if (r < 0) {
+ if (errno == EAGAIN)
+ continue;
+ td_verror(td, errno, "io_ring_enter get");
+ break;
+ }
}
} while (events < min);
fio_ro_check(td, io_u);
- if (ld->queued == td->o.iodepth)
+ if (ld->queued == ld->iodepth)
return FIO_Q_BUSY;
if (io_u->ddir == DDIR_TRIM) {
tail = ring->tail;
next_tail = tail + 1;
- if (next_tail == ring->nr_events)
- next_tail = 0;
read_barrier();
if (next_tail == ring->head)
return FIO_Q_BUSY;
- ring->array[tail] = io_u->index;
+ ring->array[tail & ld->sq_ring_mask] = io_u->index;
ring->tail = next_tail;
write_barrier();
fio_gettime(&now, NULL);
while (nr--) {
- int index = ld->sq_ring->array[start];
- struct io_u *io_u = io_u = ld->io_u_index[index];
+ int index = ld->sq_ring->array[start & ld->sq_ring_mask];
+ struct io_u *io_u = ld->io_u_index[index];
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
start++;
- if (start == ld->sq_ring->nr_events)
- start = 0;
}
}
static int fio_aioring_commit(struct thread_data *td)
{
struct aioring_data *ld = td->io_ops_data;
+ struct aioring_options *o = td->eo;
int ret;
if (!ld->queued)
return 0;
+	/*
+	 * Nothing to submit from here when a kernel SQ thread is polling;
+	 * just wake it if it has flagged itself idle.
+	 */
+ if (o->sqthread_poll) {
+ struct aio_sq_ring *ring = ld->sq_ring;
+
+ if (ring->kflags & IORING_SQ_NEED_WAKEUP)
+ io_ring_enter(ld->aio_ctx, ld->queued, 0, 0);
+ ld->queued = 0;
+ return 0;
+ }
+
do {
- int start = ld->sq_ring->head;
+ unsigned start = ld->sq_ring->head;
long nr = ld->queued;
- ret = io_ring_enter(ld->aio_ctx, nr, 0, IORING_FLAG_SUBMIT |
- IORING_FLAG_GETEVENTS);
+ ret = io_ring_enter(ld->aio_ctx, nr, 0, IORING_FLAG_GETEVENTS);
if (ret > 0) {
fio_aioring_queued(td, start, ret);
io_u_mark_submit(td, ret);
return sizeof(struct aio_sq_ring) + td->o.iodepth * sizeof(u32);
}
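+
+/*
+ * lib/fls.h provides a 1-indexed __fls() (fls(1) == 1, fls(0x80000000) == 32),
+ * so this rounds up to the next power of 2: e.g. 33 -> 64, while an exact
+ * power of 2 such as 32 maps back to itself (1UL << __fls(31) == 32).
+ */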
+static unsigned roundup_pow2(unsigned depth)
+{
+ return 1UL << __fls(depth - 1);
+}
+
static void fio_aioring_cleanup(struct thread_data *td)
{
struct aioring_data *ld = td->io_ops_data;
if (ld) {
- /* Bump depth to match init depth */
- td->o.iodepth++;
+ td->ts.cachehit += ld->cachehit;
+ td->ts.cachemiss += ld->cachemiss;
/*
* Work-around to avoid huge RCU stalls at exit time. If we
if (o->sqthread_set) {
ld->sq_ring->sq_thread_cpu = o->sqthread;
flags |= IOCTX_FLAG_SQTHREAD;
- } else if (o->sqwq)
+ if (o->sqthread_poll)
+ flags |= IOCTX_FLAG_SQPOLL;
+ }
+ if (o->sqwq)
flags |= IOCTX_FLAG_SQWQ;
if (o->fixedbufs) {
iocb = &ld->iocbs[i];
iocb->u.c.buf = io_u->buf;
iocb->u.c.nbytes = td_max_bs(td);
-
- if (o->hipri)
- iocb->u.c.flags |= IOCB_FLAG_HIPRI;
}
}
err = fio_aioring_queue_init(td);
+
if (err) {
- td_verror(td, -err, "io_queue_init");
+ td_verror(td, errno, "io_queue_init");
return 1;
}
- /* Adjust depth back again */
- td->o.iodepth--;
return 0;
}
static int fio_aioring_init(struct thread_data *td)
{
- struct aioring_options *o = td->eo;
struct aioring_data *ld;
- if (o->sqthread_set && o->sqwq) {
- log_err("fio: aioring sqthread and sqwq are mutually exclusive\n");
- return 1;
- }
-
- /* ring needs an extra entry, add one to achieve QD set */
- td->o.iodepth++;
-
ld = calloc(1, sizeof(*ld));
+	/*
+	 * Ring depth must be a power-of-2: remember the user's depth in
+	 * ld->iodepth (queue() caps at it), then round the ring size up.
+	 */
+ ld->iodepth = td->o.iodepth;
+ td->o.iodepth = roundup_pow2(td->o.iodepth);
+
/* io_u index */
ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
ld->io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
memset(ld->sq_ring, 0, aioring_sq_size(td));
ld->sq_ring->nr_events = td->o.iodepth;
ld->sq_ring->iocbs = (u64) (uintptr_t) ld->iocbs;
+ ld->sq_ring_mask = td->o.iodepth - 1;
ld->cq_ring = fio_memalign(page_size, aioring_cq_size(td), false);
memset(ld->cq_ring, 0, aioring_cq_size(td));
ld->cq_ring->nr_events = td->o.iodepth * 2;
+ ld->cq_ring_mask = (2 * td->o.iodepth) - 1;
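+	/*
+	 * Masks are size - 1, valid since both ring sizes are powers of 2;
+	 * the CQ ring is twice the SQ depth to give completions headroom.
+	 */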
td->io_ops_data = ld;
return 0;