[fio.git] / engines / libaio.c

/*
 * libaio engine
 *
 * IO engine using the Linux native aio interface.
 *
 */
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <libaio.h>
#include <sys/time.h>
#include <sys/resource.h>

#include "../fio.h"
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
#include "cmdprio.h"

/*
 * Fallback for headers that do not provide IOCB_FLAG_IOPRIO (it should
 * be defined in newest aio_abi.h). The flag is OR'ed into
 * iocb.u.c.flags by fio_libaio_prio_prep() when a request priority is
 * set.
 */
#ifndef IOCB_FLAG_IOPRIO
#define IOCB_FLAG_IOPRIO    (1 << 1)
#endif

/*
 * Hack for libaio < 0.3.111: when CONFIG_LIBAIO_RW_FLAGS is not defined,
 * 'aio_rw_flags' is aliased to the iocb's '__pad2' member so that
 * fio_libaio_prep() can still set RWF_NOWAIT through the same name.
 */
#ifndef CONFIG_LIBAIO_RW_FLAGS
#define aio_rw_flags __pad2
#endif

/*
 * Forward declarations. fio_libaio_getevents() calls fio_libaio_commit()
 * before its definition; no use of fio_libaio_init() ahead of its
 * definition is visible in this file.
 */
static int fio_libaio_commit(struct thread_data *td);
static int fio_libaio_init(struct thread_data *td);

/*
 * Per-thread engine state, allocated in fio_libaio_init(), stored in
 * td->io_ops_data and released in fio_libaio_cleanup().
 */
struct libaio_data {
	/* aio context; set up by io_queue_init() in fio_libaio_post_init() */
	io_context_t aio_ctx;
	/* 'entries' completion slots filled by fio_libaio_getevents() */
	struct io_event *aio_events;
	/* ring of iocb pointers handed to io_submit() */
	struct iocb **iocbs;
	/* ring of io_u pointers, parallel to 'iocbs' (same indices) */
	struct io_u **io_us;

	/* NOTE(review): not referenced anywhere else in this file */
	struct io_u **io_u_index;

	/*
	 * Basic ring buffer. 'head' is incremented in _queue(), and
	 * 'tail' is incremented in _commit(). We keep 'queued' so
	 * that we know if the ring is full or empty, when
	 * 'head' == 'tail'. 'entries' is the ring size, and
	 * 'is_pow2' is just an optimization to use AND instead of
	 * modulus to get the remainder on ring increment.
	 */
	int is_pow2;
	unsigned int entries;
	unsigned int queued;
	unsigned int head;
	unsigned int tail;

	/* filled in by fio_cmdprio_init(); gates fio_libaio_prio_prep() */
	bool use_cmdprio;
};

/*
 * Engine-specific options, reached through td->eo. The members are
 * addressed by offsetof() from the options[] table below.
 */
struct libaio_options {
	/* NOTE(review): unused in this file; presumably reserved for the
	 * option parser - confirm before removing */
	void *pad;
	/* when 1, fio_libaio_getevents() may reap via user_io_getevents() */
	unsigned int userspace_reap;
	/* per-direction priority percentages, see fio_libaio_prio_prep() */
	struct cmdprio cmdprio;
	/* when non-zero, fio_libaio_prep() sets RWF_NOWAIT on reads/writes */
	unsigned int nowait;
};

/*
 * Option table for the libaio engine, published through
 * ioengine.options. Each entry stores its value at the given offset(s)
 * inside struct libaio_options; the table ends with a NULL-named entry.
 */
static struct fio_option options[] = {
	{
		/* enables the user_io_getevents() path in _getevents() */
		.name	= "userspace_reap",
		.lname	= "Libaio userspace reaping",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct libaio_options, userspace_reap),
		.help	= "Use alternative user-space reap implementation",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
#ifdef FIO_HAVE_IOPRIO_CLASS
	{
		/* off1 holds the read percentage, off2 the write percentage */
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct libaio_options,
				   cmdprio.percentage[DDIR_READ]),
		.off2	= offsetof(struct libaio_options,
				   cmdprio.percentage[DDIR_WRITE]),
		.minval	= 0,
		.maxval	= 100,
		.help	= "Send high priority I/O this percentage of the time",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
#else
	{
		/* placeholder so the option name is still recognised */
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
#endif
	{
		/* consumed by fio_libaio_prep() */
		.name	= "nowait",
		.lname	= "RWF_NOWAIT",
		.type	= FIO_OPT_BOOL,
		.off1	= offsetof(struct libaio_options, nowait),
		.help	= "Set RWF_NOWAIT for reads/writes",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		/* terminator */
		.name	= NULL,
	},
};

/*
 * Advance the ring index '*val' by 'add' slots, wrapping at
 * ld->entries. When the ring size is a power of two the wrap is done
 * with a mask instead of a modulo.
 */
static inline void ring_inc(struct libaio_data *ld, unsigned int *val,
			    unsigned int add)
{
	const unsigned int next = *val + add;

	*val = ld->is_pow2 ? (next & (ld->entries - 1))
			   : (next % ld->entries);
}

/*
 * Fill in io_u->iocb for the request described by 'io_u'.
 *
 * Reads and writes are prepared with io_prep_pread()/io_prep_pwrite()
 * and additionally get RWF_NOWAIT when the 'nowait' option is set. Sync
 * directions are prepared with io_prep_fsync(). Any other direction
 * leaves the iocb untouched. Always returns 0.
 */
static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u)
{
	struct libaio_options *opts = td->eo;
	struct fio_file *file = io_u->file;
	struct iocb *cb = &io_u->iocb;
	const int is_read = io_u->ddir == DDIR_READ;

	if (is_read || io_u->ddir == DDIR_WRITE) {
		if (is_read)
			io_prep_pread(cb, file->fd, io_u->xfer_buf,
				      io_u->xfer_buflen, io_u->offset);
		else
			io_prep_pwrite(cb, file->fd, io_u->xfer_buf,
				       io_u->xfer_buflen, io_u->offset);

		/* must come after io_prep_*(), which set up the iocb */
		if (opts->nowait)
			cb->aio_rw_flags |= RWF_NOWAIT;

		return 0;
	}

	if (ddir_sync(io_u->ddir))
		io_prep_fsync(cb, file->fd);

	return 0;
}

/*
 * Randomly tag 'io_u' as a high priority request.
 *
 * The configured percentage for the io_u's data direction decides how
 * often this happens. A percentage of 0 disables tagging and does not
 * consume a random number. A tagged request gets the RT priority class
 * in its iocb, IOCB_FLAG_IOPRIO, and IO_U_F_PRIORITY on the io_u.
 */
static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u)
{
	struct libaio_options *opts = td->eo;
	unsigned int pct = opts->cmdprio.percentage[io_u->ddir];

	if (!pct)
		return;

	if (rand_between(&td->prio_state, 0, 99) >= pct)
		return;

	io_u->iocb.aio_reqprio = ioprio_value(IOPRIO_CLASS_RT, 0);
	io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO;
	io_u->flags |= IO_U_F_PRIORITY;
}

/*
 * Translate completion slot 'event' of ld->aio_events into its io_u.
 *
 * The io_u is recovered from the iocb pointer stored in the event. The
 * result is compared with the requested transfer length:
 *   - equal:   the io_u's error is cleared
 *   - larger:  the negated result is stored as the io_u's error
 *   - smaller: the missing byte count is stored in io_u->resid
 */
static struct io_u *fio_libaio_event(struct thread_data *td, int event)
{
	struct libaio_data *ld = td->io_ops_data;
	struct io_event *ev = &ld->aio_events[event];
	struct io_u *io_u = container_of(ev->obj, struct io_u, iocb);

	if (ev->res == io_u->xfer_buflen)
		io_u->error = 0;
	else if (ev->res > io_u->xfer_buflen)
		io_u->error = -ev->res;
	else
		io_u->resid = io_u->xfer_buflen - ev->res;

	return io_u;
}

/*
 * Layout used to read the completion ring directly from user space.
 * user_io_getevents() and fio_libaio_getevents() cast the io_context_t
 * value to a pointer to this struct.
 * NOTE(review): presumably mirrors the kernel's aio ring header -
 * confirm against the kernel sources before changing any field.
 */
struct aio_ring {
	unsigned id;		 /** kernel internal index number */
	unsigned nr;		 /** number of io_events */
	unsigned head;		 /* next slot to reap; advanced by user_io_getevents() */
	unsigned tail;		 /* head == tail means no completions pending */

	unsigned magic;		 /* compared against AIO_RING_MAGIC before use */
	unsigned compat_features;
	unsigned incompat_features;
	unsigned header_length;	/** size of aio_ring */

	/* completion slots, indexed by 'head' modulo 'nr' */
	struct io_event events[0];
};

/* Expected value of aio_ring.magic for the user-space reap path */
#define AIO_RING_MAGIC	0xa10a10a1

/*
 * Reap up to 'max' completions straight from the aio ring, without a
 * system call.
 *
 * 'aio_ctx' is reinterpreted as a struct aio_ring pointer; the caller
 * (fio_libaio_getevents()) checks the ring magic first. Events are
 * copied into 'events' in ring order and the ring head is advanced
 * after each copy. Never blocks: returns the number of events copied,
 * which is 0 when the ring is empty.
 */
static int user_io_getevents(io_context_t aio_ctx, unsigned int max,
			     struct io_event *events)
{
	long i = 0;
	unsigned head;
	struct aio_ring *ring = (struct aio_ring*) aio_ctx;

	while (i < max) {
		head = ring->head;

		if (head == ring->tail) {
			/* There are no more completions */
			break;
		} else {
			/* There is another completion to reap */
			events[i] = ring->events[head];
			/*
			 * Publish the new head only after the event has
			 * been copied out, wrapping at ring->nr.
			 * NOTE(review): the release store suggests the
			 * slot may be reused by a concurrent producer once
			 * head moves - keep the copy before the store.
			 */
			atomic_store_release(&ring->head,
					     (head + 1) % ring->nr);
			i++;
		}
	}

	return i;
}

/*
 * Reap completion events into ld->aio_events.
 *
 * @td:  thread whose aio context is polled
 * @min: keep looping until at least this many events were reaped
 * @max: upper bound on the number of events to reap
 * @t:   optional timeout for io_getevents(); copied to a local first so
 *       the caller's value is never modified
 *
 * When td->o.iodepth_batch_complete_min is 0 the calls are issued with a
 * minimum of 0 (non-blocking poll), and the user-space ring reaper is
 * used if 'userspace_reap' is set and the ring magic matches.
 *
 * Events from successive calls are appended at ld->aio_events + events.
 * Each retry therefore only asks for the slots that are still free
 * (max - events) and only waits for the events still missing, so the
 * total returned never exceeds 'max' and io_getevents() is never asked
 * for a minimum larger than its 'nr' argument.
 *
 * Returns the number of events reaped, or the negative error of the
 * last call if that call failed.
 */
static int fio_libaio_getevents(struct thread_data *td, unsigned int min,
				unsigned int max, const struct timespec *t)
{
	struct libaio_data *ld = td->io_ops_data;
	struct libaio_options *o = td->eo;
	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	struct timespec __lt, *lt = NULL;
	int r, events = 0;

	if (t) {
		__lt = *t;
		lt = &__lt;
	}

	do {
		/*
		 * Slots still available for this call. events < min holds
		 * whenever we get here, so this is max - events as long as
		 * the caller passed min <= max; otherwise fall back to the
		 * plain 'max' rather than wrapping around.
		 */
		unsigned int room = max;

		if ((unsigned int) events < max)
			room = max - events;

		if (o->userspace_reap == 1
		    && actual_min == 0
		    && ((struct aio_ring *)(ld->aio_ctx))->magic
				== AIO_RING_MAGIC) {
			r = user_io_getevents(ld->aio_ctx, room,
				ld->aio_events + events);
		} else {
			r = io_getevents(ld->aio_ctx, actual_min,
				room, ld->aio_events + events, lt);
		}
		if (r > 0) {
			events += r;
			/* only wait for what is still missing next time */
			if ((unsigned int) r < actual_min)
				actual_min -= r;
			else
				actual_min = 0;
		} else if ((min && r == 0) || r == -EAGAIN) {
			/* nothing completed: push out anything still queued */
			fio_libaio_commit(td);
			if (actual_min)
				usleep(10);
		} else if (r != -EINTR)
			break;
	} while (events < min);

	return r < 0 ? r : events;
}

/*
 * Queue one io_u on the submission ring.
 *
 * Returns FIO_Q_BUSY when the ring already holds td->o.iodepth entries.
 * Sync and trim requests are not sent through aio: they are only
 * accepted when nothing is queued (otherwise FIO_Q_BUSY), executed
 * synchronously, and reported as FIO_Q_COMPLETED. Everything else is
 * placed at the ring head and reported as FIO_Q_QUEUED; the actual
 * io_submit() happens in fio_libaio_commit().
 */
static enum fio_q_status fio_libaio_queue(struct thread_data *td,
					  struct io_u *io_u)
{
	struct libaio_data *ld = td->io_ops_data;
	unsigned int slot;
	int is_sync;

	fio_ro_check(td, io_u);

	if (ld->queued == td->o.iodepth)
		return FIO_Q_BUSY;

	/*
	 * fsync is tricky: it can fail and has to be serialized with
	 * other io, because linux doesn't support aio fsync yet. Trim is
	 * handled the same way. Report busy while io is pending so that
	 * fio completes it first.
	 */
	is_sync = ddir_sync(io_u->ddir);
	if (is_sync || io_u->ddir == DDIR_TRIM) {
		if (ld->queued)
			return FIO_Q_BUSY;

		if (is_sync) {
			do_io_u_sync(td, io_u);
		} else {
			do_io_u_trim(td, io_u);
			io_u_mark_submit(td, 1);
			io_u_mark_complete(td, 1);
		}
		return FIO_Q_COMPLETED;
	}

	if (ld->use_cmdprio)
		fio_libaio_prio_prep(td, io_u);

	slot = ld->head;
	ld->iocbs[slot] = &io_u->iocb;
	ld->io_us[slot] = io_u;
	ring_inc(ld, &ld->head, 1);
	ld->queued++;

	return FIO_Q_QUEUED;
}

/*
 * Post-submission bookkeeping for the first 'nr' entries of 'io_us'.
 *
 * Does nothing unless fio_fill_issue_time() says issue times are
 * wanted. Otherwise every io_u gets the same timestamp, taken once
 * before the loop, and is passed to io_u_queued().
 */
static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
			      unsigned int nr)
{
	struct timespec stamp;
	struct io_u **cur, **end;

	if (!fio_fill_issue_time(td))
		return;

	fio_gettime(&stamp, NULL);

	for (cur = io_us, end = io_us + nr; cur != end; cur++) {
		memcpy(&(*cur)->issue_time, &stamp, sizeof(stamp));
		io_u_queued(td, *cur);
	}
}

/*
 * Submit the io_us that fio_libaio_queue() placed on the ring.
 *
 * Entries are handed to io_submit() starting at ld->tail, one
 * contiguous run at a time (a run never crosses the end of the ring).
 * Successfully submitted entries are time-stamped through
 * fio_libaio_queued(), accounted with io_u_mark_submit() and removed
 * from the ring by advancing 'tail'.
 *
 * Returns 0 when nothing was queued, when everything was submitted, or
 * when submission stopped on -EAGAIN/-ENOMEM with entries still queued;
 * otherwise the negative value returned by io_submit().
 */
static int fio_libaio_commit(struct thread_data *td)
{
	struct libaio_data *ld = td->io_ops_data;
	struct iocb **iocbs;
	struct io_u **io_us;
	struct timespec ts;
	int ret, wait_start = 0;

	if (!ld->queued)
		return 0;

	do {
		long nr = ld->queued;

		/* limit the batch to the contiguous part up to the ring end */
		nr = min((unsigned int) nr, ld->entries - ld->tail);
		io_us = ld->io_us + ld->tail;
		iocbs = ld->iocbs + ld->tail;

		ret = io_submit(ld->aio_ctx, nr, iocbs);
		if (ret > 0) {
			/* 'ret' entries were accepted; retire them */
			fio_libaio_queued(td, io_us, ret);
			io_u_mark_submit(td, ret);

			ld->queued -= ret;
			ring_inc(ld, &ld->tail, ret);
			ret = 0;
			wait_start = 0;
		} else if (ret == -EINTR || !ret) {
			/*
			 * Interrupted or nothing accepted: retry right away.
			 * NOTE(review): there is no back-off or limit here,
			 * so a persistent 0 return would spin - confirm that
			 * io_submit() cannot keep returning 0 for nr > 0.
			 */
			if (!ret)
				io_u_mark_submit(td, ret);
			wait_start = 0;
			continue;
		} else if (ret == -EAGAIN) {
			/*
			 * If we get EAGAIN, we should break out without
			 * error and let the upper layer reap some
			 * events for us. If we have no queued IO, we
			 * must loop here. If we loop for more than 30s,
			 * just error out, something must be buggy in the
			 * IO path.
			 *
			 * NOTE(review): ld->queued appears to be non-zero
			 * whenever this point is reached (entry check above
			 * plus the loop condition), which would make the
			 * wait/timeout code below unreachable - confirm
			 * whether an in-flight count was intended instead.
			 */
			if (ld->queued) {
				ret = 0;
				break;
			}
			if (!wait_start) {
				fio_gettime(&ts, NULL);
				wait_start = 1;
			} else if (mtime_since_now(&ts) > 30000) {
				log_err("fio: aio appears to be stalled, giving up\n");
				break;
			}
			usleep(1);
			continue;
		} else if (ret == -ENOMEM) {
			/*
			 * If we get -ENOMEM, reap events if we can. If
			 * we cannot, treat it as a fatal event since there's
			 * nothing we can do about it.
			 *
			 * NOTE(review): same remark as for -EAGAIN - the
			 * test below looks always true here, so -ENOMEM is
			 * never reported to the caller.
			 */
			if (ld->queued)
				ret = 0;
			break;
		} else
			break;
	} while (ld->queued);

	return ret;
}

/*
 * Ask the aio context to cancel the request backing 'io_u'.
 *
 * Returns whatever io_cancel() returns. The result event is written to
 * the first slot of ld->aio_events.
 */
static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u)
{
	struct libaio_data *ld = td->io_ops_data;

	return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events);
}

/*
 * Release everything fio_libaio_init() allocated for this thread.
 * Safe to call when no engine data was ever attached.
 */
static void fio_libaio_cleanup(struct thread_data *td)
{
	struct libaio_data *ld = td->io_ops_data;

	if (!ld)
		return;

	/*
	 * Destroying the context here is a work-around for huge RCU
	 * stalls at exit time. If it is skipped, exit_aio() tears the
	 * context down instead. Doing it here lets the freeing run in
	 * parallel, which speeds it up a lot.
	 */
	if (!(td->flags & TD_F_CHILD))
		io_destroy(ld->aio_ctx);

	free(ld->aio_events);
	free(ld->iocbs);
	free(ld->io_us);
	free(ld);
}

/*
 * Second init stage: create the aio context, sized to td->o.iodepth.
 *
 * Returns 0 on success. On failure the error from io_queue_init() is
 * recorded on the thread and 1 is returned.
 */
static int fio_libaio_post_init(struct thread_data *td)
{
	struct libaio_data *ld = td->io_ops_data;
	const int err = io_queue_init(td->o.iodepth, &ld->aio_ctx);

	if (!err)
		return 0;

	td_verror(td, -err, "io_queue_init");
	return 1;
}

/*
 * First init stage: allocate the per-thread engine state.
 *
 * Allocates struct libaio_data plus the event array and the two
 * submission rings, each with td->o.iodepth entries, and attaches the
 * result to td->io_ops_data. Then initialises command priority handling
 * via fio_cmdprio_init().
 *
 * Returns 0 on success and 1 on failure. An allocation failure is
 * reported as ENOMEM and leaves td->io_ops_data untouched, with nothing
 * leaked. A fio_cmdprio_init() failure is reported as EINVAL with the
 * state still attached, as before.
 */
static int fio_libaio_init(struct thread_data *td)
{
	struct libaio_data *ld;
	struct libaio_options *o = td->eo;
	struct cmdprio *cmdprio = &o->cmdprio;
	int ret;

	ld = calloc(1, sizeof(*ld));
	if (!ld) {
		td_verror(td, ENOMEM, "calloc");
		return 1;
	}

	ld->entries = td->o.iodepth;
	ld->is_pow2 = is_power_of_2(ld->entries);
	ld->aio_events = calloc(ld->entries, sizeof(*ld->aio_events));
	ld->iocbs = calloc(ld->entries, sizeof(*ld->iocbs));
	ld->io_us = calloc(ld->entries, sizeof(*ld->io_us));

	/*
	 * calloc() may legitimately return NULL for a zero-sized request,
	 * so only treat NULL as a failure when entries were asked for.
	 */
	if (ld->entries && (!ld->aio_events || !ld->iocbs || !ld->io_us)) {
		free(ld->aio_events);
		free(ld->iocbs);
		free(ld->io_us);
		free(ld);
		td_verror(td, ENOMEM, "calloc");
		return 1;
	}

	td->io_ops_data = ld;

	ret = fio_cmdprio_init(td, cmdprio, &ld->use_cmdprio);
	if (ret) {
		td_verror(td, EINVAL, "fio_libaio_init");
		return 1;
	}

	return 0;
}

/*
 * Engine descriptor registered with fio under the name "libaio". It
 * wires the functions defined in this file into the engine callbacks
 * and publishes the option table together with the size of the option
 * struct.
 * NOTE(review): FIO_ASYNCIO_SYNC_TRIM presumably tells the core that
 * trims are executed synchronously (see fio_libaio_queue()) - confirm
 * against the flag's definition.
 */
FIO_STATIC struct ioengine_ops ioengine = {
	.name			= "libaio",
	.version		= FIO_IOOPS_VERSION,
	.flags			= FIO_ASYNCIO_SYNC_TRIM,
	.init			= fio_libaio_init,
	.post_init		= fio_libaio_post_init,
	.prep			= fio_libaio_prep,
	.queue			= fio_libaio_queue,
	.commit			= fio_libaio_commit,
	.cancel			= fio_libaio_cancel,
	.getevents		= fio_libaio_getevents,
	.event			= fio_libaio_event,
	.cleanup		= fio_libaio_cleanup,
	.open_file		= generic_open_file,
	.close_file		= generic_close_file,
	.get_file_size		= generic_get_file_size,
	.options		= options,
	.option_struct_size	= sizeof(struct libaio_options),
};

/*
 * Register the libaio engine descriptor. Marked 'fio_init'.
 * NOTE(review): presumably a constructor attribute so this runs at
 * load time - confirm against the macro's definition.
 */
static void fio_init fio_libaio_register(void)
{
	register_ioengine(&ioengine);
}

/*
 * Unregister the libaio engine descriptor. Marked 'fio_exit'.
 * NOTE(review): presumably a destructor attribute so this runs at
 * exit/unload time - confirm against the macro's definition.
 */
static void fio_exit fio_libaio_unregister(void)
{
	unregister_ioengine(&ioengine);
}
Commit	Line	Data
	1	/*
	2	* libaio engine
	3	*
	4	* IO engine using the Linux native aio interface.
	5	*
	6	*/
	7	#include <stdlib.h>
	8	#include <unistd.h>
	9	#include <errno.h>
	10	#include <libaio.h>
	11	#include <sys/time.h>
	12	#include <sys/resource.h>
	13
	14	#include "../fio.h"
	15	#include "../lib/pow2.h"
	16	#include "../optgroup.h"
	17	#include "../lib/memalign.h"
	18	#include "cmdprio.h"
	19
	20	/* Should be defined in newest aio_abi.h */
	21	#ifndef IOCB_FLAG_IOPRIO
	22	#define IOCB_FLAG_IOPRIO (1 << 1)
	23	#endif
	24
	25	/* Hack for libaio < 0.3.111 */
	26	#ifndef CONFIG_LIBAIO_RW_FLAGS
	27	#define aio_rw_flags __pad2
	28	#endif
	29
	30	static int fio_libaio_commit(struct thread_data *td);
	31	static int fio_libaio_init(struct thread_data *td);
	32
	33	struct libaio_data {
	34	io_context_t aio_ctx;
	35	struct io_event *aio_events;
	36	struct iocb **iocbs;
	37	struct io_u **io_us;
	38
	39	struct io_u **io_u_index;
	40
	41	/*
	42	* Basic ring buffer. 'head' is incremented in _queue(), and
	43	* 'tail' is incremented in _commit(). We keep 'queued' so
	44	* that we know if the ring is full or empty, when
	45	* 'head' == 'tail'. 'entries' is the ring size, and
	46	* 'is_pow2' is just an optimization to use AND instead of
	47	* modulus to get the remainder on ring increment.
	48	*/
	49	int is_pow2;
	50	unsigned int entries;
	51	unsigned int queued;
	52	unsigned int head;
	53	unsigned int tail;
	54
	55	bool use_cmdprio;
	56	};
	57
	58	struct libaio_options {
	59	void *pad;
	60	unsigned int userspace_reap;
	61	struct cmdprio cmdprio;
	62	unsigned int nowait;
	63	};
	64
	65	static struct fio_option options[] = {
	66	{
	67	.name = "userspace_reap",
	68	.lname = "Libaio userspace reaping",
	69	.type = FIO_OPT_STR_SET,
	70	.off1 = offsetof(struct libaio_options, userspace_reap),
	71	.help = "Use alternative user-space reap implementation",
	72	.category = FIO_OPT_C_ENGINE,
	73	.group = FIO_OPT_G_LIBAIO,
	74	},
	75	#ifdef FIO_HAVE_IOPRIO_CLASS
	76	{
	77	.name = "cmdprio_percentage",
	78	.lname = "high priority percentage",
	79	.type = FIO_OPT_INT,
	80	.off1 = offsetof(struct libaio_options,
	81	cmdprio.percentage[DDIR_READ]),
	82	.off2 = offsetof(struct libaio_options,
	83	cmdprio.percentage[DDIR_WRITE]),
	84	.minval = 0,
	85	.maxval = 100,
	86	.help = "Send high priority I/O this percentage of the time",
	87	.category = FIO_OPT_C_ENGINE,
	88	.group = FIO_OPT_G_LIBAIO,
	89	},
	90	#else
	91	{
	92	.name = "cmdprio_percentage",
	93	.lname = "high priority percentage",
	94	.type = FIO_OPT_UNSUPPORTED,
	95	.help = "Your platform does not support I/O priority classes",
	96	},
	97	#endif
	98	{
	99	.name = "nowait",
	100	.lname = "RWF_NOWAIT",
	101	.type = FIO_OPT_BOOL,
	102	.off1 = offsetof(struct libaio_options, nowait),
	103	.help = "Set RWF_NOWAIT for reads/writes",
	104	.category = FIO_OPT_C_ENGINE,
	105	.group = FIO_OPT_G_LIBAIO,
	106	},
	107	{
	108	.name = NULL,
	109	},
	110	};
	111
	112	static inline void ring_inc(struct libaio_data *ld, unsigned int *val,
	113	unsigned int add)
	114	{
	115	if (ld->is_pow2)
	116	*val = (*val + add) & (ld->entries - 1);
	117	else
	118	*val = (*val + add) % ld->entries;
	119	}
	120
	121	static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u)
	122	{
	123	struct libaio_options *o = td->eo;
	124	struct fio_file *f = io_u->file;
	125	struct iocb *iocb = &io_u->iocb;
	126
	127	if (io_u->ddir == DDIR_READ) {
	128	io_prep_pread(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
	129	if (o->nowait)
	130	iocb->aio_rw_flags |= RWF_NOWAIT;
	131	} else if (io_u->ddir == DDIR_WRITE) {
	132	io_prep_pwrite(iocb, f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
	133	if (o->nowait)
	134	iocb->aio_rw_flags |= RWF_NOWAIT;
	135	} else if (ddir_sync(io_u->ddir))
	136	io_prep_fsync(iocb, f->fd);
	137
	138	return 0;
	139	}
	140
	141	static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u)
	142	{
	143	struct libaio_options *o = td->eo;
	144	struct cmdprio *cmdprio = &o->cmdprio;
	145	unsigned int p = cmdprio->percentage[io_u->ddir];
	146
	147	if (p && rand_between(&td->prio_state, 0, 99) < p) {
	148	io_u->iocb.aio_reqprio = ioprio_value(IOPRIO_CLASS_RT, 0);
	149	io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO;
	150	io_u->flags |= IO_U_F_PRIORITY;
	151	}
	152	}
	153
	154	static struct io_u *fio_libaio_event(struct thread_data *td, int event)
	155	{
	156	struct libaio_data *ld = td->io_ops_data;
	157	struct io_event *ev;
	158	struct io_u *io_u;
	159
	160	ev = ld->aio_events + event;
	161	io_u = container_of(ev->obj, struct io_u, iocb);
	162
	163	if (ev->res != io_u->xfer_buflen) {
	164	if (ev->res > io_u->xfer_buflen)
	165	io_u->error = -ev->res;
	166	else
	167	io_u->resid = io_u->xfer_buflen - ev->res;
	168	} else
	169	io_u->error = 0;
	170
	171	return io_u;
	172	}
	173
	174	struct aio_ring {
	175	unsigned id; /** kernel internal index number */
	176	unsigned nr; /** number of io_events */
	177	unsigned head;
	178	unsigned tail;
	179
	180	unsigned magic;
	181	unsigned compat_features;
	182	unsigned incompat_features;
	183	unsigned header_length; /** size of aio_ring */
	184
	185	struct io_event events[0];
	186	};
	187
	188	#define AIO_RING_MAGIC 0xa10a10a1
	189
	190	static int user_io_getevents(io_context_t aio_ctx, unsigned int max,
	191	struct io_event *events)
	192	{
	193	long i = 0;
	194	unsigned head;
	195	struct aio_ring *ring = (struct aio_ring*) aio_ctx;
	196
	197	while (i < max) {
	198	head = ring->head;
	199
	200	if (head == ring->tail) {
	201	/* There are no more completions */
	202	break;
	203	} else {
	204	/* There is another completion to reap */
	205	events[i] = ring->events[head];
	206	atomic_store_release(&ring->head,
	207	(head + 1) % ring->nr);
	208	i++;
	209	}
	210	}
	211
	212	return i;
	213	}
	214
	215	static int fio_libaio_getevents(struct thread_data *td, unsigned int min,
	216	unsigned int max, const struct timespec *t)
	217	{
	218	struct libaio_data *ld = td->io_ops_data;
	219	struct libaio_options *o = td->eo;
	220	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	221	struct timespec __lt, *lt = NULL;
	222	int r, events = 0;
	223
	224	if (t) {
	225	__lt = *t;
	226	lt = &__lt;
	227	}
	228
	229	do {
	230	if (o->userspace_reap == 1
	231	&& actual_min == 0
	232	&& ((struct aio_ring *)(ld->aio_ctx))->magic
	233	== AIO_RING_MAGIC) {
	234	r = user_io_getevents(ld->aio_ctx, max,
	235	ld->aio_events + events);
	236	} else {
	237	r = io_getevents(ld->aio_ctx, actual_min,
	238	max, ld->aio_events + events, lt);
	239	}
	240	if (r > 0)
	241	events += r;
	242	else if ((min && r == 0) || r == -EAGAIN) {
	243	fio_libaio_commit(td);
	244	if (actual_min)
	245	usleep(10);
	246	} else if (r != -EINTR)
	247	break;
	248	} while (events < min);
	249
	250	return r < 0 ? r : events;
	251	}
	252
	253	static enum fio_q_status fio_libaio_queue(struct thread_data *td,
	254	struct io_u *io_u)
	255	{
	256	struct libaio_data *ld = td->io_ops_data;
	257
	258	fio_ro_check(td, io_u);
	259
	260	if (ld->queued == td->o.iodepth)
	261	return FIO_Q_BUSY;
	262
	263	/*
	264	* fsync is tricky, since it can fail and we need to do it
	265	* serialized with other io. the reason is that linux doesn't
	266	* support aio fsync yet. So return busy for the case where we
	267	* have pending io, to let fio complete those first.
	268	*/
	269	if (ddir_sync(io_u->ddir)) {
	270	if (ld->queued)
	271	return FIO_Q_BUSY;
	272
	273	do_io_u_sync(td, io_u);
	274	return FIO_Q_COMPLETED;
	275	}
	276
	277	if (io_u->ddir == DDIR_TRIM) {
	278	if (ld->queued)
	279	return FIO_Q_BUSY;
	280
	281	do_io_u_trim(td, io_u);
	282	io_u_mark_submit(td, 1);
	283	io_u_mark_complete(td, 1);
	284	return FIO_Q_COMPLETED;
	285	}
	286
	287	if (ld->use_cmdprio)
	288	fio_libaio_prio_prep(td, io_u);
	289
	290	ld->iocbs[ld->head] = &io_u->iocb;
	291	ld->io_us[ld->head] = io_u;
	292	ring_inc(ld, &ld->head, 1);
	293	ld->queued++;
	294	return FIO_Q_QUEUED;
	295	}
	296
	297	static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
	298	unsigned int nr)
	299	{
	300	struct timespec now;
	301	unsigned int i;
	302
	303	if (!fio_fill_issue_time(td))
	304	return;
	305
	306	fio_gettime(&now, NULL);
	307
	308	for (i = 0; i < nr; i++) {
	309	struct io_u *io_u = io_us[i];
	310
	311	memcpy(&io_u->issue_time, &now, sizeof(now));
	312	io_u_queued(td, io_u);
	313	}
	314	}
	315
	316	static int fio_libaio_commit(struct thread_data *td)
	317	{
	318	struct libaio_data *ld = td->io_ops_data;
	319	struct iocb **iocbs;
	320	struct io_u **io_us;
	321	struct timespec ts;
	322	int ret, wait_start = 0;
	323
	324	if (!ld->queued)
	325	return 0;
	326
	327	do {
	328	long nr = ld->queued;
	329
	330	nr = min((unsigned int) nr, ld->entries - ld->tail);
	331	io_us = ld->io_us + ld->tail;
	332	iocbs = ld->iocbs + ld->tail;
	333
	334	ret = io_submit(ld->aio_ctx, nr, iocbs);
	335	if (ret > 0) {
	336	fio_libaio_queued(td, io_us, ret);
	337	io_u_mark_submit(td, ret);
	338
	339	ld->queued -= ret;
	340	ring_inc(ld, &ld->tail, ret);
	341	ret = 0;
	342	wait_start = 0;
	343	} else if (ret == -EINTR || !ret) {
	344	if (!ret)
	345	io_u_mark_submit(td, ret);
	346	wait_start = 0;
	347	continue;
	348	} else if (ret == -EAGAIN) {
	349	/*
	350	* If we get EAGAIN, we should break out without
	351	* error and let the upper layer reap some
	352	* events for us. If we have no queued IO, we
	353	* must loop here. If we loop for more than 30s,
	354	* just error out, something must be buggy in the
	355	* IO path.
	356	*/
	357	if (ld->queued) {
	358	ret = 0;
	359	break;
	360	}
	361	if (!wait_start) {
	362	fio_gettime(&ts, NULL);
	363	wait_start = 1;
	364	} else if (mtime_since_now(&ts) > 30000) {
	365	log_err("fio: aio appears to be stalled, giving up\n");
	366	break;
	367	}
	368	usleep(1);
	369	continue;
	370	} else if (ret == -ENOMEM) {
	371	/*
	372	* If we get -ENOMEM, reap events if we can. If
	373	* we cannot, treat it as a fatal event since there's
	374	* nothing we can do about it.
	375	*/
	376	if (ld->queued)
	377	ret = 0;
	378	break;
	379	} else
	380	break;
	381	} while (ld->queued);
	382
	383	return ret;
	384	}
	385
	386	static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u)
	387	{
	388	struct libaio_data *ld = td->io_ops_data;
	389
	390	return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events);
	391	}
	392
	393	static void fio_libaio_cleanup(struct thread_data *td)
	394	{
	395	struct libaio_data *ld = td->io_ops_data;
	396
	397	if (ld) {
	398	/*
	399	* Work-around to avoid huge RCU stalls at exit time. If we
	400	* don't do this here, then it'll be torn down by exit_aio().
	401	* But for that case we can parallellize the freeing, thus
	402	* speeding it up a lot.
	403	*/
	404	if (!(td->flags & TD_F_CHILD))
	405	io_destroy(ld->aio_ctx);
	406	free(ld->aio_events);
	407	free(ld->iocbs);
	408	free(ld->io_us);
	409	free(ld);
	410	}
	411	}
	412
	413	static int fio_libaio_post_init(struct thread_data *td)
	414	{
	415	struct libaio_data *ld = td->io_ops_data;
	416	int err;
	417
	418	err = io_queue_init(td->o.iodepth, &ld->aio_ctx);
	419	if (err) {
	420	td_verror(td, -err, "io_queue_init");
	421	return 1;
	422	}
	423
	424	return 0;
	425	}
	426
	427	static int fio_libaio_init(struct thread_data *td)
	428	{
	429	struct libaio_data *ld;
	430	struct libaio_options *o = td->eo;
	431	struct cmdprio *cmdprio = &o->cmdprio;
	432	int ret;
	433
	434	ld = calloc(1, sizeof(*ld));
	435
	436	ld->entries = td->o.iodepth;
	437	ld->is_pow2 = is_power_of_2(ld->entries);
	438	ld->aio_events = calloc(ld->entries, sizeof(struct io_event));
	439	ld->iocbs = calloc(ld->entries, sizeof(struct iocb *));
	440	ld->io_us = calloc(ld->entries, sizeof(struct io_u *));
	441
	442	td->io_ops_data = ld;
	443
	444	ret = fio_cmdprio_init(td, cmdprio, &ld->use_cmdprio);
	445	if (ret) {
	446	td_verror(td, EINVAL, "fio_libaio_init");
	447	return 1;
	448	}
	449
	450	return 0;
	451	}
	452
	453	FIO_STATIC struct ioengine_ops ioengine = {
	454	.name = "libaio",
	455	.version = FIO_IOOPS_VERSION,
	456	.flags = FIO_ASYNCIO_SYNC_TRIM,
	457	.init = fio_libaio_init,
	458	.post_init = fio_libaio_post_init,
	459	.prep = fio_libaio_prep,
	460	.queue = fio_libaio_queue,
	461	.commit = fio_libaio_commit,
	462	.cancel = fio_libaio_cancel,
	463	.getevents = fio_libaio_getevents,
	464	.event = fio_libaio_event,
	465	.cleanup = fio_libaio_cleanup,
	466	.open_file = generic_open_file,
	467	.close_file = generic_close_file,
	468	.get_file_size = generic_get_file_size,
	469	.options = options,
	470	.option_struct_size = sizeof(struct libaio_options),
	471	};
	472
	473	static void fio_init fio_libaio_register(void)
	474	{
	475	register_ioengine(&ioengine);
	476	}
	477
	478	static void fio_exit fio_libaio_unregister(void)
	479	{
	480	unregister_ioengine(&ioengine);
	481	}