/*
 * IO engine using the new native Linux aio io_uring interface. See:
 *
 * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring
 */
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/time.h>
#include <sys/resource.h>

#include "../fio.h"
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
#include "../lib/fls.h"
#include "../lib/roundup.h"

#ifdef ARCH_HAVE_IOURING

#include "../lib/types.h"
#include "../os/linux/io_uring.h"
#include "cmdprio.h"

struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

struct ioring_mmap {
	void *ptr;
	size_t len;
};

struct ioring_data {
	int ring_fd;

	struct io_u **io_u_index;

	int *fds;

	struct io_sq_ring sq_ring;
	struct io_uring_sqe *sqes;
	struct iovec *iovecs;
	unsigned sq_ring_mask;

	struct io_cq_ring cq_ring;
	unsigned cq_ring_mask;

	int queued;
	int cq_ring_off;
	unsigned iodepth;
	bool ioprio_class_set;
	bool ioprio_set;
	int prepped;

	struct ioring_mmap mmap[3];

	bool use_cmdprio;
};

struct ioring_options {
	void *pad;
	unsigned int hipri;
	struct cmdprio cmdprio;
	unsigned int fixedbufs;
	unsigned int registerfiles;
	unsigned int sqpoll_thread;
	unsigned int sqpoll_set;
	unsigned int sqpoll_cpu;
	unsigned int nonvectored;
	unsigned int uncached;
	unsigned int nowait;
	unsigned int force_async;
};

static const int ddir_to_op[2][2] = {
	{ IORING_OP_READV, IORING_OP_READ },
	{ IORING_OP_WRITEV, IORING_OP_WRITE },
};

static const int fixed_ddir_to_op[2] = {
	IORING_OP_READ_FIXED, IORING_OP_WRITE_FIXED,
};

static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val)
{
	struct ioring_options *o = data;

	o->sqpoll_cpu = *val;
	o->sqpoll_set = 1;
	return 0;
}

static struct fio_option options[] = {
	{
		.name	= "hipri",
		.lname	= "High Priority",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, hipri),
		.help	= "Use polled IO completions",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
#ifdef FIO_HAVE_IOPRIO_CLASS
	{
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options,
				cmdprio.percentage[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				cmdprio.percentage[DDIR_WRITE]),
		.help	= "Send high priority I/O this percentage of the time",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "cmdprio_class",
		.lname	= "Asynchronous I/O priority class",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options,
				cmdprio.class[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				cmdprio.class[DDIR_WRITE]),
		.help	= "Set asynchronous IO priority class",
		.minval	= IOPRIO_MIN_PRIO_CLASS + 1,
		.maxval	= IOPRIO_MAX_PRIO_CLASS,
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "cmdprio",
		.lname	= "Asynchronous I/O priority level",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options,
				cmdprio.level[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				cmdprio.level[DDIR_WRITE]),
		.help	= "Set asynchronous IO priority level",
		.minval	= IOPRIO_MIN_PRIO,
		.maxval	= IOPRIO_MAX_PRIO,
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
#else
	{
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
	{
		.name	= "cmdprio_class",
		.lname	= "Asynchronous I/O priority class",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
	{
		.name	= "cmdprio",
		.lname	= "Asynchronous I/O priority level",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
#endif
	{
		.name	= "fixedbufs",
		.lname	= "Fixed (pre-mapped) IO buffers",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, fixedbufs),
		.help	= "Pre map IO buffers",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "registerfiles",
		.lname	= "Register file set",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, registerfiles),
		.help	= "Pre-open/register files",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "sqthread_poll",
		.lname	= "Kernel SQ thread polling",
		.off1	= offsetof(struct ioring_options, sqpoll_thread),
		.help	= "Offload submission/completion to kernel thread",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "sqthread_poll_cpu",
		.lname	= "SQ Thread Poll CPU",
		.type	= FIO_OPT_INT,
		.cb	= fio_ioring_sqpoll_cb,
		.help	= "What CPU to run SQ thread polling on",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "nonvectored",
		.lname	= "Non-vectored",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, nonvectored),
		.def	= "-1",
		.help	= "Use non-vectored read/write commands",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "uncached",
		.lname	= "Uncached",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, uncached),
		.help	= "Use RWF_UNCACHED for buffered read/writes",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "nowait",
		.lname	= "RWF_NOWAIT",
		.type	= FIO_OPT_BOOL,
		.off1	= offsetof(struct ioring_options, nowait),
		.help	= "Use RWF_NOWAIT for reads/writes",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "force_async",
		.lname	= "Force async",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, force_async),
		.help	= "Set IOSQE_ASYNC every N requests",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{ .name = NULL, },
};
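
/*
 * The engine drives the ring through the raw syscalls rather than through
 * liburing; this is the io_uring_enter(2) wrapper used both to submit SQEs
 * and to wait for completions.
 */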
static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
	return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
			min_complete, flags, NULL, 0);
}
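
/*
 * Fill the SQE for an io_u: pick fixed vs. regular files, fixed buffers or
 * (non-)vectored read/write opcodes, fsync/sync_file_range variants, and
 * apply the configured ioprio and RWF_* flags.
 */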
static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct fio_file *f = io_u->file;
	struct io_uring_sqe *sqe;

	sqe = &ld->sqes[io_u->index];

	if (o->registerfiles) {
		sqe->fd = f->engine_pos;
		sqe->flags = IOSQE_FIXED_FILE;
	} else {
		sqe->fd = f->fd;
		sqe->flags = 0;
	}

	if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
		if (o->fixedbufs) {
			sqe->opcode = fixed_ddir_to_op[io_u->ddir];
			sqe->addr = (unsigned long) io_u->xfer_buf;
			sqe->len = io_u->xfer_buflen;
			sqe->buf_index = io_u->index;
		} else {
			struct iovec *iov = &ld->iovecs[io_u->index];

			/*
			 * Update based on actual io_u, requeue could have
			 * adjusted these
			 */
			iov->iov_base = io_u->xfer_buf;
			iov->iov_len = io_u->xfer_buflen;

			sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored];
			if (o->nonvectored) {
				sqe->addr = (unsigned long) iov->iov_base;
				sqe->len = iov->iov_len;
			} else {
				sqe->addr = (unsigned long) iov;
				sqe->len = 1;
			}
		}
		if (!td->o.odirect && o->uncached)
			sqe->rw_flags |= RWF_UNCACHED;
		if (o->nowait)
			sqe->rw_flags |= RWF_NOWAIT;
		if (ld->ioprio_class_set)
			sqe->ioprio = td->o.ioprio_class << 13;
		if (ld->ioprio_set)
			sqe->ioprio |= td->o.ioprio;
		sqe->off = io_u->offset;
	} else if (ddir_sync(io_u->ddir)) {
		sqe->ioprio = 0;
		if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
			sqe->off = f->first_write;
			sqe->len = f->last_write - f->first_write;
			sqe->sync_range_flags = td->o.sync_file_range;
			sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
		} else {
			if (io_u->ddir == DDIR_DATASYNC)
				sqe->fsync_flags |= IORING_FSYNC_DATASYNC;
			sqe->opcode = IORING_OP_FSYNC;
		}
	}

	if (o->force_async && ++ld->prepped == o->force_async) {
		ld->prepped = 0;
		sqe->flags |= IOSQE_ASYNC;
	}

	sqe->user_data = (unsigned long) io_u;
	return 0;
}
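
/*
 * Map a completed CQE back to its io_u (stored in user_data) and translate
 * the result into io_u->error or a residual byte count.
 */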
static struct io_u *fio_ioring_event(struct thread_data *td, int event)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_uring_cqe *cqe;
	struct io_u *io_u;
	unsigned index;

	index = (event + ld->cq_ring_off) & ld->cq_ring_mask;

	cqe = &ld->cq_ring.cqes[index];
	io_u = (struct io_u *) (uintptr_t) cqe->user_data;

	if (cqe->res != io_u->xfer_buflen) {
		if (cqe->res > io_u->xfer_buflen)
			io_u->error = -cqe->res;
		else
			io_u->resid = io_u->xfer_buflen - cqe->res;
	} else
		io_u->error = 0;

	return io_u;
}
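
/*
 * Reap completions already posted in the CQ ring, without entering the
 * kernel, and advance the ring head.
 */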
static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events,
				  unsigned int max)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned head, reaped = 0;

	head = *ring->head;
	do {
		if (head == atomic_load_acquire(ring->tail))
			break;
		reaped++;
		head++;
	} while (reaped + events < max);

	atomic_store_release(ring->head, head);
	return reaped;
}
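
/*
 * Wait for at least 'min' completions: poll the CQ ring, and only call
 * io_uring_enter(GETEVENTS) when nothing is pending and SQPOLL is not in use.
 */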
static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
				unsigned int max, const struct timespec *t)
{
	struct ioring_data *ld = td->io_ops_data;
	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	struct ioring_options *o = td->eo;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned events = 0;
	int r;

	ld->cq_ring_off = *ring->head;
	do {
		r = fio_ioring_cqring_reap(td, events, max);
		if (r) {
			events += r;
			if (actual_min != 0)
				actual_min -= r;
			continue;
		}

		if (!o->sqpoll_thread) {
			r = io_uring_enter(ld, 0, actual_min,
						IORING_ENTER_GETEVENTS);
			if (r < 0) {
				if (errno == EAGAIN || errno == EINTR)
					continue;
				td_verror(td, errno, "io_uring_enter");
				break;
			}
		}
	} while (events < min);

	return r < 0 ? r : events;
}
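
/*
 * For cmdprio_percentage, randomly promote this request: set the SQE ioprio
 * from the configured class/level and mark the io_u as high priority.
 */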
static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_options *o = td->eo;
	struct ioring_data *ld = td->io_ops_data;
	struct io_uring_sqe *sqe = &ld->sqes[io_u->index];
	struct cmdprio *cmdprio = &o->cmdprio;
	enum fio_ddir ddir = io_u->ddir;
	unsigned int p = cmdprio->percentage[ddir];

	if (p && rand_between(&td->prio_state, 0, 99) < p) {
		sqe->ioprio =
			ioprio_value(cmdprio->class[ddir], cmdprio->level[ddir]);
		io_u->flags |= IO_U_F_PRIORITY;
	}
}
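
/*
 * Queue an io_u by writing its index into the next SQ ring slot. Trims are
 * executed synchronously here; a full ring or queue returns FIO_Q_BUSY.
 */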
static enum fio_q_status fio_ioring_queue(struct thread_data *td,
					  struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_sq_ring *ring = &ld->sq_ring;
	unsigned tail, next_tail;

	fio_ro_check(td, io_u);

	if (ld->queued == ld->iodepth)
		return FIO_Q_BUSY;

	if (io_u->ddir == DDIR_TRIM) {
		if (ld->queued)
			return FIO_Q_BUSY;

		do_io_u_trim(td, io_u);
		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		return FIO_Q_COMPLETED;
	}

	tail = *ring->tail;
	next_tail = tail + 1;
	if (next_tail == atomic_load_acquire(ring->head))
		return FIO_Q_BUSY;

	if (ld->use_cmdprio)
		fio_ioring_prio_prep(td, io_u);
	ring->array[tail & ld->sq_ring_mask] = io_u->index;
	atomic_store_release(ring->tail, next_tail);

	ld->queued++;
	return FIO_Q_QUEUED;
}
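
/*
 * Record issue times for the 'nr' entries just submitted, if fio is
 * tracking submission latency.
 */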
static void fio_ioring_queued(struct thread_data *td, int start, int nr)
{
	struct ioring_data *ld = td->io_ops_data;
	struct timespec now;

	if (!fio_fill_issue_time(td))
		return;

	fio_gettime(&now, NULL);

	while (nr--) {
		struct io_sq_ring *ring = &ld->sq_ring;
		int index = ring->array[start & ld->sq_ring_mask];
		struct io_u *io_u = ld->io_u_index[index];

		memcpy(&io_u->issue_time, &now, sizeof(now));
		io_u_queued(td, io_u);

		start++;
	}
}
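
/*
 * Submit everything queued so far. With SQPOLL the kernel thread consumes
 * the SQ ring on its own and only needs an occasional wakeup; otherwise
 * io_uring_enter() is called until all queued SQEs have been accepted.
 */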
static int fio_ioring_commit(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int ret;

	if (!ld->queued)
		return 0;

	/*
	 * Kernel side does submission. Just need to check if the ring is
	 * flagged as needing a kick, if so, call io_uring_enter(). This
	 * only happens if we've been idle too long.
	 */
	if (o->sqpoll_thread) {
		struct io_sq_ring *ring = &ld->sq_ring;
		unsigned flags;

		flags = atomic_load_acquire(ring->flags);
		if (flags & IORING_SQ_NEED_WAKEUP)
			io_uring_enter(ld, ld->queued, 0,
					IORING_ENTER_SQ_WAKEUP);
		ld->queued = 0;
		return 0;
	}

	do {
		unsigned start = *ld->sq_ring.head;
		long nr = ld->queued;

		ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
		if (ret > 0) {
			fio_ioring_queued(td, start, ret);
			io_u_mark_submit(td, ret);

			ld->queued -= ret;
			ret = 0;
		} else if (!ret) {
			io_u_mark_submit(td, ret);
			continue;
		} else {
			if (errno == EAGAIN || errno == EINTR) {
				ret = fio_ioring_cqring_reap(td, 0, ld->queued);
				if (ret)
					continue;
				/* Shouldn't happen */
				usleep(1);
				continue;
			}
			td_verror(td, errno, "io_uring_enter submit");
			break;
		}
	} while (ld->queued);

	return ret;
}

static void fio_ioring_unmap(struct ioring_data *ld)
{
	int i;

	for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
		munmap(ld->mmap[i].ptr, ld->mmap[i].len);
	close(ld->ring_fd);
}

static void fio_ioring_cleanup(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;

	if (ld) {
		if (!(td->flags & TD_F_CHILD))
			fio_ioring_unmap(ld);

		free(ld->io_u_index);
		free(ld->iovecs);
		free(ld->fds);
		free(ld);
	}
}
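
/*
 * Map the three shared regions of the ring: the SQ ring, the SQE array, and
 * the CQ ring, using the offsets returned by io_uring_setup().
 */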
static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p)
{
	struct io_sq_ring *sring = &ld->sq_ring;
	struct io_cq_ring *cring = &ld->cq_ring;
	void *ptr;

	ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32);
	ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_SQ_RING);
	ld->mmap[0].ptr = ptr;
	sring->head = ptr + p->sq_off.head;
	sring->tail = ptr + p->sq_off.tail;
	sring->ring_mask = ptr + p->sq_off.ring_mask;
	sring->ring_entries = ptr + p->sq_off.ring_entries;
	sring->flags = ptr + p->sq_off.flags;
	sring->array = ptr + p->sq_off.array;
	ld->sq_ring_mask = *sring->ring_mask;

	ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
	ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_SQES);
	ld->mmap[1].ptr = ld->sqes;

	ld->mmap[2].len = p->cq_off.cqes +
				p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_CQ_RING);
	ld->mmap[2].ptr = ptr;
	cring->head = ptr + p->cq_off.head;
	cring->tail = ptr + p->cq_off.tail;
	cring->ring_mask = ptr + p->cq_off.ring_mask;
	cring->ring_entries = ptr + p->cq_off.ring_entries;
	cring->cqes = ptr + p->cq_off.cqes;
	ld->cq_ring_mask = *cring->ring_mask;
	return 0;
}
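
/*
 * Use IORING_REGISTER_PROBE to find out whether non-vectored READ/WRITE are
 * supported, and default 'nonvectored' accordingly if the user didn't set it.
 */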
static void fio_ioring_probe(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_uring_probe *p;
	int ret;

	/* already set by user, don't touch */
	if (o->nonvectored != -1)
		return;

	/* default to off, as that's always safe */
	o->nonvectored = 0;

	p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	if (!p)
		return;

	memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	ret = syscall(__NR_io_uring_register, ld->ring_fd,
			IORING_REGISTER_PROBE, p, 256);
	if (ret < 0)
		goto out;

	if (IORING_OP_WRITE > p->ops_len)
		goto out;

	if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED) &&
	    (p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED))
		o->nonvectored = 1;
out:
	free(p);
}
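
/*
 * Create the ring: map hipri to IORING_SETUP_IOPOLL and sqthread_poll to
 * IORING_SETUP_SQPOLL (optionally pinned with IORING_SETUP_SQ_AFF), then
 * register fixed buffers if requested.
 */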
static int fio_ioring_queue_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int depth = td->o.iodepth;
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));

	if (o->hipri)
		p.flags |= IORING_SETUP_IOPOLL;
	if (o->sqpoll_thread) {
		p.flags |= IORING_SETUP_SQPOLL;
		if (o->sqpoll_set) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = o->sqpoll_cpu;
		}
	}

	ret = syscall(__NR_io_uring_setup, depth, &p);
	if (ret < 0)
		return ret;

	ld->ring_fd = ret;

	fio_ioring_probe(td);

	if (o->fixedbufs) {
		ret = syscall(__NR_io_uring_register, ld->ring_fd,
				IORING_REGISTER_BUFFERS, ld->iovecs, depth);
		if (ret < 0)
			return ret;
	}

	return fio_ioring_mmap(ld, &p);
}
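
/*
 * Open all files up front and register them with IORING_REGISTER_FILES so
 * SQEs can reference them via IOSQE_FIXED_FILE.
 */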
static int fio_ioring_register_files(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct fio_file *f;
	unsigned int i;
	int ret;

	ld->fds = calloc(td->o.nr_files, sizeof(int));

	for_each_file(td, f, i) {
		ret = generic_open_file(td, f);
		if (ret)
			goto err;
		ld->fds[f->fileno] = f->fd;
		f->engine_pos = f->fileno;
	}

	ret = syscall(__NR_io_uring_register, ld->ring_fd,
			IORING_REGISTER_FILES, ld->fds, td->o.nr_files);
	if (ret) {
err:
		free(ld->fds);
		ld->fds = NULL;
	}

	/*
	 * Pretend the file is closed again, and really close it if we hit
	 * an error.
	 */
	for_each_file(td, f, i) {
		if (ret) {
			int ret2;

			ret2 = generic_close_file(td, f);
			if (ret2 < 0)
				ret = ret2;
		} else
			fio_file_clear_open(f);
	}

	return ret;
}
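
/*
 * Runs after io_u buffers are allocated: seed the iovec table, create the
 * ring, clear the SQEs, and register files if needed.
 */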
static int fio_ioring_post_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_u *io_u;
	int err, i;

	for (i = 0; i < td->o.iodepth; i++) {
		struct iovec *iov = &ld->iovecs[i];

		io_u = ld->io_u_index[i];
		iov->iov_base = io_u->buf;
		iov->iov_len = td_max_bs(td);
	}

	err = fio_ioring_queue_init(td);
	if (err) {
		int init_err = errno;

		if (init_err == ENOSYS)
			log_err("fio: your kernel doesn't support io_uring\n");
		td_verror(td, init_err, "io_queue_init");
		return 1;
	}

	for (i = 0; i < td->o.iodepth; i++) {
		struct io_uring_sqe *sqe;

		sqe = &ld->sqes[i];
		memset(sqe, 0, sizeof(*sqe));
	}

	if (o->registerfiles) {
		err = fio_ioring_register_files(td);
		if (err) {
			td_verror(td, errno, "ioring_register_files");
			return 1;
		}
	}

	return 0;
}
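
/*
 * Per-thread setup: sqthread_poll implies registered files, the ring size is
 * rounded up to a power of two (the requested depth is kept in ld->iodepth),
 * and the cmdprio settings are validated.
 */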
static int fio_ioring_init(struct thread_data *td)
{
	struct ioring_options *o = td->eo;
	struct ioring_data *ld;
	struct cmdprio *cmdprio = &o->cmdprio;
	int ret;

	/* sqthread submission requires registered files */
	if (o->sqpoll_thread)
		o->registerfiles = 1;

	if (o->registerfiles && td->o.nr_files != td->o.open_files) {
		log_err("fio: io_uring registered files require nr_files to "
			"be identical to open_files\n");
		return 1;
	}

	ld = calloc(1, sizeof(*ld));

	/* ring depth must be a power-of-2 */
	ld->iodepth = td->o.iodepth;
	td->o.iodepth = roundup_pow2(td->o.iodepth);

	ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
	ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));

	td->io_ops_data = ld;

	ret = fio_cmdprio_init(td, cmdprio, &ld->use_cmdprio);
	if (ret) {
		td_verror(td, EINVAL, "fio_ioring_init");
		return 1;
	}

	if (fio_option_is_set(&td->o, ioprio_class))
		ld->ioprio_class_set = true;
	if (fio_option_is_set(&td->o, ioprio))
		ld->ioprio_set = true;

	return 0;
}

static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;

	ld->io_u_index[io_u->index] = io_u;
	return 0;
}

static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;

	if (!ld || !o->registerfiles)
		return generic_open_file(td, f);

	f->fd = ld->fds[f->engine_pos];
	return 0;
}

static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;

	if (!ld || !o->registerfiles)
		return generic_close_file(td, f);

	f->fd = -1;
	return 0;
}
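
/*
 * Engine registration. FIO_ASYNCIO_SYNC_TRIM is set because trims are
 * completed synchronously in the queue path. A minimal example job using
 * this engine (illustrative values only):
 *
 *	[uring]
 *	ioengine=io_uring
 *	iodepth=32
 *	sqthread_poll=1
 *	registerfiles=1
 *	fixedbufs=1
 */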
static struct ioengine_ops ioengine = {
	.name			= "io_uring",
	.version		= FIO_IOOPS_VERSION,
	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
	.init			= fio_ioring_init,
	.post_init		= fio_ioring_post_init,
	.io_u_init		= fio_ioring_io_u_init,
	.prep			= fio_ioring_prep,
	.queue			= fio_ioring_queue,
	.commit			= fio_ioring_commit,
	.getevents		= fio_ioring_getevents,
	.event			= fio_ioring_event,
	.cleanup		= fio_ioring_cleanup,
	.open_file		= fio_ioring_open_file,
	.close_file		= fio_ioring_close_file,
	.get_file_size		= generic_get_file_size,
	.options		= options,
	.option_struct_size	= sizeof(struct ioring_options),
};

static void fio_init fio_ioring_register(void)
{
	register_ioengine(&ioengine);
}

static void fio_exit fio_ioring_unregister(void)
{
	unregister_ioengine(&ioengine);
}
#endif