Rename t/aio-ring to t/io_uring
[fio.git] / engines / aioring.c
CommitLineData
52885fa2
JA
1/*
2 * aioring engine
3 *
a90cd050
JA
4 * IO engine using the new native Linux libaio ring interface. See:
5 *
6 * http://git.kernel.dk/cgit/linux-block/log/?h=aio-poll
52885fa2
JA
7 *
8 */
9#include <stdlib.h>
10#include <unistd.h>
11#include <errno.h>
12#include <libaio.h>
13#include <sys/time.h>
14#include <sys/resource.h>
15
16#include "../fio.h"
17#include "../lib/pow2.h"
18#include "../optgroup.h"
19#include "../lib/memalign.h"
b87aa01a 20#include "../lib/fls.h"
52885fa2 21
a30e63cf
JA
#ifdef ARCH_HAVE_AIORING

/*
 * io_uring_setup(2) flags, defined here in case the installed system
 * headers predate the interface.
 */
#ifndef IOCTX_FLAG_SCQRING
#define IOCTX_FLAG_SCQRING	(1 << 0)	/* use SQ/CQ rings */
#endif
#ifndef IOCTX_FLAG_IOPOLL
#define IOCTX_FLAG_IOPOLL	(1 << 1)	/* polled completions */
#endif
#ifndef IOCTX_FLAG_FIXEDBUFS
#define IOCTX_FLAG_FIXEDBUFS	(1 << 2)	/* pre-registered IO buffers */
#endif
#ifndef IOCTX_FLAG_SQTHREAD
#define IOCTX_FLAG_SQTHREAD	(1 << 3)	/* kernel SQ submission thread */
#endif
#ifndef IOCTX_FLAG_SQWQ
#define IOCTX_FLAG_SQWQ		(1 << 4)	/* kernel SQ workqueue */
#endif
#ifndef IOCTX_FLAG_SQPOLL
#define IOCTX_FLAG_SQPOLL	(1 << 5)	/* kernel side SQ polling */
#endif

/* mmap(2) offsets for the SQ ring, the CQ ring, and the iocb array */
#define IORING_OFF_SQ_RING	0ULL
#define IORING_OFF_CQ_RING	0x8000000ULL
#define IORING_OFF_IOCB		0x10000000ULL

/*
 * io_uring_enter(2) flags
 */
#ifndef IORING_ENTER_GETEVENTS
#define IORING_ENTER_GETEVENTS	(1 << 0)
#endif

typedef uint64_t u64;
typedef uint32_t u32;
typedef uint16_t u16;

/* SQ ring flag: the kernel SQ thread went idle and needs a wakeup */
#define IORING_SQ_NEED_WAKEUP	(1 << 0)

/* io_event->res2 flag: completion was served from the page cache */
#define IOEV_RES2_CACHEHIT	(1 << 0)

/* field offsets within the IORING_OFF_SQ_RING mmap, filled in by the kernel */
struct aio_sqring_offsets {
	u32 head;
	u32 tail;
	u32 ring_mask;
	u32 ring_entries;
	u32 flags;
	u32 dropped;
	u32 array;		/* offset of the iocb-index array */
	u32 resv[3];
};

/* field offsets within the IORING_OFF_CQ_RING mmap, filled in by the kernel */
struct aio_cqring_offsets {
	u32 head;
	u32 tail;
	u32 ring_mask;
	u32 ring_entries;
	u32 overflow;
	u32 events;		/* offset of the io_event array */
	u32 resv[4];
};

/*
 * Parameters passed to io_uring_setup(2); the kernel fills in the
 * entry counts and the two offset structures on return.
 */
struct aio_uring_params {
	u32 sq_entries;
	u32 cq_entries;
	u32 flags;
	u16 sq_thread_cpu;	/* CPU to bind the SQ thread to (SQTHREAD) */
	u16 resv[9];
	struct aio_sqring_offsets sq_off;
	struct aio_cqring_offsets cq_off;
};

/* resolved pointers into the mmap'ed SQ ring (see fio_aioring_mmap()) */
struct aio_sq_ring {
	u32 *head;
	u32 *tail;
	u32 *ring_mask;
	u32 *ring_entries;
	u32 *flags;
	u32 *array;		/* per-slot indices into the iocb array */
};

/* resolved pointers into the mmap'ed CQ ring */
struct aio_cq_ring {
	u32 *head;
	u32 *tail;
	u32 *ring_mask;
	u32 *ring_entries;
	struct io_event *events;
};

/* one mmap'ed region, recorded so it can be munmap'ed at cleanup */
struct aioring_mmap {
	void *ptr;
	size_t len;
};

/* per-thread engine state */
struct aioring_data {
	int ring_fd;		/* fd returned by io_uring_setup(2) */

	struct io_u **io_us;
	struct io_u **io_u_index;	/* io_u lookup table, by io_u->index */

	struct aio_sq_ring sq_ring;
	struct iocb *iocbs;	/* mmap'ed iocb array, shared with the kernel */
	struct iovec *iovecs;	/* registered buffers for FIXEDBUFS */
	unsigned sq_ring_mask;

	struct aio_cq_ring cq_ring;
	struct io_event *events;
	unsigned cq_ring_mask;

	int queued;		/* entries placed in the SQ ring, not yet submitted */
	int cq_ring_off;	/* CQ head snapshot taken at the start of a reap */
	unsigned iodepth;	/* requested depth, before power-of-2 roundup */

	uint64_t cachehit;	/* reads flagged IOEV_RES2_CACHEHIT */
	uint64_t cachemiss;

	struct aioring_mmap mmap[3];	/* SQ ring, iocb array, CQ ring */
};

/* engine-private job options */
struct aioring_options {
	void *pad;		/* keeps option offsets non-zero (fio convention) */
	unsigned int hipri;
	unsigned int fixedbufs;
	unsigned int sqthread;		/* CPU for the kernel SQ thread */
	unsigned int sqthread_set;	/* set when "sqthread" was given explicitly */
	unsigned int sqthread_poll;
	unsigned int sqwq;
};

a90cd050
JA
153static int fio_aioring_sqthread_cb(void *data,
154 unsigned long long *val)
155{
156 struct aioring_options *o = data;
157
158 o->sqthread = *val;
159 o->sqthread_set = 1;
160 return 0;
161}
162
/* engine-specific job file options */
static struct fio_option options[] = {
	{
		.name	= "hipri",
		.lname	= "High Priority",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct aioring_options, hipri),
		.help	= "Use polled IO completions",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		.name	= "fixedbufs",
		.lname	= "Fixed (pre-mapped) IO buffers",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct aioring_options, fixedbufs),
		.help	= "Pre map IO buffers",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		/* stored via callback so we can tell "set" apart from CPU 0 */
		.name	= "sqthread",
		.lname	= "Use kernel SQ thread on this CPU",
		.type	= FIO_OPT_INT,
		.cb	= fio_aioring_sqthread_cb,
		.help	= "Offload submission to kernel thread",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		.name	= "sqthread_poll",
		.lname	= "Kernel SQ thread should poll",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct aioring_options, sqthread_poll),
		.help	= "Used with sqthread, enables kernel side polling",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		.name	= "sqwq",
		.lname	= "Offload submission to kernel workqueue",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct aioring_options, sqwq),
		.help	= "Offload submission to kernel workqueue",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBAIO,
	},
	{
		.name	= NULL,
	},
};

9a2d78b3 214static int io_uring_enter(struct aioring_data *ld, unsigned int to_submit,
52885fa2
JA
215 unsigned int min_complete, unsigned int flags)
216{
9a2d78b3
JA
217 return syscall(__NR_sys_io_uring_enter, ld->ring_fd, to_submit,
218 min_complete, flags);
52885fa2
JA
219}
220
221static int fio_aioring_prep(struct thread_data *td, struct io_u *io_u)
222{
223 struct aioring_data *ld = td->io_ops_data;
224 struct fio_file *f = io_u->file;
52885fa2
JA
225 struct iocb *iocb;
226
227 iocb = &ld->iocbs[io_u->index];
228
e3970057
JA
229 if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
230 if (io_u->ddir == DDIR_READ)
52885fa2 231 iocb->aio_lio_opcode = IO_CMD_PREAD;
e3970057 232 else
52885fa2 233 iocb->aio_lio_opcode = IO_CMD_PWRITE;
e3970057
JA
234 iocb->aio_reqprio = 0;
235 iocb->aio_fildes = f->fd;
236 iocb->u.c.buf = io_u->xfer_buf;
237 iocb->u.c.nbytes = io_u->xfer_buflen;
238 iocb->u.c.offset = io_u->offset;
ac4f3d4e 239 iocb->u.c.flags = 0;
52885fa2
JA
240 } else if (ddir_sync(io_u->ddir))
241 io_prep_fsync(iocb, f->fd);
242
243 iocb->data = io_u;
244 return 0;
245}
246
247static struct io_u *fio_aioring_event(struct thread_data *td, int event)
248{
249 struct aioring_data *ld = td->io_ops_data;
250 struct io_event *ev;
251 struct io_u *io_u;
b87aa01a 252 unsigned index;
52885fa2 253
b87aa01a 254 index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
52885fa2 255
9a2d78b3 256 ev = &ld->cq_ring.events[index];
52885fa2
JA
257 io_u = ev->data;
258
259 if (ev->res != io_u->xfer_buflen) {
260 if (ev->res > io_u->xfer_buflen)
261 io_u->error = -ev->res;
262 else
263 io_u->resid = io_u->xfer_buflen - ev->res;
264 } else
265 io_u->error = 0;
266
96563db9
JA
267 if (io_u->ddir == DDIR_READ) {
268 if (ev->res2 & IOEV_RES2_CACHEHIT)
269 ld->cachehit++;
270 else
271 ld->cachemiss++;
272 }
273
52885fa2
JA
274 return io_u;
275}
276
277static int fio_aioring_cqring_reap(struct thread_data *td, unsigned int events,
278 unsigned int max)
279{
280 struct aioring_data *ld = td->io_ops_data;
9a2d78b3 281 struct aio_cq_ring *ring = &ld->cq_ring;
52885fa2
JA
282 u32 head, reaped = 0;
283
9a2d78b3 284 head = *ring->head;
52885fa2
JA
285 do {
286 read_barrier();
9a2d78b3 287 if (head == *ring->tail)
52885fa2
JA
288 break;
289 reaped++;
290 head++;
52885fa2
JA
291 } while (reaped + events < max);
292
9a2d78b3 293 *ring->head = head;
52885fa2
JA
294 write_barrier();
295 return reaped;
296}
297
/*
 * Wait for between 'min' and 'max' completions. The timespec argument
 * is accepted but not honored (no timeout support here). If the kernel
 * is doing SQ-side polling, we only reap the CQ ring; otherwise we use
 * io_uring_enter(2) with GETEVENTS to wait for more completions.
 */
static int fio_aioring_getevents(struct thread_data *td, unsigned int min,
				 unsigned int max, const struct timespec *t)
{
	struct aioring_data *ld = td->io_ops_data;
	/* with batch_complete_min=0, poll rather than block for events */
	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	struct aioring_options *o = td->eo;
	struct aio_cq_ring *ring = &ld->cq_ring;
	unsigned events = 0;
	int r;

	/* event() indexes completions relative to this head snapshot */
	ld->cq_ring_off = *ring->head;
	do {
		r = fio_aioring_cqring_reap(td, events, max);
		if (r) {
			events += r;
			continue;
		}

		if (!o->sqthread_poll) {
			r = io_uring_enter(ld, 0, actual_min,
						IORING_ENTER_GETEVENTS);
			if (r < 0) {
				if (errno == EAGAIN)
					continue;
				td_verror(td, errno, "io_uring_enter");
				break;
			}
		}
	} while (events < min);

	return r < 0 ? r : events;
}

/*
 * Place one io_u into the SQ ring. Trims are executed synchronously
 * (and only with an otherwise empty queue, to preserve ordering).
 * Actual submission to the kernel happens later, in commit.
 */
static enum fio_q_status fio_aioring_queue(struct thread_data *td,
					   struct io_u *io_u)
{
	struct aioring_data *ld = td->io_ops_data;
	struct aio_sq_ring *ring = &ld->sq_ring;
	unsigned tail, next_tail;

	fio_ro_check(td, io_u);

	/* cap at the user's requested depth, not the rounded-up ring size */
	if (ld->queued == ld->iodepth)
		return FIO_Q_BUSY;

	if (io_u->ddir == DDIR_TRIM) {
		if (ld->queued)
			return FIO_Q_BUSY;

		do_io_u_trim(td, io_u);
		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		return FIO_Q_COMPLETED;
	}

	tail = *ring->tail;
	next_tail = tail + 1;
	read_barrier();
	if (next_tail == *ring->head)
		return FIO_Q_BUSY;

	/* store the iocb index, then publish the new tail to the kernel */
	ring->array[tail & ld->sq_ring_mask] = io_u->index;
	*ring->tail = next_tail;
	write_barrier();

	ld->queued++;
	return FIO_Q_QUEUED;
}

367static void fio_aioring_queued(struct thread_data *td, int start, int nr)
368{
369 struct aioring_data *ld = td->io_ops_data;
370 struct timespec now;
371
372 if (!fio_fill_issue_time(td))
373 return;
374
375 fio_gettime(&now, NULL);
376
377 while (nr--) {
9a2d78b3
JA
378 struct aio_sq_ring *ring = &ld->sq_ring;
379 int index = ring->array[start & ld->sq_ring_mask];
f8289afc 380 struct io_u *io_u = ld->io_u_index[index];
52885fa2
JA
381
382 memcpy(&io_u->issue_time, &now, sizeof(now));
383 io_u_queued(td, io_u);
384
385 start++;
52885fa2
JA
386 }
387}
388
/*
 * Submit everything queued in the SQ ring. With kernel-side SQ polling
 * there is nothing to submit; we only wake the SQ thread if it went
 * idle. Otherwise, loop on io_uring_enter(2) until all queued entries
 * have been accepted, reaping completions on EAGAIN to free up room.
 */
static int fio_aioring_commit(struct thread_data *td)
{
	struct aioring_data *ld = td->io_ops_data;
	struct aioring_options *o = td->eo;
	int ret;

	if (!ld->queued)
		return 0;

	/* kernel polls the SQ ring itself; just wake it up if needed */
	if (o->sqthread_poll) {
		struct aio_sq_ring *ring = &ld->sq_ring;

		if (*ring->flags & IORING_SQ_NEED_WAKEUP)
			io_uring_enter(ld, ld->queued, 0, 0);
		ld->queued = 0;
		return 0;
	}

	do {
		/* snapshot head so we can stamp issue times on what went in */
		unsigned start = *ld->sq_ring.head;
		long nr = ld->queued;

		ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
		if (ret > 0) {
			fio_aioring_queued(td, start, ret);
			io_u_mark_submit(td, ret);

			ld->queued -= ret;
			ret = 0;
		} else if (!ret) {
			io_u_mark_submit(td, ret);
			continue;
		} else {
			if (errno == EAGAIN) {
				/* ring full: make room by reaping completions */
				ret = fio_aioring_cqring_reap(td, 0, ld->queued);
				if (ret)
					continue;
				/* Shouldn't happen */
				usleep(1);
				continue;
			}
			td_verror(td, errno, "io_uring_enter submit");
			break;
		}
	} while (ld->queued);

	return ret;
}

9a2d78b3 439static void fio_aioring_unmap(struct aioring_data *ld)
52885fa2 440{
9a2d78b3 441 int i;
52885fa2 442
9a2d78b3
JA
443 for (i = 0; i < ARRAY_SIZE(ld->mmap); i++)
444 munmap(ld->mmap[i].ptr, ld->mmap[i].len);
445 close(ld->ring_fd);
b87aa01a
JA
446}
447
52885fa2
JA
448static void fio_aioring_cleanup(struct thread_data *td)
449{
450 struct aioring_data *ld = td->io_ops_data;
451
452 if (ld) {
96563db9
JA
453 td->ts.cachehit += ld->cachehit;
454 td->ts.cachemiss += ld->cachemiss;
455
52885fa2
JA
456 /*
457 * Work-around to avoid huge RCU stalls at exit time. If we
458 * don't do this here, then it'll be torn down by exit_aio().
459 * But for that case we can parallellize the freeing, thus
460 * speeding it up a lot.
461 */
462 if (!(td->flags & TD_F_CHILD))
9a2d78b3
JA
463 fio_aioring_unmap(ld);
464
52885fa2
JA
465 free(ld->io_u_index);
466 free(ld->io_us);
9a2d78b3 467 free(ld->iovecs);
52885fa2
JA
468 free(ld);
469 }
470}
471
9a2d78b3
JA
472static int fio_aioring_mmap(struct aioring_data *ld, struct aio_uring_params *p)
473{
474 struct aio_sq_ring *sring = &ld->sq_ring;
475 struct aio_cq_ring *cring = &ld->cq_ring;
476 void *ptr;
477
ac122fea 478 ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(u32);
9a2d78b3
JA
479 ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
480 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
481 IORING_OFF_SQ_RING);
482 ld->mmap[0].ptr = ptr;
483 sring->head = ptr + p->sq_off.head;
484 sring->tail = ptr + p->sq_off.tail;
485 sring->ring_mask = ptr + p->sq_off.ring_mask;
486 sring->ring_entries = ptr + p->sq_off.ring_entries;
487 sring->flags = ptr + p->sq_off.flags;
ac122fea 488 sring->array = ptr + p->sq_off.array;
9a2d78b3
JA
489 ld->sq_ring_mask = *sring->ring_mask;
490
491 ld->mmap[1].len = p->sq_entries * sizeof(struct iocb);
492 ld->iocbs = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
493 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
494 IORING_OFF_IOCB);
495 ld->mmap[1].ptr = ld->iocbs;
496
ac122fea 497 ld->mmap[2].len = p->cq_off.events +
9a2d78b3
JA
498 p->cq_entries * sizeof(struct io_event);
499 ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
500 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
501 IORING_OFF_CQ_RING);
502 ld->mmap[2].ptr = ptr;
503 cring->head = ptr + p->cq_off.head;
504 cring->tail = ptr + p->cq_off.tail;
505 cring->ring_mask = ptr + p->cq_off.ring_mask;
506 cring->ring_entries = ptr + p->cq_off.ring_entries;
ac122fea 507 cring->events = ptr + p->cq_off.events;
9a2d78b3
JA
508 ld->cq_ring_mask = *cring->ring_mask;
509 return 0;
510}
511
/*
 * Create the ring via io_uring_setup(2), translating the job options
 * into IOCTX_FLAG_* setup flags, then mmap the shared rings. Returns
 * 0 on success, negative on syscall failure (errno set), or the
 * result of fio_aioring_mmap().
 */
static int fio_aioring_queue_init(struct thread_data *td)
{
	struct aioring_data *ld = td->io_ops_data;
	struct aioring_options *o = td->eo;
	int depth = td->o.iodepth;
	struct aio_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));
	p.flags = IOCTX_FLAG_SCQRING;

	if (o->hipri)
		p.flags |= IOCTX_FLAG_IOPOLL;
	if (o->sqthread_set) {
		p.sq_thread_cpu = o->sqthread;
		p.flags |= IOCTX_FLAG_SQTHREAD;
		if (o->sqthread_poll)
			p.flags |= IOCTX_FLAG_SQPOLL;
	}
	if (o->sqwq)
		p.flags |= IOCTX_FLAG_SQWQ;

	if (o->fixedbufs) {
		/*
		 * Registered buffers are accounted against the memlock
		 * rlimit; raise it first (best effort, failure shows up
		 * in the setup syscall instead).
		 */
		struct rlimit rlim = {
			.rlim_cur = RLIM_INFINITY,
			.rlim_max = RLIM_INFINITY,
		};

		setrlimit(RLIMIT_MEMLOCK, &rlim);
		p.flags |= IOCTX_FLAG_FIXEDBUFS;
	}

	/* ld->iovecs describes the fixed buffers, if enabled */
	ret = syscall(__NR_sys_io_uring_setup, depth, ld->iovecs, &p);
	if (ret < 0)
		return ret;

	ld->ring_fd = ret;
	return fio_aioring_mmap(ld, &p);
}

552static int fio_aioring_post_init(struct thread_data *td)
553{
554 struct aioring_data *ld = td->io_ops_data;
555 struct aioring_options *o = td->eo;
556 struct io_u *io_u;
9a2d78b3 557 int err;
52885fa2
JA
558
559 if (o->fixedbufs) {
560 int i;
561
562 for (i = 0; i < td->o.iodepth; i++) {
9a2d78b3
JA
563 struct iovec *iov = &ld->iovecs[i];
564
52885fa2 565 io_u = ld->io_u_index[i];
9a2d78b3
JA
566 iov->iov_base = io_u->buf;
567 iov->iov_len = td_max_bs(td);
52885fa2
JA
568 }
569 }
570
571 err = fio_aioring_queue_init(td);
572 if (err) {
d63a472d 573 td_verror(td, errno, "io_queue_init");
52885fa2
JA
574 return 1;
575 }
576
577 return 0;
578}
579
/*
 * Round 'depth' up to the nearest power of 2 (returns 1 for
 * depth <= 1). Uses a plain doubling loop instead of the bit-scan
 * helper from lib/fls.h: equivalent for every valid depth, but with
 * no reliance on __fls() semantics and no out-of-range shift if a
 * zero depth ever sneaks through.
 */
static unsigned roundup_pow2(unsigned depth)
{
	unsigned ret = 1;

	while (ret < depth)
		ret <<= 1;

	return ret;
}

52885fa2
JA
585static int fio_aioring_init(struct thread_data *td)
586{
587 struct aioring_data *ld;
588
52885fa2
JA
589 ld = calloc(1, sizeof(*ld));
590
b87aa01a
JA
591 /* ring depth must be a power-of-2 */
592 ld->iodepth = td->o.iodepth;
593 td->o.iodepth = roundup_pow2(td->o.iodepth);
594
52885fa2
JA
595 /* io_u index */
596 ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
597 ld->io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
598
9a2d78b3 599 ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));
52885fa2
JA
600
601 td->io_ops_data = ld;
602 return 0;
603}
604
605static int fio_aioring_io_u_init(struct thread_data *td, struct io_u *io_u)
606{
607 struct aioring_data *ld = td->io_ops_data;
608
609 ld->io_u_index[io_u->index] = io_u;
610 return 0;
611}
612
/* engine ops table hooked up at registration time */
static struct ioengine_ops ioengine = {
	.name			= "aio-ring",
	.version		= FIO_IOOPS_VERSION,
	.init			= fio_aioring_init,
	.post_init		= fio_aioring_post_init,
	.io_u_init		= fio_aioring_io_u_init,
	.prep			= fio_aioring_prep,
	.queue			= fio_aioring_queue,
	.commit			= fio_aioring_commit,
	.getevents		= fio_aioring_getevents,
	.event			= fio_aioring_event,
	.cleanup		= fio_aioring_cleanup,
	.open_file		= generic_open_file,
	.close_file		= generic_close_file,
	.get_file_size		= generic_get_file_size,
	.options		= options,
	.option_struct_size	= sizeof(struct aioring_options),
};

/* register the engine with fio at program load time */
static void fio_init fio_aioring_register(void)
{
	register_ioengine(&ioengine);
}

/* unregister the engine at program exit time */
static void fio_exit fio_aioring_unregister(void)
{
	unregister_ioengine(&ioengine);
}
1f90e9bb 641#endif