libaio,io_uring: introduce cmdprio_class and cmdprio options
[fio.git] / engines / io_uring.c
CommitLineData
52885fa2 1/*
bffad86f 2 * io_uring engine
52885fa2 3 *
bffad86f 4 * IO engine using the new native Linux aio io_uring interface. See:
a90cd050 5 *
bffad86f 6 * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring
52885fa2
JA
7 *
8 */
9#include <stdlib.h>
10#include <unistd.h>
11#include <errno.h>
52885fa2
JA
12#include <sys/time.h>
13#include <sys/resource.h>
14
15#include "../fio.h"
16#include "../lib/pow2.h"
17#include "../optgroup.h"
18#include "../lib/memalign.h"
b87aa01a 19#include "../lib/fls.h"
6d975f2c 20#include "../lib/roundup.h"
52885fa2 21
bffad86f 22#ifdef ARCH_HAVE_IOURING
52885fa2 23
57fa61f0 24#include "../lib/types.h"
f3e769a4 25#include "../os/linux/io_uring.h"
e9f6567a 26#include "cmdprio.h"
9a2d78b3 27
/*
 * Userspace view of the kernel submission (SQ) ring: pointers into the
 * mmap()ed ring area set up by fio_ioring_mmap().
 */
struct io_sq_ring {
	unsigned *head;		/* consumed by the kernel */
	unsigned *tail;		/* advanced by us when queueing */
	unsigned *ring_mask;	/* size - 1, for index wrapping */
	unsigned *ring_entries;
	unsigned *flags;	/* e.g. IORING_SQ_NEED_WAKEUP for SQPOLL */
	unsigned *array;	/* indices into the SQE array */
};
36
/*
 * Userspace view of the kernel completion (CQ) ring; cqes points at the
 * mmap()ed completion entry array.
 */
struct io_cq_ring {
	unsigned *head;		/* advanced by us when reaping */
	unsigned *tail;		/* advanced by the kernel */
	unsigned *ring_mask;	/* size - 1, for index wrapping */
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};
44
/*
 * One mmap()ed region (pointer + length), kept so it can be munmap()ed
 * again in fio_ioring_unmap().
 */
struct ioring_mmap {
	void *ptr;
	size_t len;
};
49
/*
 * Per-thread engine state for the io_uring engine.
 */
struct ioring_data {
	int ring_fd;			/* fd from io_uring_setup(2) */

	struct io_u **io_u_index;	/* io_u lookup by io_u->index */

	int *fds;			/* fds for registered files, if used */

	struct io_sq_ring sq_ring;
	struct io_uring_sqe *sqes;	/* mmap()ed SQE array */
	struct iovec *iovecs;		/* one iovec per io_u, for vectored IO */
	unsigned sq_ring_mask;

	struct io_cq_ring cq_ring;
	unsigned cq_ring_mask;

	int queued;			/* SQEs queued but not yet submitted */
	int cq_ring_off;		/* CQ head snapshot taken in getevents */
	unsigned iodepth;		/* original iodepth, before pow2 roundup */
	bool ioprio_class_set;		/* job set ioprio_class explicitly */
	bool ioprio_set;		/* job set ioprio explicitly */
	int prepped;			/* counter for force_async batching */

	struct ioring_mmap mmap[3];	/* SQ ring, SQE array, CQ ring */

	bool use_cmdprio;		/* cmdprio percentages in effect */
};
76
/*
 * Engine-private option storage; offsets are referenced from options[].
 */
struct ioring_options {
	void *pad;			/* option offsets must be non-zero */
	unsigned int hipri;		/* use IORING_SETUP_IOPOLL */
	struct cmdprio cmdprio;		/* per-ddir priority settings */
	unsigned int fixedbufs;		/* register IO buffers with the ring */
	unsigned int registerfiles;	/* pre-open/register files */
	unsigned int sqpoll_thread;	/* kernel-side SQ polling */
	unsigned int sqpoll_set;	/* sqthread_poll_cpu was given */
	unsigned int sqpoll_cpu;	/* CPU to pin the SQ poll thread to */
	unsigned int nonvectored;	/* -1 = auto-probe, 0/1 = forced */
	unsigned int uncached;		/* RWF_UNCACHED for buffered IO */
	unsigned int nowait;		/* RWF_NOWAIT */
	unsigned int force_async;	/* set IOSQE_ASYNC every N requests */
};
91
/* Opcode lookup: [read/write][vectored/non-vectored]. */
static const int ddir_to_op[2][2] = {
	{ IORING_OP_READV, IORING_OP_READ },
	{ IORING_OP_WRITEV, IORING_OP_WRITE }
};

/* Opcode lookup for fixed (registered) buffers, indexed by read/write. */
static const int fixed_ddir_to_op[2] = {
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED
};
101
2ea53ca3 102static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val)
a90cd050 103{
bffad86f 104 struct ioring_options *o = data;
a90cd050 105
2ea53ca3
JA
106 o->sqpoll_cpu = *val;
107 o->sqpoll_set = 1;
a90cd050
JA
108 return 0;
109}
110
52885fa2
JA
/*
 * Engine option table. The cmdprio* options are only functional on
 * platforms with IO priority class support; elsewhere they are declared
 * FIO_OPT_UNSUPPORTED so job files still parse.
 */
static struct fio_option options[] = {
	{
		.name	= "hipri",
		.lname	= "High Priority",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, hipri),
		.help	= "Use polled IO completions",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
#ifdef FIO_HAVE_IOPRIO_CLASS
	{
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_INT,
		/* one offset per data direction: read, then write */
		.off1	= offsetof(struct ioring_options,
				   cmdprio.percentage[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				   cmdprio.percentage[DDIR_WRITE]),
		.minval	= 0,
		.maxval	= 100,
		.help	= "Send high priority I/O this percentage of the time",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "cmdprio_class",
		.lname	= "Asynchronous I/O priority class",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options,
				   cmdprio.class[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				   cmdprio.class[DDIR_WRITE]),
		.help	= "Set asynchronous IO priority class",
		/* + 1: class 0 ("none") is not a valid explicit choice */
		.minval	= IOPRIO_MIN_PRIO_CLASS + 1,
		.maxval	= IOPRIO_MAX_PRIO_CLASS,
		.interval = 1,
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "cmdprio",
		.lname	= "Asynchronous I/O priority level",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options,
				   cmdprio.level[DDIR_READ]),
		.off2	= offsetof(struct ioring_options,
				   cmdprio.level[DDIR_WRITE]),
		.help	= "Set asynchronous IO priority level",
		.minval	= IOPRIO_MIN_PRIO,
		.maxval	= IOPRIO_MAX_PRIO,
		.interval = 1,
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
#else
	{
		.name	= "cmdprio_percentage",
		.lname	= "high priority percentage",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
	{
		.name	= "cmdprio_class",
		.lname	= "Asynchronous I/O priority class",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
	{
		.name	= "cmdprio",
		.lname	= "Asynchronous I/O priority level",
		.type	= FIO_OPT_UNSUPPORTED,
		.help	= "Your platform does not support I/O priority classes",
	},
#endif
	{
		.name	= "fixedbufs",
		.lname	= "Fixed (pre-mapped) IO buffers",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, fixedbufs),
		.help	= "Pre map IO buffers",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "registerfiles",
		.lname	= "Register file set",
		.type	= FIO_OPT_STR_SET,
		.off1	= offsetof(struct ioring_options, registerfiles),
		.help	= "Pre-open/register files",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "sqthread_poll",
		.lname	= "Kernel SQ thread polling",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, sqpoll_thread),
		.help	= "Offload submission/completion to kernel thread",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "sqthread_poll_cpu",
		.lname	= "SQ Thread Poll CPU",
		.type	= FIO_OPT_INT,
		.cb	= fio_ioring_sqpoll_cb,
		.help	= "What CPU to run SQ thread polling on",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "nonvectored",
		.lname	= "Non-vectored",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, nonvectored),
		/* -1 means "auto": probe kernel support at init time */
		.def	= "-1",
		.help	= "Use non-vectored read/write commands",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "uncached",
		.lname	= "Uncached",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, uncached),
		.help	= "Use RWF_UNCACHED for buffered read/writes",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "nowait",
		.lname	= "RWF_NOWAIT",
		.type	= FIO_OPT_BOOL,
		.off1	= offsetof(struct ioring_options, nowait),
		.help	= "Use RWF_NOWAIT for reads/writes",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= "force_async",
		.lname	= "Force async",
		.type	= FIO_OPT_INT,
		.off1	= offsetof(struct ioring_options, force_async),
		.help	= "Set IOSQE_ASYNC every N requests",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_IOURING,
	},
	{
		.name	= NULL,
	},
};
263
/*
 * Thin wrapper around the io_uring_enter(2) syscall. No signal mask is
 * passed (NULL, 0). Returns the syscall result: >= 0 on success, -1 with
 * errno set on failure.
 */
static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
	return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
			min_complete, flags, NULL, 0);
}
270
/*
 * Fill in the SQE for this io_u. Each io_u owns a fixed SQE slot
 * (indexed by io_u->index); all SQEs were zeroed in post_init, and a
 * requeue re-runs this function on the same slot.
 */
static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct fio_file *f = io_u->file;
	struct io_uring_sqe *sqe;

	sqe = &ld->sqes[io_u->index];

	if (o->registerfiles) {
		/* registered files are referenced by their table slot */
		sqe->fd = f->engine_pos;
		sqe->flags = IOSQE_FIXED_FILE;
	} else {
		sqe->fd = f->fd;
		sqe->flags = 0;
	}

	if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
		if (o->fixedbufs) {
			/* pre-registered buffers: address + buf_index */
			sqe->opcode = fixed_ddir_to_op[io_u->ddir];
			sqe->addr = (unsigned long) io_u->xfer_buf;
			sqe->len = io_u->xfer_buflen;
			sqe->buf_index = io_u->index;
		} else {
			struct iovec *iov = &ld->iovecs[io_u->index];

			/*
			 * Update based on actual io_u, requeue could have
			 * adjusted these
			 */
			iov->iov_base = io_u->xfer_buf;
			iov->iov_len = io_u->xfer_buflen;

			sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored];
			if (o->nonvectored) {
				sqe->addr = (unsigned long) iov->iov_base;
				sqe->len = iov->iov_len;
			} else {
				/* vectored: addr is the iovec, len = count */
				sqe->addr = (unsigned long) iov;
				sqe->len = 1;
			}
		}
		sqe->rw_flags = 0;
		/* RWF_UNCACHED only makes sense for buffered IO */
		if (!td->o.odirect && o->uncached)
			sqe->rw_flags |= RWF_UNCACHED;
		if (o->nowait)
			sqe->rw_flags |= RWF_NOWAIT;
		/* 13 is the kernel's ioprio class shift (IOPRIO_CLASS_SHIFT) */
		if (ld->ioprio_class_set)
			sqe->ioprio = td->o.ioprio_class << 13;
		/*
		 * NOTE(review): with only ioprio set, |= relies on this
		 * slot's previous ioprio being 0 or the same value; when
		 * cmdprio is in use, fio_ioring_prio_prep() rewrites the
		 * field at queue time anyway.
		 */
		if (ld->ioprio_set)
			sqe->ioprio |= td->o.ioprio;
		sqe->off = io_u->offset;
	} else if (ddir_sync(io_u->ddir)) {
		sqe->ioprio = 0;
		if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
			sqe->off = f->first_write;
			sqe->len = f->last_write - f->first_write;
			sqe->sync_range_flags = td->o.sync_file_range;
			sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
		} else {
			sqe->off = 0;
			sqe->addr = 0;
			sqe->len = 0;
			if (io_u->ddir == DDIR_DATASYNC)
				sqe->fsync_flags |= IORING_FSYNC_DATASYNC;
			sqe->opcode = IORING_OP_FSYNC;
		}
	}

	/* every Nth request is forced to async execution */
	if (o->force_async && ++ld->prepped == o->force_async) {
		ld->prepped = 0;
		sqe->flags |= IOSQE_ASYNC;
	}

	/* user_data carries the io_u pointer back via the CQE */
	sqe->user_data = (unsigned long) io_u;
	return 0;
}
348
/*
 * Translate completion slot 'event' (relative to the CQ head snapshot
 * taken in getevents) back into its io_u, recording error/residual.
 */
static struct io_u *fio_ioring_event(struct thread_data *td, int event)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_uring_cqe *cqe;
	struct io_u *io_u;
	unsigned index;

	index = (event + ld->cq_ring_off) & ld->cq_ring_mask;

	cqe = &ld->cq_ring.cqes[index];
	io_u = (struct io_u *) (uintptr_t) cqe->user_data;

	if (cqe->res != io_u->xfer_buflen) {
		/*
		 * A negative res (kernel error, -errno) compares greater
		 * than the unsigned buflen after promotion, so it lands in
		 * the error branch; otherwise it's a short transfer.
		 */
		if (cqe->res > io_u->xfer_buflen)
			io_u->error = -cqe->res;
		else
			io_u->resid = io_u->xfer_buflen - cqe->res;
	} else
		io_u->error = 0;

	return io_u;
}
371
/*
 * Reap completions from the CQ ring until it is empty or until
 * 'events' (already reaped) plus the new count reaches 'max'.
 * Returns the number reaped this call; the head is published with
 * release semantics so the kernel may reuse the entries.
 */
static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events,
				  unsigned int max)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned head, reaped = 0;

	head = *ring->head;
	do {
		/* acquire pairs with the kernel's tail update */
		if (head == atomic_load_acquire(ring->tail))
			break;
		reaped++;
		head++;
	} while (reaped + events < max);

	if (reaped)
		atomic_store_release(ring->head, head);

	return reaped;
}
392
/*
 * Wait for at least 'min' completions (up to 'max'). With SQPOLL the
 * kernel reaps for us, so we just spin on the ring; otherwise we block
 * in io_uring_enter() with IORING_ENTER_GETEVENTS. The timespec 't' is
 * unused here.
 */
static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
				unsigned int max, const struct timespec *t)
{
	struct ioring_data *ld = td->io_ops_data;
	/* only honor min if batch-complete-min was requested */
	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	struct ioring_options *o = td->eo;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned events = 0;
	int r;

	/* snapshot the CQ head so event() can index relative to it */
	ld->cq_ring_off = *ring->head;
	do {
		r = fio_ioring_cqring_reap(td, events, max);
		if (r) {
			events += r;
			/*
			 * Track how many we still need; if r overshoots,
			 * events >= min and the loop exits before the
			 * wrapped value is ever used.
			 */
			if (actual_min != 0)
				actual_min -= r;
			continue;
		}

		if (!o->sqpoll_thread) {
			r = io_uring_enter(ld, 0, actual_min,
						IORING_ENTER_GETEVENTS);
			if (r < 0) {
				if (errno == EAGAIN || errno == EINTR)
					continue;
				td_verror(td, errno, "io_uring_enter");
				break;
			}
		}
	} while (events < min);

	return r < 0 ? r : events;
}
427
/*
 * cmdprio support: with probability cmdprio.percentage[ddir], stamp the
 * SQE with the configured class/level and mark the io_u as priority IO;
 * otherwise clear the SQE's ioprio (the slot may hold a stale value from
 * a previous use).
 */
static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_options *o = td->eo;
	struct ioring_data *ld = td->io_ops_data;
	struct io_uring_sqe *sqe = &ld->sqes[io_u->index];
	struct cmdprio *cmdprio = &o->cmdprio;
	enum fio_ddir ddir = io_u->ddir;
	unsigned int p = cmdprio->percentage[ddir];

	if (p && rand_between(&td->prio_state, 0, 99) < p) {
		sqe->ioprio =
			ioprio_value(cmdprio->class[ddir], cmdprio->level[ddir]);
		io_u->flags |= IO_U_F_PRIORITY;
	} else {
		sqe->ioprio = 0;
	}
}
445
/*
 * Queue one io_u onto the SQ ring (submission happens in commit()).
 * TRIM is executed synchronously here, and only with an empty ring.
 */
static enum fio_q_status fio_ioring_queue(struct thread_data *td,
					  struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_sq_ring *ring = &ld->sq_ring;
	unsigned tail, next_tail;

	fio_ro_check(td, io_u);

	/* compare against the original iodepth, not the rounded-up ring size */
	if (ld->queued == ld->iodepth)
		return FIO_Q_BUSY;

	if (io_u->ddir == DDIR_TRIM) {
		/* sync trim: drain pending ring entries first */
		if (ld->queued)
			return FIO_Q_BUSY;

		do_io_u_trim(td, io_u);
		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		return FIO_Q_COMPLETED;
	}

	tail = *ring->tail;
	next_tail = tail + 1;
	/* ring full if the next tail would collide with the head */
	if (next_tail == atomic_load_acquire(ring->head))
		return FIO_Q_BUSY;

	if (ld->use_cmdprio)
		fio_ioring_prio_prep(td, io_u);
	ring->array[tail & ld->sq_ring_mask] = io_u->index;
	/* publish the new tail after the array slot is written */
	atomic_store_release(ring->tail, next_tail);

	ld->queued++;
	return FIO_Q_QUEUED;
}
481
/*
 * Record issue time for 'nr' entries just submitted, starting at SQ ring
 * position 'start'. One timestamp is taken for the whole batch.
 */
static void fio_ioring_queued(struct thread_data *td, int start, int nr)
{
	struct ioring_data *ld = td->io_ops_data;
	struct timespec now;

	if (!fio_fill_issue_time(td))
		return;

	fio_gettime(&now, NULL);

	while (nr--) {
		struct io_sq_ring *ring = &ld->sq_ring;
		int index = ring->array[start & ld->sq_ring_mask];
		struct io_u *io_u = ld->io_u_index[index];

		memcpy(&io_u->issue_time, &now, sizeof(now));
		io_u_queued(td, io_u);

		start++;
	}
}
503
/*
 * Submit everything queued on the SQ ring. With SQPOLL the kernel
 * consumes the ring itself and we only kick it when it flagged
 * NEED_WAKEUP; otherwise loop on io_uring_enter() until all queued
 * entries are accepted. Returns 0 on success, negative on error.
 */
static int fio_ioring_commit(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int ret;

	if (!ld->queued)
		return 0;

	/*
	 * Kernel side does submission. just need to check if the ring is
	 * flagged as needing a kick, if so, call io_uring_enter(). This
	 * only happens if we've been idle too long.
	 */
	if (o->sqpoll_thread) {
		struct io_sq_ring *ring = &ld->sq_ring;
		unsigned flags;

		flags = atomic_load_acquire(ring->flags);
		if (flags & IORING_SQ_NEED_WAKEUP)
			io_uring_enter(ld, ld->queued, 0,
					IORING_ENTER_SQ_WAKEUP);
		ld->queued = 0;
		return 0;
	}

	do {
		/* remember where this batch starts, for timestamping */
		unsigned start = *ld->sq_ring.head;
		long nr = ld->queued;

		ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
		if (ret > 0) {
			fio_ioring_queued(td, start, ret);
			io_u_mark_submit(td, ret);

			ld->queued -= ret;
			ret = 0;
		} else if (!ret) {
			/* nothing accepted; record and retry */
			io_u_mark_submit(td, ret);
			continue;
		} else {
			if (errno == EAGAIN || errno == EINTR) {
				/* make room by reaping completions */
				ret = fio_ioring_cqring_reap(td, 0, ld->queued);
				if (ret)
					continue;
				/* Shouldn't happen */
				usleep(1);
				continue;
			}
			td_verror(td, errno, "io_uring_enter submit");
			break;
		}
	} while (ld->queued);

	return ret;
}
560
bffad86f 561static void fio_ioring_unmap(struct ioring_data *ld)
52885fa2 562{
9a2d78b3 563 int i;
52885fa2 564
59f94d26 565 for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
9a2d78b3
JA
566 munmap(ld->mmap[i].ptr, ld->mmap[i].len);
567 close(ld->ring_fd);
b87aa01a
JA
568}
569
bffad86f 570static void fio_ioring_cleanup(struct thread_data *td)
52885fa2 571{
bffad86f 572 struct ioring_data *ld = td->io_ops_data;
52885fa2
JA
573
574 if (ld) {
52885fa2 575 if (!(td->flags & TD_F_CHILD))
bffad86f 576 fio_ioring_unmap(ld);
9a2d78b3 577
52885fa2 578 free(ld->io_u_index);
9a2d78b3 579 free(ld->iovecs);
5ffd5626 580 free(ld->fds);
52885fa2
JA
581 free(ld);
582 }
583}
584
bffad86f 585static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p)
9a2d78b3 586{
bffad86f
JA
587 struct io_sq_ring *sring = &ld->sq_ring;
588 struct io_cq_ring *cring = &ld->cq_ring;
9a2d78b3
JA
589 void *ptr;
590
e2239016 591 ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32);
9a2d78b3
JA
592 ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
593 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
594 IORING_OFF_SQ_RING);
595 ld->mmap[0].ptr = ptr;
596 sring->head = ptr + p->sq_off.head;
597 sring->tail = ptr + p->sq_off.tail;
598 sring->ring_mask = ptr + p->sq_off.ring_mask;
599 sring->ring_entries = ptr + p->sq_off.ring_entries;
600 sring->flags = ptr + p->sq_off.flags;
ac122fea 601 sring->array = ptr + p->sq_off.array;
9a2d78b3
JA
602 ld->sq_ring_mask = *sring->ring_mask;
603
f0403f94
JA
604 ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
605 ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
9a2d78b3 606 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
f0403f94
JA
607 IORING_OFF_SQES);
608 ld->mmap[1].ptr = ld->sqes;
9a2d78b3 609
f0403f94
JA
610 ld->mmap[2].len = p->cq_off.cqes +
611 p->cq_entries * sizeof(struct io_uring_cqe);
9a2d78b3
JA
612 ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
613 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
614 IORING_OFF_CQ_RING);
615 ld->mmap[2].ptr = ptr;
616 cring->head = ptr + p->cq_off.head;
617 cring->tail = ptr + p->cq_off.tail;
618 cring->ring_mask = ptr + p->cq_off.ring_mask;
619 cring->ring_entries = ptr + p->cq_off.ring_entries;
f0403f94 620 cring->cqes = ptr + p->cq_off.cqes;
9a2d78b3
JA
621 ld->cq_ring_mask = *cring->ring_mask;
622 return 0;
623}
624
556d8415
JA
625static void fio_ioring_probe(struct thread_data *td)
626{
627 struct ioring_data *ld = td->io_ops_data;
628 struct ioring_options *o = td->eo;
629 struct io_uring_probe *p;
630 int ret;
631
632 /* already set by user, don't touch */
633 if (o->nonvectored != -1)
634 return;
635
636 /* default to off, as that's always safe */
637 o->nonvectored = 0;
638
639 p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
640 if (!p)
641 return;
642
643 memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
644 ret = syscall(__NR_io_uring_register, ld->ring_fd,
645 IORING_REGISTER_PROBE, p, 256);
646 if (ret < 0)
647 goto out;
648
649 if (IORING_OP_WRITE > p->ops_len)
650 goto out;
651
652 if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED) &&
653 (p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED))
654 o->nonvectored = 1;
655out:
656 free(p);
657}
658
/*
 * Create the ring via io_uring_setup(2) using the job's options, probe
 * opcode support, optionally register fixed buffers, and mmap the ring
 * regions. Returns 0 on success, negative with errno set on failure.
 */
static int fio_ioring_queue_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int depth = td->o.iodepth;
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));

	if (o->hipri)
		p.flags |= IORING_SETUP_IOPOLL;
	if (o->sqpoll_thread) {
		p.flags |= IORING_SETUP_SQPOLL;
		/* pin the kernel SQ thread if a CPU was requested */
		if (o->sqpoll_set) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = o->sqpoll_cpu;
		}
	}

	ret = syscall(__NR_io_uring_setup, depth, &p);
	if (ret < 0)
		return ret;

	ld->ring_fd = ret;

	/* resolve nonvectored=-1 (auto) now that we have a ring fd */
	fio_ioring_probe(td);

	if (o->fixedbufs) {
		ret = syscall(__NR_io_uring_register, ld->ring_fd,
				IORING_REGISTER_BUFFERS, ld->iovecs, depth);
		if (ret < 0)
			return ret;
	}

	return fio_ioring_mmap(ld, &p);
}
696
5ffd5626
JA
697static int fio_ioring_register_files(struct thread_data *td)
698{
699 struct ioring_data *ld = td->io_ops_data;
700 struct fio_file *f;
701 unsigned int i;
702 int ret;
703
704 ld->fds = calloc(td->o.nr_files, sizeof(int));
705
706 for_each_file(td, f, i) {
707 ret = generic_open_file(td, f);
708 if (ret)
709 goto err;
710 ld->fds[i] = f->fd;
711 f->engine_pos = i;
712 }
713
bfed648c 714 ret = syscall(__NR_io_uring_register, ld->ring_fd,
5ffd5626
JA
715 IORING_REGISTER_FILES, ld->fds, td->o.nr_files);
716 if (ret) {
717err:
718 free(ld->fds);
719 ld->fds = NULL;
720 }
721
722 /*
723 * Pretend the file is closed again, and really close it if we hit
724 * an error.
725 */
726 for_each_file(td, f, i) {
727 if (ret) {
728 int fio_unused ret2;
729 ret2 = generic_close_file(td, f);
730 } else
731 f->fd = -1;
732 }
733
734 return ret;
735}
736
/*
 * Runs after all io_us are allocated: seed the per-io_u iovecs, create
 * and map the ring, zero all SQEs, and optionally register files.
 * Returns 0 on success, 1 (with td error set) on failure.
 */
static int fio_ioring_post_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_u *io_u;
	int err, i;

	for (i = 0; i < td->o.iodepth; i++) {
		struct iovec *iov = &ld->iovecs[i];

		io_u = ld->io_u_index[i];
		iov->iov_base = io_u->buf;
		/* max size now; prep() trims it per-IO */
		iov->iov_len = td_max_bs(td);
	}

	err = fio_ioring_queue_init(td);
	if (err) {
		/* save errno before log_err() can clobber it */
		int init_err = errno;

		if (init_err == ENOSYS)
			log_err("fio: your kernel doesn't support io_uring\n");
		td_verror(td, init_err, "io_queue_init");
		return 1;
	}

	/* start every SQE slot from a known-zero state */
	for (i = 0; i < td->o.iodepth; i++) {
		struct io_uring_sqe *sqe;

		sqe = &ld->sqes[i];
		memset(sqe, 0, sizeof(*sqe));
	}

	if (o->registerfiles) {
		err = fio_ioring_register_files(td);
		if (err) {
			td_verror(td, errno, "ioring_register_files");
			return 1;
		}
	}

	return 0;
}
779
/*
 * First-stage init: validate option combinations, allocate per-thread
 * state, and set up cmdprio/ioprio bookkeeping. The ring itself is
 * created later in post_init. Returns 0 on success, 1 on error.
 */
static int fio_ioring_init(struct thread_data *td)
{
	struct ioring_options *o = td->eo;
	struct ioring_data *ld;
	struct cmdprio *cmdprio = &o->cmdprio;
	int ret;

	/* sqthread submission requires registered files */
	if (o->sqpoll_thread)
		o->registerfiles = 1;

	if (o->registerfiles && td->o.nr_files != td->o.open_files) {
		log_err("fio: io_uring registered files require nr_files to "
			"be identical to open_files\n");
		return 1;
	}

	/* NOTE(review): calloc results below are unchecked — OOM aborts later */
	ld = calloc(1, sizeof(*ld));

	/* ring depth must be a power-of-2 */
	ld->iodepth = td->o.iodepth;
	td->o.iodepth = roundup_pow2(td->o.iodepth);

	/* io_u index */
	ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
	ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));

	td->io_ops_data = ld;

	/* validates cmdprio options and sets ld->use_cmdprio */
	ret = fio_cmdprio_init(td, cmdprio, &ld->use_cmdprio);
	if (ret) {
		td_verror(td, EINVAL, "fio_ioring_init");
		return 1;
	}

	if (fio_option_is_set(&td->o, ioprio_class))
		ld->ioprio_class_set = true;
	if (fio_option_is_set(&td->o, ioprio))
		ld->ioprio_set = true;

	return 0;
}
822
bffad86f 823static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
52885fa2 824{
bffad86f 825 struct ioring_data *ld = td->io_ops_data;
52885fa2
JA
826
827 ld->io_u_index[io_u->index] = io_u;
828 return 0;
829}
830
5ffd5626
JA
831static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
832{
833 struct ioring_data *ld = td->io_ops_data;
834 struct ioring_options *o = td->eo;
835
17318cf6 836 if (!ld || !o->registerfiles)
5ffd5626
JA
837 return generic_open_file(td, f);
838
839 f->fd = ld->fds[f->engine_pos];
840 return 0;
841}
842
843static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
844{
17318cf6 845 struct ioring_data *ld = td->io_ops_data;
5ffd5626
JA
846 struct ioring_options *o = td->eo;
847
17318cf6 848 if (!ld || !o->registerfiles)
5ffd5626
JA
849 return generic_close_file(td, f);
850
851 f->fd = -1;
852 return 0;
853}
854
/*
 * Engine ops table. FIO_ASYNCIO_SYNC_TRIM: trims are completed inline in
 * queue(); FIO_NO_OFFLOAD: the engine is incompatible with the io_submit
 * offload mode.
 */
static struct ioengine_ops ioengine = {
	.name			= "io_uring",
	.version		= FIO_IOOPS_VERSION,
	.flags			= FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
	.init			= fio_ioring_init,
	.post_init		= fio_ioring_post_init,
	.io_u_init		= fio_ioring_io_u_init,
	.prep			= fio_ioring_prep,
	.queue			= fio_ioring_queue,
	.commit			= fio_ioring_commit,
	.getevents		= fio_ioring_getevents,
	.event			= fio_ioring_event,
	.cleanup		= fio_ioring_cleanup,
	.open_file		= fio_ioring_open_file,
	.close_file		= fio_ioring_close_file,
	.get_file_size		= generic_get_file_size,
	.options		= options,
	.option_struct_size	= sizeof(struct ioring_options),
};
874
/* Constructor: register the engine with fio at load time. */
static void fio_init fio_ioring_register(void)
{
	register_ioengine(&ioengine);
}
879
/* Destructor: unregister the engine at unload time. */
static void fio_exit fio_ioring_unregister(void)
{
	unregister_ioengine(&ioengine);
}
1f90e9bb 884#endif