[fio.git] / engines / io_uring.c
1/*
2 * io_uring engine
3 *
4 * IO engine using the new native Linux aio io_uring interface. See:
5 *
6 * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring
7 *
8 */
9#include <stdlib.h>
10#include <unistd.h>
11#include <errno.h>
12#include <sys/time.h>
13#include <sys/resource.h>
14
15#include "../fio.h"
16#include "../lib/pow2.h"
17#include "../optgroup.h"
18#include "../lib/memalign.h"
19#include "../lib/fls.h"
20#include "../lib/roundup.h"
21#include "../verify.h"
22
23#ifdef ARCH_HAVE_IOURING
24
25#include "../lib/types.h"
26#include "../os/linux/io_uring.h"
27#include "cmdprio.h"
28#include "zbd.h"
29#include "nvme.h"
30
31#include <sys/stat.h>
32
33#ifndef IO_INTEGRITY_CHK_GUARD
34/* flags for integrity meta */
35#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
36#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */
37#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */
38#endif /* IO_INTEGRITY_CHK_GUARD */
39
40#ifndef FS_IOC_GETLBMD_CAP
41/* Protection info capability flags */
42#define LBMD_PI_CAP_INTEGRITY (1 << 0)
43#define LBMD_PI_CAP_REFTAG (1 << 1)
44
45/* Checksum types for Protection Information */
46#define LBMD_PI_CSUM_NONE 0
47#define LBMD_PI_CSUM_IP 1
48#define LBMD_PI_CSUM_CRC16_T10DIF 2
49#define LBMD_PI_CSUM_CRC64_NVME 4
50
51/*
52 * Logical block metadata capability descriptor
53 * If the device does not support metadata, all the fields will be zero.
54 * Applications must check lbmd_flags to determine whether metadata is
55 * supported or not.
56 */
57struct logical_block_metadata_cap {
58 /* Bitmask of logical block metadata capability flags */
59 __u32 lbmd_flags;
60 /*
61 * The amount of data described by each unit of logical block
62 * metadata
63 */
64 __u16 lbmd_interval;
65 /*
66 * Size in bytes of the logical block metadata associated with each
67 * interval
68 */
69 __u8 lbmd_size;
70 /*
71 * Size in bytes of the opaque block tag associated with each
72 * interval
73 */
74 __u8 lbmd_opaque_size;
75 /*
76 * Offset in bytes of the opaque block tag within the logical block
77 * metadata
78 */
79 __u8 lbmd_opaque_offset;
80 /* Size in bytes of the T10 PI tuple associated with each interval */
81 __u8 lbmd_pi_size;
82 /* Offset in bytes of T10 PI tuple within the logical block metadata */
83 __u8 lbmd_pi_offset;
84 /* T10 PI guard tag type */
85 __u8 lbmd_guard_tag_type;
86 /* Size in bytes of the T10 PI application tag */
87 __u8 lbmd_app_tag_size;
88 /* Size in bytes of the T10 PI reference tag */
89 __u8 lbmd_ref_tag_size;
90 /* Size in bytes of the T10 PI storage tag */
91 __u8 lbmd_storage_tag_size;
92 __u8 pad;
93};
94
95#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap)
96#endif /* FS_IOC_GETLBMD_CAP */
97
98enum uring_cmd_type {
99 FIO_URING_CMD_NVME = 1,
100};
101
102enum uring_cmd_write_mode {
103 FIO_URING_CMD_WMODE_WRITE = 1,
104 FIO_URING_CMD_WMODE_UNCOR,
105 FIO_URING_CMD_WMODE_ZEROES,
106 FIO_URING_CMD_WMODE_VERIFY,
107};
108
109enum uring_cmd_verify_mode {
110 FIO_URING_CMD_VMODE_READ = 1,
111 FIO_URING_CMD_VMODE_COMPARE,
112};
113
114struct io_sq_ring {
115 unsigned *head;
116 unsigned *tail;
117 unsigned *ring_mask;
118 unsigned *ring_entries;
119 unsigned *flags;
120 unsigned *array;
121};
122
123struct io_cq_ring {
124 unsigned *head;
125 unsigned *tail;
126 unsigned *ring_mask;
127 unsigned *ring_entries;
128 struct io_uring_cqe *cqes;
129};
130
131struct ioring_mmap {
132 void *ptr;
133 size_t len;
134};
135
136struct ioring_data {
137 int ring_fd;
138
139 struct io_u **io_u_index;
140 char *md_buf;
141 char *pi_attr;
142
143 int *fds;
144
145 struct io_sq_ring sq_ring;
146 struct io_uring_sqe *sqes;
147 struct iovec *iovecs;
148 unsigned sq_ring_mask;
149
150 struct io_cq_ring cq_ring;
151 unsigned cq_ring_mask;
152
153 int async_trim_fail;
154 int queued;
155 int cq_ring_off;
156 unsigned iodepth;
157 int prepped;
158
159 struct ioring_mmap mmap[3];
160
161 struct cmdprio cmdprio;
162
163 struct nvme_dsm *dsm;
164 uint32_t cdw12_flags[DDIR_RWDIR_CNT];
165 uint8_t write_opcode;
166
167 bool is_uring_cmd_eng;
168
169 struct nvme_cmd_ext_io_opts ext_opts;
170};
171
172struct ioring_options {
173 struct thread_data *td;
174 unsigned int hipri;
175 unsigned int readfua;
176 unsigned int writefua;
177 unsigned int deac;
178 unsigned int write_mode;
179 unsigned int verify_mode;
180 struct cmdprio_options cmdprio_options;
181 unsigned int fixedbufs;
182 unsigned int registerfiles;
183 unsigned int sqpoll_thread;
184 unsigned int sqpoll_set;
185 unsigned int sqpoll_cpu;
186 unsigned int nonvectored;
187 unsigned int uncached;
188 unsigned int nowait;
189 unsigned int force_async;
190 unsigned int md_per_io_size;
191 unsigned int pi_act;
192 unsigned int apptag;
193 unsigned int apptag_mask;
194 unsigned int prchk;
195 char *pi_chk;
196 enum uring_cmd_type cmd_type;
197};
198
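/*
 * Opcode lookup tables for the plain io_uring engine: ddir_to_op is
 * indexed by data direction and by whether non-vectored opcodes are in
 * use, fixed_ddir_to_op by data direction only (used with fixedbufs).
 */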
199static const int ddir_to_op[2][2] = {
200 { IORING_OP_READV, IORING_OP_READ },
201 { IORING_OP_WRITEV, IORING_OP_WRITE }
202};
203
204static const int fixed_ddir_to_op[2] = {
205 IORING_OP_READ_FIXED,
206 IORING_OP_WRITE_FIXED
207};
208
209static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val)
210{
211 struct ioring_options *o = data;
212
213 o->sqpoll_cpu = *val;
214 o->sqpoll_set = 1;
215 return 0;
216}
217
218static struct fio_option options[] = {
219 {
220 .name = "hipri",
221 .lname = "High Priority",
222 .type = FIO_OPT_STR_SET,
223 .off1 = offsetof(struct ioring_options, hipri),
224 .help = "Use polled IO completions",
225 .category = FIO_OPT_C_ENGINE,
226 .group = FIO_OPT_G_IOURING,
227 },
228 {
229 .name = "readfua",
230 .lname = "Read fua flag support",
231 .type = FIO_OPT_BOOL,
232 .off1 = offsetof(struct ioring_options, readfua),
233 .help = "Set FUA flag (force unit access) for all Read operations",
234 .def = "0",
235 .category = FIO_OPT_C_ENGINE,
236 .group = FIO_OPT_G_IOURING,
237 },
238 {
239 .name = "writefua",
240 .lname = "Write fua flag support",
241 .type = FIO_OPT_BOOL,
242 .off1 = offsetof(struct ioring_options, writefua),
243 .help = "Set FUA flag (force unit access) for all Write operations",
244 .def = "0",
245 .category = FIO_OPT_C_ENGINE,
246 .group = FIO_OPT_G_IOURING,
247 },
248 {
249 .name = "write_mode",
250 .lname = "Additional Write commands support (Write Uncorrectable, Write Zeroes)",
251 .type = FIO_OPT_STR,
252 .off1 = offsetof(struct ioring_options, write_mode),
253 .help = "Issue Write Uncorrectable or Zeroes command instead of Write command",
254 .def = "write",
255 .posval = {
256 { .ival = "write",
257 .oval = FIO_URING_CMD_WMODE_WRITE,
258 .help = "Issue Write commands for write operations"
259 },
260 { .ival = "uncor",
261 .oval = FIO_URING_CMD_WMODE_UNCOR,
262 .help = "Issue Write Uncorrectable commands for write operations"
263 },
264 { .ival = "zeroes",
265 .oval = FIO_URING_CMD_WMODE_ZEROES,
266 .help = "Issue Write Zeroes commands for write operations"
267 },
268 { .ival = "verify",
269 .oval = FIO_URING_CMD_WMODE_VERIFY,
270 .help = "Issue Verify commands for write operations"
271 },
272 },
273 .category = FIO_OPT_C_ENGINE,
274 .group = FIO_OPT_G_IOURING,
275 },
276 {
277 .name = "verify_mode",
278 .lname = "Verify using the configured command (e.g., Read or Compare)",
279 .type = FIO_OPT_STR,
280 .off1 = offsetof(struct ioring_options, verify_mode),
281 .help = "Issue Read or Compare command in the verification phase",
282 .def = "read",
283 .posval = {
284 { .ival = "read",
285 .oval = FIO_URING_CMD_VMODE_READ,
286 .help = "Issue Read commands in the verification phase"
287 },
288 { .ival = "compare",
289 .oval = FIO_URING_CMD_VMODE_COMPARE,
290 .help = "Issue Compare commands in the verification phase"
291 },
292 },
293 .category = FIO_OPT_C_ENGINE,
294 .group = FIO_OPT_G_IOURING,
295 },
296 {
297 .name = "fixedbufs",
298 .lname = "Fixed (pre-mapped) IO buffers",
299 .type = FIO_OPT_STR_SET,
300 .off1 = offsetof(struct ioring_options, fixedbufs),
301 .help = "Pre-map IO buffers",
302 .category = FIO_OPT_C_ENGINE,
303 .group = FIO_OPT_G_IOURING,
304 },
305 {
306 .name = "registerfiles",
307 .lname = "Register file set",
308 .type = FIO_OPT_STR_SET,
309 .off1 = offsetof(struct ioring_options, registerfiles),
310 .help = "Pre-open/register files",
311 .category = FIO_OPT_C_ENGINE,
312 .group = FIO_OPT_G_IOURING,
313 },
314 {
315 .name = "sqthread_poll",
316 .lname = "Kernel SQ thread polling",
317 .type = FIO_OPT_STR_SET,
318 .off1 = offsetof(struct ioring_options, sqpoll_thread),
319 .help = "Offload submission/completion to kernel thread",
320 .category = FIO_OPT_C_ENGINE,
321 .group = FIO_OPT_G_IOURING,
322 },
323 {
324 .name = "sqthread_poll_cpu",
325 .lname = "SQ Thread Poll CPU",
326 .type = FIO_OPT_INT,
327 .cb = fio_ioring_sqpoll_cb,
328 .help = "What CPU to run SQ thread polling on",
329 .category = FIO_OPT_C_ENGINE,
330 .group = FIO_OPT_G_IOURING,
331 },
332 {
333 .name = "nonvectored",
334 .lname = "Non-vectored",
335 .type = FIO_OPT_INT,
336 .off1 = offsetof(struct ioring_options, nonvectored),
337 .def = "-1",
338 .help = "Use non-vectored read/write commands",
339 .category = FIO_OPT_C_ENGINE,
340 .group = FIO_OPT_G_IOURING,
341 },
342 {
343 .name = "uncached",
344 .lname = "Uncached",
345 .type = FIO_OPT_INT,
346 .off1 = offsetof(struct ioring_options, uncached),
347 .help = "Use RWF_DONTCACHE for buffered read/writes",
348 .category = FIO_OPT_C_ENGINE,
349 .group = FIO_OPT_G_IOURING,
350 },
351 {
352 .name = "nowait",
353 .lname = "RWF_NOWAIT",
354 .type = FIO_OPT_BOOL,
355 .off1 = offsetof(struct ioring_options, nowait),
356 .help = "Use RWF_NOWAIT for reads/writes",
357 .category = FIO_OPT_C_ENGINE,
358 .group = FIO_OPT_G_IOURING,
359 },
360 {
361 .name = "force_async",
362 .lname = "Force async",
363 .type = FIO_OPT_INT,
364 .off1 = offsetof(struct ioring_options, force_async),
365 .help = "Set IOSQE_ASYNC every N requests",
366 .category = FIO_OPT_C_ENGINE,
367 .group = FIO_OPT_G_IOURING,
368 },
369 {
370 .name = "cmd_type",
371 .lname = "Uring cmd type",
372 .type = FIO_OPT_STR,
373 .off1 = offsetof(struct ioring_options, cmd_type),
374 .help = "Specify uring-cmd type",
375 .def = "nvme",
376 .posval = {
377 { .ival = "nvme",
378 .oval = FIO_URING_CMD_NVME,
379 .help = "Issue nvme-uring-cmd",
380 },
381 },
382 .category = FIO_OPT_C_ENGINE,
383 .group = FIO_OPT_G_IOURING,
384 },
385 CMDPRIO_OPTIONS(struct ioring_options, FIO_OPT_G_IOURING),
386 {
387 .name = "md_per_io_size",
388 .lname = "Separate Metadata Buffer Size per I/O",
389 .type = FIO_OPT_INT,
390 .off1 = offsetof(struct ioring_options, md_per_io_size),
391 .def = "0",
392 .help = "Size of separate metadata buffer per I/O (Default: 0)",
393 .category = FIO_OPT_C_ENGINE,
394 .group = FIO_OPT_G_IOURING,
395 },
396 {
397 .name = "pi_act",
398 .lname = "Protection Information Action",
399 .type = FIO_OPT_BOOL,
400 .off1 = offsetof(struct ioring_options, pi_act),
401 .def = "1",
402 .help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
403 .category = FIO_OPT_C_ENGINE,
404 .group = FIO_OPT_G_IOURING,
405 },
406 {
407 .name = "pi_chk",
408 .lname = "Protection Information Check",
409 .type = FIO_OPT_STR_STORE,
410 .off1 = offsetof(struct ioring_options, pi_chk),
411 .def = NULL,
412 .help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
413 .category = FIO_OPT_C_ENGINE,
414 .group = FIO_OPT_G_IOURING,
415 },
416 {
417 .name = "apptag",
418 .lname = "Application Tag used in Protection Information",
419 .type = FIO_OPT_INT,
420 .off1 = offsetof(struct ioring_options, apptag),
421 .def = "0x1234",
422 .help = "Application Tag used in Protection Information field (Default: 0x1234)",
423 .category = FIO_OPT_C_ENGINE,
424 .group = FIO_OPT_G_IOURING,
425 },
426 {
427 .name = "apptag_mask",
428 .lname = "Application Tag Mask",
429 .type = FIO_OPT_INT,
430 .off1 = offsetof(struct ioring_options, apptag_mask),
431 .def = "0xffff",
432 .help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
433 .category = FIO_OPT_C_ENGINE,
434 .group = FIO_OPT_G_IOURING,
435 },
436 {
437 .name = "deac",
438 .lname = "Deallocate bit for write zeroes command",
439 .type = FIO_OPT_BOOL,
440 .off1 = offsetof(struct ioring_options, deac),
441 .help = "Set DEAC (deallocate) flag for write zeroes command",
442 .def = "0",
443 .category = FIO_OPT_C_ENGINE,
444 .group = FIO_OPT_G_IOURING,
445 },
446 {
447 .name = NULL,
448 },
449};
450
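/*
 * Illustrative example: a minimal fio job combining the options above for
 * NVMe passthrough with end-to-end protection information. The device path
 * and sizes are placeholders and must match the target device's metadata
 * format.
 *
 *   [pi-test]
 *   ioengine=io_uring_cmd
 *   cmd_type=nvme
 *   filename=/dev/ng0n1
 *   rw=randwrite
 *   bs=4k
 *   iodepth=16
 *   md_per_io_size=64
 *   pi_act=0
 *   pi_chk=GUARD,REFTAG,APPTAG
 *   apptag=0x1234
 *   apptag_mask=0xffff
 */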
451static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
452 unsigned int min_complete, unsigned int flags)
453{
454#ifdef FIO_ARCH_HAS_SYSCALL
455 return __do_syscall6(__NR_io_uring_enter, ld->ring_fd, to_submit,
456 min_complete, flags, NULL, 0);
457#else
458 return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
459 min_complete, flags, NULL, 0);
460#endif
461}
462
463#ifndef BLOCK_URING_CMD_DISCARD
464#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
465#endif
466
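/*
 * Attach protection information to a read/write SQE: point attr_ptr at
 * the per-io io_uring_attr_pi, point its addr at the separate metadata
 * buffer, and seed the reference tag with the starting LBA when REFTAG
 * checking is enabled.
 */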
467static void fio_ioring_prep_md(struct thread_data *td, struct io_u *io_u)
468{
469 struct ioring_data *ld = td->io_ops_data;
470 struct io_uring_attr_pi *pi_attr = io_u->pi_attr;
471 struct nvme_data *data = FILE_ENG_DATA(io_u->file);
472 struct io_uring_sqe *sqe;
473
474 sqe = &ld->sqes[io_u->index];
475
476 sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;
477 sqe->attr_ptr = (__u64)(uintptr_t)pi_attr;
478 pi_attr->addr = (__u64)(uintptr_t)io_u->mmap_data;
479
480 if (pi_attr->flags & IO_INTEGRITY_CHK_REFTAG) {
481 __u64 slba = get_slba(data, io_u->offset);
482 pi_attr->seed = (__u32)slba;
483 }
484}
485
486static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
487{
488 struct ioring_data *ld = td->io_ops_data;
489 struct ioring_options *o = td->eo;
490 struct fio_file *f = io_u->file;
491 struct io_uring_sqe *sqe;
492
493 sqe = &ld->sqes[io_u->index];
494
495 if (o->registerfiles) {
496 sqe->fd = f->engine_pos;
497 sqe->flags = IOSQE_FIXED_FILE;
498 } else {
499 sqe->fd = f->fd;
500 sqe->flags = 0;
501 }
502
503 if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
504 if (o->fixedbufs) {
505 sqe->opcode = fixed_ddir_to_op[io_u->ddir];
506 sqe->addr = (unsigned long) io_u->xfer_buf;
507 sqe->len = io_u->xfer_buflen;
508 sqe->buf_index = io_u->index;
509 } else {
510 struct iovec *iov = &ld->iovecs[io_u->index];
511
512 /*
513 * Update based on the actual io_u; a requeue could have
514 * adjusted these.
515 */
516 iov->iov_base = io_u->xfer_buf;
517 iov->iov_len = io_u->xfer_buflen;
518
519 sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored];
520 if (o->nonvectored) {
521 sqe->addr = (unsigned long) iov->iov_base;
522 sqe->len = iov->iov_len;
523 } else {
524 sqe->addr = (unsigned long) iov;
525 sqe->len = 1;
526 }
527 }
528 if (o->md_per_io_size)
529 fio_ioring_prep_md(td, io_u);
530 sqe->rw_flags = 0;
531 if (!td->o.odirect && o->uncached)
532 sqe->rw_flags |= RWF_DONTCACHE;
533 if (o->nowait)
534 sqe->rw_flags |= RWF_NOWAIT;
535 if (td->o.oatomic && io_u->ddir == DDIR_WRITE)
536 sqe->rw_flags |= RWF_ATOMIC;
537
538 /*
539 * Since io_uring can have a submission context (sqthread_poll)
540 * that is different from the process context, we cannot rely on
541 * the IO priority set by ioprio_set() (options prio, prioclass,
542 * and priohint) to be inherited.
543 * td->ioprio will have the value of the "default prio", so set
544 * this unconditionally. This value might get overridden by
545 * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or
546 * cmdprio_bssplit is used.
547 */
548 sqe->ioprio = td->ioprio;
549 sqe->off = io_u->offset;
550 } else if (ddir_sync(io_u->ddir)) {
551 sqe->ioprio = 0;
552 if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
553 sqe->off = f->first_write;
554 sqe->len = f->last_write - f->first_write;
555 sqe->sync_range_flags = td->o.sync_file_range;
556 sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
557 } else {
558 sqe->off = 0;
559 sqe->addr = 0;
560 sqe->len = 0;
561 if (io_u->ddir == DDIR_DATASYNC)
562 sqe->fsync_flags |= IORING_FSYNC_DATASYNC;
563 sqe->opcode = IORING_OP_FSYNC;
564 }
565 } else if (io_u->ddir == DDIR_TRIM) {
566 sqe->opcode = IORING_OP_URING_CMD;
567 sqe->addr = io_u->offset;
568 sqe->addr3 = io_u->xfer_buflen;
569 sqe->rw_flags = 0;
570 sqe->len = sqe->off = 0;
571 sqe->ioprio = 0;
572 sqe->cmd_op = BLOCK_URING_CMD_DISCARD;
573 sqe->__pad1 = 0;
574 sqe->file_index = 0;
575 }
576
577 if (o->force_async && ++ld->prepped == o->force_async) {
578 ld->prepped = 0;
579 sqe->flags |= IOSQE_ASYNC;
580 }
581
582 sqe->user_data = (unsigned long) io_u;
583 return 0;
584}
585
586static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
587{
588 struct ioring_data *ld = td->io_ops_data;
589 struct ioring_options *o = td->eo;
590 struct fio_file *f = io_u->file;
591 struct nvme_uring_cmd *cmd;
592 struct io_uring_sqe *sqe;
593 struct nvme_dsm *dsm;
594 void *ptr = ld->dsm;
595 unsigned int dsm_size;
596 uint8_t read_opcode = nvme_cmd_read;
597
598 /* only supports nvme_uring_cmd */
599 if (o->cmd_type != FIO_URING_CMD_NVME)
600 return -EINVAL;
601
602 if (io_u->ddir == DDIR_TRIM && td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)
603 return 0;
604
605 sqe = &ld->sqes[(io_u->index) << 1];
606
607 if (o->registerfiles) {
608 sqe->fd = f->engine_pos;
609 sqe->flags = IOSQE_FIXED_FILE;
610 } else {
611 sqe->fd = f->fd;
612 }
613 sqe->rw_flags = 0;
614 if (!td->o.odirect && o->uncached)
615 sqe->rw_flags |= RWF_DONTCACHE;
616 if (o->nowait)
617 sqe->rw_flags |= RWF_NOWAIT;
618
619 sqe->opcode = IORING_OP_URING_CMD;
620 sqe->user_data = (unsigned long) io_u;
621 if (o->nonvectored)
622 sqe->cmd_op = NVME_URING_CMD_IO;
623 else
624 sqe->cmd_op = NVME_URING_CMD_IO_VEC;
625 if (o->force_async && ++ld->prepped == o->force_async) {
626 ld->prepped = 0;
627 sqe->flags |= IOSQE_ASYNC;
628 }
629 if (o->fixedbufs) {
630 sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
631 sqe->buf_index = io_u->index;
632 }
633
634 cmd = (struct nvme_uring_cmd *)sqe->cmd;
635 dsm_size = sizeof(*ld->dsm) + td->o.num_range * sizeof(struct nvme_dsm_range);
636 ptr += io_u->index * dsm_size;
637 dsm = (struct nvme_dsm *)ptr;
638
639 /*
640 * If the READ command belongs to the verification phase and
641 * verify_mode=compare, convert the READ to a COMPARE command.
642 */
643 if (io_u->flags & IO_U_F_VER_LIST && io_u->ddir == DDIR_READ &&
644 o->verify_mode == FIO_URING_CMD_VMODE_COMPARE) {
645 populate_verify_io_u(td, io_u);
646 read_opcode = nvme_cmd_compare;
647 io_u_set(td, io_u, IO_U_F_VER_IN_DEV);
648 }
649
650 return fio_nvme_uring_cmd_prep(cmd, io_u,
651 o->nonvectored ? NULL : &ld->iovecs[io_u->index],
652 dsm, read_opcode, ld->write_opcode,
653 ld->cdw12_flags[io_u->ddir]);
654}
655
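/*
 * Completion-side PI check for the plain io_uring engine: when the
 * device is formatted with protection information and pi_act=0, verify
 * the PI of completed reads and flag any mismatch as an I/O error.
 */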
656static void fio_ioring_validate_md(struct thread_data *td, struct io_u *io_u)
657{
658 struct nvme_data *data;
659 struct ioring_options *o = td->eo;
660 int ret;
661
662 data = FILE_ENG_DATA(io_u->file);
663 if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) {
664 ret = fio_nvme_pi_verify(data, io_u);
665 if (ret)
666 io_u->error = -ret;
667 }
668
669 return;
670}
671
672static struct io_u *fio_ioring_event(struct thread_data *td, int event)
673{
674 struct ioring_data *ld = td->io_ops_data;
675 struct ioring_options *o = td->eo;
676 struct io_uring_cqe *cqe;
677 struct io_u *io_u;
678 unsigned index;
679
680 index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
681
682 cqe = &ld->cq_ring.cqes[index];
683 io_u = (struct io_u *) (uintptr_t) cqe->user_data;
684
685 /* trim returns 0 on success */
686 if (cqe->res == io_u->xfer_buflen ||
687 (io_u->ddir == DDIR_TRIM && !cqe->res)) {
688 io_u->error = 0;
689 return io_u;
690 }
691
692 if (cqe->res != io_u->xfer_buflen) {
693 if (io_u->ddir == DDIR_TRIM) {
694 ld->async_trim_fail = 1;
695 cqe->res = 0;
696 }
697 if (cqe->res > io_u->xfer_buflen)
698 io_u->error = -cqe->res;
699 else
700 io_u->resid = io_u->xfer_buflen - cqe->res;
701
702 return io_u;
703 }
704
705 if (o->md_per_io_size)
706 fio_ioring_validate_md(td, io_u);
707
708 return io_u;
709}
710
711static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event)
712{
713 struct ioring_data *ld = td->io_ops_data;
714 struct ioring_options *o = td->eo;
715 struct io_uring_cqe *cqe;
716 struct io_u *io_u;
717 struct nvme_data *data;
718 unsigned index;
719 int ret;
720
721 index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
722 if (o->cmd_type == FIO_URING_CMD_NVME)
723 index <<= 1;
724
725 cqe = &ld->cq_ring.cqes[index];
726 io_u = (struct io_u *) (uintptr_t) cqe->user_data;
727
728 io_u->error = cqe->res;
729 if (io_u->error != 0)
730 goto ret;
731
732 if (o->cmd_type == FIO_URING_CMD_NVME) {
733 data = FILE_ENG_DATA(io_u->file);
734 if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) {
735 ret = fio_nvme_pi_verify(data, io_u);
736 if (ret)
737 io_u->error = ret;
738 }
739 }
740
741ret:
742 /*
743 * If IO_U_F_DEVICE_ERROR is not set, io_u->error is parsed as an errno;
744 * otherwise it holds a device-specific error value (the CQE status).
745 */
746 if ((int)io_u->error > 0)
747 io_u_set(td, io_u, IO_U_F_DEVICE_ERROR);
748 else
749 io_u_clear(td, io_u, IO_U_F_DEVICE_ERROR);
750 io_u->error = abs((int)io_u->error);
751 return io_u;
752}
753
754static char *fio_ioring_cmd_errdetails(struct thread_data *td,
755 struct io_u *io_u)
756{
757 struct ioring_options *o = td->eo;
758 unsigned int sct = (io_u->error >> 8) & 0x7;
759 unsigned int sc = io_u->error & 0xff;
760#define MAXERRDETAIL 1024
761#define MAXMSGCHUNK 128
762 char *msg, msgchunk[MAXMSGCHUNK];
763
764 if (!(io_u->flags & IO_U_F_DEVICE_ERROR))
765 return NULL;
766
767 msg = calloc(1, MAXERRDETAIL);
768 strcpy(msg, "io_uring_cmd: ");
769
770 snprintf(msgchunk, MAXMSGCHUNK, "%s: ", io_u->file->file_name);
771 strlcat(msg, msgchunk, MAXERRDETAIL);
772
773 if (o->cmd_type == FIO_URING_CMD_NVME) {
774 strlcat(msg, "cq entry status (", MAXERRDETAIL);
775
776 snprintf(msgchunk, MAXMSGCHUNK, "sct=0x%02x; ", sct);
777 strlcat(msg, msgchunk, MAXERRDETAIL);
778
779 snprintf(msgchunk, MAXMSGCHUNK, "sc=0x%02x)", sc);
780 strlcat(msg, msgchunk, MAXERRDETAIL);
781 } else {
782 /* Print the status code in generic form */
783 snprintf(msgchunk, MAXMSGCHUNK, "status=0x%x", io_u->error);
784 strlcat(msg, msgchunk, MAXERRDETAIL);
785 }
786
787 return msg;
788}
789
790static unsigned fio_ioring_cqring_reap(struct thread_data *td, unsigned int max)
791{
792 struct ioring_data *ld = td->io_ops_data;
793 struct io_cq_ring *ring = &ld->cq_ring;
794 unsigned head = *ring->head;
795 unsigned available = atomic_load_acquire(ring->tail) - head;
796
797 if (!available)
798 return 0;
799
800 available = min(available, max);
801 /*
802 * The CQ consumer index is advanced before the CQEs are actually read.
803 * This is generally unsafe, as it lets the kernel reuse the CQE slots.
804 * However, the CQ is sized large enough for the maximum iodepth and a
805 * new SQE won't be submitted until the CQE is processed, so the CQE
806 * slot won't actually be reused until it has been processed.
807 */
808 atomic_store_relaxed(ring->head, head + available);
809 return available;
810}
811
812static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
813 unsigned int max, const struct timespec *t)
814{
815 struct ioring_data *ld = td->io_ops_data;
816 unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
817 struct ioring_options *o = td->eo;
818 struct io_cq_ring *ring = &ld->cq_ring;
819 unsigned events = 0;
820 int r;
821
822 ld->cq_ring_off = *ring->head;
823 for (;;) {
824 r = fio_ioring_cqring_reap(td, max - events);
825 if (r) {
826 events += r;
827 if (events >= min)
828 return events;
829
830 if (actual_min != 0)
831 actual_min -= r;
832 }
833
834 if (!o->sqpoll_thread) {
835 r = io_uring_enter(ld, 0, actual_min,
836 IORING_ENTER_GETEVENTS);
837 if (r < 0) {
838 if (errno == EAGAIN || errno == EINTR)
839 continue;
840 r = -errno;
841 td_verror(td, errno, "io_uring_enter");
842 return r;
843 }
844 }
845 }
846}
847
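/*
 * For NVMe passthrough, fill the protection information fields of the
 * prepared uring command right before submission; trims are skipped.
 */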
848static inline void fio_ioring_cmd_nvme_pi(struct thread_data *td,
849 struct io_u *io_u)
850{
851 struct ioring_data *ld = td->io_ops_data;
852 struct nvme_uring_cmd *cmd;
853 struct io_uring_sqe *sqe;
854
855 if (io_u->ddir == DDIR_TRIM)
856 return;
857
858 sqe = &ld->sqes[(io_u->index) << 1];
859 cmd = (struct nvme_uring_cmd *)sqe->cmd;
860
861 fio_nvme_pi_fill(cmd, io_u, &ld->ext_opts);
862}
863
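/*
 * For the plain io_uring engine with a separate metadata buffer,
 * generate the PI guard for the request before submission; trims are
 * skipped.
 */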
864static inline void fio_ioring_setup_pi(struct thread_data *td,
865 struct io_u *io_u)
866{
867 struct ioring_data *ld = td->io_ops_data;
868
869 if (io_u->ddir == DDIR_TRIM)
870 return;
871
872 fio_nvme_generate_guard(io_u, &ld->ext_opts);
873}
874
875static inline void fio_ioring_cmdprio_prep(struct thread_data *td,
876 struct io_u *io_u)
877{
878 struct ioring_data *ld = td->io_ops_data;
879 struct cmdprio *cmdprio = &ld->cmdprio;
880
881 if (fio_cmdprio_set_ioprio(td, cmdprio, io_u))
882 ld->sqes[io_u->index].ioprio = io_u->ioprio;
883}
884
885static enum fio_q_status fio_ioring_queue(struct thread_data *td,
886 struct io_u *io_u)
887{
888 struct ioring_data *ld = td->io_ops_data;
889 struct ioring_options *o = td->eo;
890 struct io_sq_ring *ring = &ld->sq_ring;
891 unsigned tail;
892
893 fio_ro_check(td, io_u);
894
895 /* should not hit... */
896 if (ld->queued == td->o.iodepth)
897 return FIO_Q_BUSY;
898
899 /* if async trim has been tried and failed, punt to sync */
900 if (io_u->ddir == DDIR_TRIM && ld->async_trim_fail) {
901 if (ld->queued)
902 return FIO_Q_BUSY;
903
904 do_io_u_trim(td, io_u);
905
906 io_u_mark_submit(td, 1);
907 io_u_mark_complete(td, 1);
908 return FIO_Q_COMPLETED;
909 }
910
911 if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
912 fio_ioring_cmdprio_prep(td, io_u);
913
914 if (o->cmd_type == FIO_URING_CMD_NVME && ld->is_uring_cmd_eng)
915 fio_ioring_cmd_nvme_pi(td, io_u);
916 else if (o->md_per_io_size)
917 fio_ioring_setup_pi(td, io_u);
918
919 tail = *ring->tail;
920 ring->array[tail & ld->sq_ring_mask] = io_u->index;
921 atomic_store_release(ring->tail, tail + 1);
922
923 ld->queued++;
924 return FIO_Q_QUEUED;
925}
926
927static void fio_ioring_queued(struct thread_data *td, int start, int nr)
928{
929 struct ioring_data *ld = td->io_ops_data;
930 struct timespec now;
931
932 if (!fio_fill_issue_time(td))
933 return;
934
935 fio_gettime(&now, NULL);
936
937 while (nr--) {
938 struct io_sq_ring *ring = &ld->sq_ring;
939 int index = ring->array[start & ld->sq_ring_mask];
940 struct io_u *io_u = ld->io_u_index[index];
941
942 memcpy(&io_u->issue_time, &now, sizeof(now));
943 io_u_queued(td, io_u);
944
945 start++;
946 }
947
948 /*
949 * only used for iolog
950 */
951 if (td->o.read_iolog_file)
952 memcpy(&td->last_issue, &now, sizeof(now));
953}
954
955static int fio_ioring_commit(struct thread_data *td)
956{
957 struct ioring_data *ld = td->io_ops_data;
958 struct ioring_options *o = td->eo;
959 int ret;
960
961 if (!ld->queued)
962 return 0;
963
964 /*
965 * The kernel side does the submission. We just need to check if the
966 * ring is flagged as needing a kick; if so, call io_uring_enter().
967 * This only happens if we've been idle too long.
968 */
969 if (o->sqpoll_thread) {
970 struct io_sq_ring *ring = &ld->sq_ring;
971 unsigned start = *ld->sq_ring.tail - ld->queued;
972 unsigned flags;
973
974 flags = atomic_load_relaxed(ring->flags);
975 if (flags & IORING_SQ_NEED_WAKEUP)
976 io_uring_enter(ld, ld->queued, 0,
977 IORING_ENTER_SQ_WAKEUP);
978 fio_ioring_queued(td, start, ld->queued);
979 io_u_mark_submit(td, ld->queued);
980
981 ld->queued = 0;
982 return 0;
983 }
984
985 do {
986 unsigned start = *ld->sq_ring.head;
987 long nr = ld->queued;
988
989 ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
990 if (ret > 0) {
991 fio_ioring_queued(td, start, ret);
992 io_u_mark_submit(td, ret);
993
994 ld->queued -= ret;
995 ret = 0;
996 } else if (!ret) {
997 io_u_mark_submit(td, ret);
998 continue;
999 } else {
1000 if (errno == EAGAIN || errno == EINTR) {
1001 ret = fio_ioring_cqring_reap(td, ld->queued);
1002 if (ret)
1003 continue;
1004 /* Shouldn't happen */
1005 usleep(1);
1006 continue;
1007 }
1008 ret = -errno;
1009 td_verror(td, errno, "io_uring_enter submit");
1010 break;
1011 }
1012 } while (ld->queued);
1013
1014 return ret;
1015}
1016
1017static void fio_ioring_unmap(struct ioring_data *ld)
1018{
1019 int i;
1020
1021 for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
1022 munmap(ld->mmap[i].ptr, ld->mmap[i].len);
1023 close(ld->ring_fd);
1024}
1025
1026static void fio_ioring_cleanup(struct thread_data *td)
1027{
1028 struct ioring_data *ld = td->io_ops_data;
1029
1030 if (ld) {
1031 if (!(td->flags & TD_F_CHILD))
1032 fio_ioring_unmap(ld);
1033
1034 fio_cmdprio_cleanup(&ld->cmdprio);
1035 free(ld->io_u_index);
1036 free(ld->md_buf);
1037 free(ld->pi_attr);
1038 free(ld->iovecs);
1039 free(ld->fds);
1040 free(ld->dsm);
1041 free(ld);
1042 }
1043}
1044
1045static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p)
1046{
1047 struct io_sq_ring *sring = &ld->sq_ring;
1048 struct io_cq_ring *cring = &ld->cq_ring;
1049 void *ptr;
1050
1051 ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32);
1052 ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
1053 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
1054 IORING_OFF_SQ_RING);
1055 ld->mmap[0].ptr = ptr;
1056 sring->head = ptr + p->sq_off.head;
1057 sring->tail = ptr + p->sq_off.tail;
1058 sring->ring_mask = ptr + p->sq_off.ring_mask;
1059 sring->ring_entries = ptr + p->sq_off.ring_entries;
1060 sring->flags = ptr + p->sq_off.flags;
1061 sring->array = ptr + p->sq_off.array;
1062 ld->sq_ring_mask = *sring->ring_mask;
1063
1064 if (p->flags & IORING_SETUP_SQE128)
1065 ld->mmap[1].len = 2 * p->sq_entries * sizeof(struct io_uring_sqe);
1066 else
1067 ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
1068 ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
1069 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
1070 IORING_OFF_SQES);
1071 ld->mmap[1].ptr = ld->sqes;
1072
1073 if (p->flags & IORING_SETUP_CQE32) {
1074 ld->mmap[2].len = p->cq_off.cqes +
1075 2 * p->cq_entries * sizeof(struct io_uring_cqe);
1076 } else {
1077 ld->mmap[2].len = p->cq_off.cqes +
1078 p->cq_entries * sizeof(struct io_uring_cqe);
1079 }
1080 ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
1081 MAP_SHARED | MAP_POPULATE, ld->ring_fd,
1082 IORING_OFF_CQ_RING);
1083 ld->mmap[2].ptr = ptr;
1084 cring->head = ptr + p->cq_off.head;
1085 cring->tail = ptr + p->cq_off.tail;
1086 cring->ring_mask = ptr + p->cq_off.ring_mask;
1087 cring->ring_entries = ptr + p->cq_off.ring_entries;
1088 cring->cqes = ptr + p->cq_off.cqes;
1089 ld->cq_ring_mask = *cring->ring_mask;
1090 return 0;
1091}
1092
1093static void fio_ioring_probe(struct thread_data *td)
1094{
1095 struct ioring_data *ld = td->io_ops_data;
1096 struct ioring_options *o = td->eo;
1097 struct io_uring_probe *p;
1098 int ret;
1099
1100 /* already set by user, don't touch */
1101 if (o->nonvectored != -1)
1102 return;
1103
1104 /* default to off, as that's always safe */
1105 o->nonvectored = 0;
1106
1107 p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
1108 if (!p)
1109 return;
1110
1111 ret = syscall(__NR_io_uring_register, ld->ring_fd,
1112 IORING_REGISTER_PROBE, p, 256);
1113 if (ret < 0)
1114 goto out;
1115
1116 if (IORING_OP_WRITE > p->ops_len)
1117 goto out;
1118
1119 if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED) &&
1120 (p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED))
1121 o->nonvectored = 1;
1122out:
1123 free(p);
1124}
1125
1126static int fio_ioring_queue_init(struct thread_data *td)
1127{
1128 struct ioring_data *ld = td->io_ops_data;
1129 struct ioring_options *o = td->eo;
1130 int depth = ld->iodepth;
1131 struct io_uring_params p;
1132 int ret;
1133
1134 memset(&p, 0, sizeof(p));
1135
1136 if (o->hipri)
1137 p.flags |= IORING_SETUP_IOPOLL;
1138 if (o->sqpoll_thread) {
1139 p.flags |= IORING_SETUP_SQPOLL;
1140 if (o->sqpoll_set) {
1141 p.flags |= IORING_SETUP_SQ_AFF;
1142 p.sq_thread_cpu = o->sqpoll_cpu;
1143 }
1144
1145 /*
1146 * Submission latency for sqpoll_thread is just the time it
1147 * takes to fill in the SQ ring entries, plus any syscall if
1148 * IORING_SQ_NEED_WAKEUP is set; we don't need to log that time
1149 * separately.
1150 */
1151 td->o.disable_slat = 1;
1152 }
1153
1154 /*
1155 * Clamp the CQ ring size at our SQ ring size; we don't need more
1156 * entries than that.
1157 */
1158 p.flags |= IORING_SETUP_CQSIZE;
1159 p.cq_entries = depth;
1160
1161 /*
1162 * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
1163 * completing IO operations.
1164 */
1165 p.flags |= IORING_SETUP_COOP_TASKRUN;
1166
1167 /*
1168 * io_uring is always a single issuer, and we can defer task_work
1169 * runs until we reap events.
1170 */
1171 p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
1172
1173retry:
1174 ret = syscall(__NR_io_uring_setup, depth, &p);
1175 if (ret < 0) {
1176 if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
1177 p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
1178 p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
1179 goto retry;
1180 }
1181 if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
1182 p.flags &= ~IORING_SETUP_COOP_TASKRUN;
1183 goto retry;
1184 }
1185 if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
1186 p.flags &= ~IORING_SETUP_CQSIZE;
1187 goto retry;
1188 }
1189 return ret;
1190 }
1191
1192 ld->ring_fd = ret;
1193
1194 fio_ioring_probe(td);
1195
1196 if (o->fixedbufs) {
1197 ret = syscall(__NR_io_uring_register, ld->ring_fd,
1198 IORING_REGISTER_BUFFERS, ld->iovecs, depth);
1199 if (ret < 0)
1200 return ret;
1201 }
1202
1203 return fio_ioring_mmap(ld, &p);
1204}
1205
1206static int fio_ioring_cmd_queue_init(struct thread_data *td)
1207{
1208 struct ioring_data *ld = td->io_ops_data;
1209 struct ioring_options *o = td->eo;
1210 int depth = ld->iodepth;
1211 struct io_uring_params p;
1212 int ret;
1213
1214 memset(&p, 0, sizeof(p));
1215
1216 if (o->hipri)
1217 p.flags |= IORING_SETUP_IOPOLL;
1218 if (o->sqpoll_thread) {
1219 p.flags |= IORING_SETUP_SQPOLL;
1220 if (o->sqpoll_set) {
1221 p.flags |= IORING_SETUP_SQ_AFF;
1222 p.sq_thread_cpu = o->sqpoll_cpu;
1223 }
1224
1225 /*
1226 * Submission latency for sqpoll_thread is just the time it
1227 * takes to fill in the SQ ring entries, plus any syscall if
1228 * IORING_SQ_NEED_WAKEUP is set; we don't need to log that time
1229 * separately.
1230 */
1231 td->o.disable_slat = 1;
1232 }
1233 if (o->cmd_type == FIO_URING_CMD_NVME) {
1234 p.flags |= IORING_SETUP_SQE128;
1235 p.flags |= IORING_SETUP_CQE32;
1236 }
1237
1238 /*
1239 * Clamp the CQ ring size at our SQ ring size; we don't need more
1240 * entries than that.
1241 */
1242 p.flags |= IORING_SETUP_CQSIZE;
1243 p.cq_entries = depth;
1244
1245 /*
1246 * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
1247 * completing IO operations.
1248 */
1249 p.flags |= IORING_SETUP_COOP_TASKRUN;
1250
1251 /*
1252 * io_uring is always a single issuer, and we can defer task_work
1253 * runs until we reap events.
1254 */
1255 p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
1256
1257retry:
1258 ret = syscall(__NR_io_uring_setup, depth, &p);
1259 if (ret < 0) {
1260 if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
1261 p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
1262 p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
1263 goto retry;
1264 }
1265 if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
1266 p.flags &= ~IORING_SETUP_COOP_TASKRUN;
1267 goto retry;
1268 }
1269 if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
1270 p.flags &= ~IORING_SETUP_CQSIZE;
1271 goto retry;
1272 }
1273 return ret;
1274 }
1275
1276 ld->ring_fd = ret;
1277
1278 fio_ioring_probe(td);
1279
1280 if (o->fixedbufs) {
1281 ret = syscall(__NR_io_uring_register, ld->ring_fd,
1282 IORING_REGISTER_BUFFERS, ld->iovecs, depth);
1283 if (ret < 0)
1284 return ret;
1285 }
1286
1287 return fio_ioring_mmap(ld, &p);
1288}
1289
1290static int fio_ioring_register_files(struct thread_data *td)
1291{
1292 struct ioring_data *ld = td->io_ops_data;
1293 struct fio_file *f;
1294 unsigned int i;
1295 int ret;
1296
1297 ld->fds = calloc(td->o.nr_files, sizeof(int));
1298
1299 for_each_file(td, f, i) {
1300 ret = generic_open_file(td, f);
1301 if (ret)
1302 goto err;
1303 ld->fds[i] = f->fd;
1304 f->engine_pos = i;
1305 }
1306
1307 ret = syscall(__NR_io_uring_register, ld->ring_fd,
1308 IORING_REGISTER_FILES, ld->fds, td->o.nr_files);
1309 if (ret) {
1310err:
1311 free(ld->fds);
1312 ld->fds = NULL;
1313 }
1314
1315 /*
1316 * Pretend the file is closed again, and really close it if we hit
1317 * an error.
1318 */
1319 for_each_file(td, f, i) {
1320 if (ret) {
1321 int fio_unused ret2;
1322 ret2 = generic_close_file(td, f);
1323 } else
1324 f->fd = -1;
1325 }
1326
1327 return ret;
1328}
1329
1330static int fio_ioring_post_init(struct thread_data *td)
1331{
1332 struct ioring_data *ld = td->io_ops_data;
1333 struct ioring_options *o = td->eo;
1334 struct io_u *io_u;
1335 int err, i;
1336
1337 for (i = 0; i < td->o.iodepth; i++) {
1338 struct iovec *iov = &ld->iovecs[i];
1339
1340 io_u = ld->io_u_index[i];
1341 iov->iov_base = io_u->buf;
1342 iov->iov_len = td_max_bs(td);
1343 }
1344
1345 err = fio_ioring_queue_init(td);
1346 if (err) {
1347 int init_err = errno;
1348
1349 if (init_err == ENOSYS)
1350 log_err("fio: your kernel doesn't support io_uring\n");
1351 td_verror(td, init_err, "io_queue_init");
1352 return 1;
1353 }
1354
1355 for (i = 0; i < ld->iodepth; i++) {
1356 struct io_uring_sqe *sqe;
1357
1358 sqe = &ld->sqes[i];
1359 memset(sqe, 0, sizeof(*sqe));
1360 }
1361
1362 if (o->registerfiles) {
1363 err = fio_ioring_register_files(td);
1364 if (err) {
1365 td_verror(td, errno, "ioring_register_files");
1366 return 1;
1367 }
1368 }
1369
1370 return 0;
1371}
1372
1373static int fio_ioring_cmd_post_init(struct thread_data *td)
1374{
1375 struct ioring_data *ld = td->io_ops_data;
1376 struct ioring_options *o = td->eo;
1377 struct io_u *io_u;
1378 int err, i;
1379
1380 for (i = 0; i < td->o.iodepth; i++) {
1381 struct iovec *iov = &ld->iovecs[i];
1382
1383 io_u = ld->io_u_index[i];
1384 iov->iov_base = io_u->buf;
1385 iov->iov_len = td_max_bs(td);
1386 }
1387
1388 err = fio_ioring_cmd_queue_init(td);
1389 if (err) {
1390 int init_err = errno;
1391
1392 td_verror(td, init_err, "io_queue_init");
1393 return 1;
1394 }
1395
1396 for (i = 0; i < ld->iodepth; i++) {
1397 struct io_uring_sqe *sqe;
1398
1399 if (o->cmd_type == FIO_URING_CMD_NVME) {
1400 sqe = &ld->sqes[i << 1];
1401 memset(sqe, 0, 2 * sizeof(*sqe));
1402 } else {
1403 sqe = &ld->sqes[i];
1404 memset(sqe, 0, sizeof(*sqe));
1405 }
1406 }
1407
1408 if (o->registerfiles) {
1409 err = fio_ioring_register_files(td);
1410 if (err) {
1411 td_verror(td, errno, "ioring_register_files");
1412 return 1;
1413 }
1414 }
1415
1416 return 0;
1417}
1418
1419static void parse_prchk_flags(struct ioring_options *o)
1420{
1421 if (!o->pi_chk)
1422 return;
1423
1424 if (strstr(o->pi_chk, "GUARD") != NULL)
1425 o->prchk = NVME_IO_PRINFO_PRCHK_GUARD;
1426 if (strstr(o->pi_chk, "REFTAG") != NULL)
1427 o->prchk |= NVME_IO_PRINFO_PRCHK_REF;
1428 if (strstr(o->pi_chk, "APPTAG") != NULL)
1429 o->prchk |= NVME_IO_PRINFO_PRCHK_APP;
1430}
1431
1432static int fio_ioring_cmd_init(struct thread_data *td, struct ioring_data *ld)
1433{
1434 struct ioring_options *o = td->eo;
1435
1436 if (td_write(td)) {
1437 switch (o->write_mode) {
1438 case FIO_URING_CMD_WMODE_UNCOR:
1439 ld->write_opcode = nvme_cmd_write_uncor;
1440 break;
1441 case FIO_URING_CMD_WMODE_ZEROES:
1442 ld->write_opcode = nvme_cmd_write_zeroes;
1443 if (o->deac)
1444 ld->cdw12_flags[DDIR_WRITE] = 1 << 25;
1445 break;
1446 case FIO_URING_CMD_WMODE_VERIFY:
1447 ld->write_opcode = nvme_cmd_verify;
1448 break;
1449 default:
1450 ld->write_opcode = nvme_cmd_write;
1451 break;
1452 }
1453 }
1454
1455 if (o->readfua)
1456 ld->cdw12_flags[DDIR_READ] = 1 << 30;
1457 if (o->writefua)
1458 ld->cdw12_flags[DDIR_WRITE] = 1 << 30;
1459
1460 return 0;
1461}
1462
1463static int fio_ioring_init(struct thread_data *td)
1464{
1465 struct ioring_options *o = td->eo;
1466 struct ioring_data *ld;
1467 struct nvme_dsm *dsm;
1468 void *ptr;
1469 unsigned int dsm_size;
1470 unsigned long long md_size;
1471 int ret, i;
1472 struct nvme_cmd_ext_io_opts *ext_opts;
1473
1474 /* sqthread submission requires registered files */
1475 if (o->sqpoll_thread)
1476 o->registerfiles = 1;
1477
1478 if (o->registerfiles && td->o.nr_files != td->o.open_files) {
1479 log_err("fio: io_uring registered files require nr_files to "
1480 "be identical to open_files\n");
1481 return 1;
1482 }
1483
1484 ld = calloc(1, sizeof(*ld));
1485
1486 ld->is_uring_cmd_eng = (td->io_ops->prep == fio_ioring_cmd_prep);
1487
1488 /*
1489 * The internal io_uring queue depth must be a power-of-2, as that's
1490 * how the ring interface works. So round it up in case the user-set
1491 * iodepth isn't a power-of-2. Leave the fio depth the same, so as not
1492 * to drive a higher iodepth than requested if we did round up.
1493 */
1494 ld->iodepth = roundup_pow2(td->o.iodepth);
1495
1496 /* io_u index */
1497 ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
1498
1499 if (!ld->is_uring_cmd_eng && o->md_per_io_size) {
1500 if (o->apptag_mask != 0xffff) {
1501 log_err("fio: io_uring with metadata requires an apptag_mask of 0xffff\n");
1502 free(ld->io_u_index);
1503 free(ld);
1504 return 1;
1505 }
1506 }
1507
1508 /*
1509 * metadata buffer
1510 * We are only supporting iomem=malloc / mem=malloc as of now.
1511 */
1512 if (o->md_per_io_size && (!ld->is_uring_cmd_eng ||
1513 (ld->is_uring_cmd_eng && o->cmd_type == FIO_URING_CMD_NVME))) {
1514 md_size = (unsigned long long) o->md_per_io_size
1515 * (unsigned long long) td->o.iodepth;
1516 md_size += page_mask + td->o.mem_align;
1517 if (td->o.mem_align && td->o.mem_align > page_size)
1518 md_size += td->o.mem_align - page_size;
1519 ld->md_buf = malloc(md_size);
1520 if (!ld->md_buf) {
1521 free(ld->io_u_index);
1522 free(ld);
1523 return 1;
1524 }
1525
1526 if (!ld->is_uring_cmd_eng) {
1527 ld->pi_attr = calloc(ld->iodepth, sizeof(struct io_uring_attr_pi));
1528 if (!ld->pi_attr) {
1529 free(ld->io_u_index);
1530 free(ld->md_buf);
1531 free(ld);
1532 return 1;
1533 }
1534 }
1535
1536 }
1537 parse_prchk_flags(o);
1538 ext_opts = &ld->ext_opts;
1539 if (o->pi_act)
1540 ext_opts->io_flags |= NVME_IO_PRINFO_PRACT;
1541 ext_opts->io_flags |= o->prchk;
1542 ext_opts->apptag = o->apptag;
1543 ext_opts->apptag_mask = o->apptag_mask;
1544
1545 ld->iovecs = calloc(ld->iodepth, sizeof(struct iovec));
1546
1547 td->io_ops_data = ld;
1548
1549 ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
1550 if (ret) {
1551 td_verror(td, EINVAL, "fio_ioring_init");
1552 return 1;
1553 }
1554
1555 /*
1556 * For io_uring_cmd, trims are async operations unless we are operating
1557 * in zbd mode where trim means zone reset.
1558 */
1559 if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD &&
1560 ld->is_uring_cmd_eng) {
1561 td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
1562 } else {
1563 dsm_size = sizeof(*ld->dsm);
1564 dsm_size += td->o.num_range * sizeof(struct nvme_dsm_range);
1565 ld->dsm = calloc(td->o.iodepth, dsm_size);
1566 ptr = ld->dsm;
1567 for (i = 0; i < td->o.iodepth; i++) {
1568 dsm = (struct nvme_dsm *)ptr;
1569 dsm->nr_ranges = td->o.num_range;
1570 ptr += dsm_size;
1571 }
1572 }
1573
1574 if (ld->is_uring_cmd_eng)
1575 return fio_ioring_cmd_init(td, ld);
1576 return 0;
1577}
1578
1579static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
1580{
1581 struct ioring_data *ld = td->io_ops_data;
1582 struct ioring_options *o = td->eo;
1583 struct nvme_pi_data *pi_data;
1584 char *p, *q;
1585
1586 ld->io_u_index[io_u->index] = io_u;
1587
1588 p = PTR_ALIGN(ld->md_buf, page_mask) + td->o.mem_align;
1589 p += o->md_per_io_size * io_u->index;
1590 io_u->mmap_data = p;
1591
1592 if (ld->pi_attr) {
1593 struct io_uring_attr_pi *pi_attr;
1594
1595 q = ld->pi_attr;
1596 q += (sizeof(struct io_uring_attr_pi) * io_u->index);
1597 io_u->pi_attr = q;
1598
1599 pi_attr = io_u->pi_attr;
1600 pi_attr->len = o->md_per_io_size;
1601 pi_attr->app_tag = o->apptag;
1602 pi_attr->flags = 0;
1603 if (strstr(o->pi_chk, "GUARD") != NULL)
1604 pi_attr->flags |= IO_INTEGRITY_CHK_GUARD;
1605 if (strstr(o->pi_chk, "REFTAG") != NULL)
1606 pi_attr->flags |= IO_INTEGRITY_CHK_REFTAG;
1607 if (strstr(o->pi_chk, "APPTAG") != NULL)
1608 pi_attr->flags |= IO_INTEGRITY_CHK_APPTAG;
1609 }
1610
1611 if (!o->pi_act) {
1612 pi_data = calloc(1, sizeof(*pi_data));
1613 pi_data->io_flags |= o->prchk;
1614 pi_data->apptag_mask = o->apptag_mask;
1615 pi_data->apptag = o->apptag;
1616 io_u->engine_data = pi_data;
1617 }
1618
1619 return 0;
1620}
1621
1622static void fio_ioring_io_u_free(struct thread_data *td, struct io_u *io_u)
1623{
1624 struct nvme_pi *pi = io_u->engine_data;
1625
1626 free(pi);
1627 io_u->engine_data = NULL;
1628}
1629
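/*
 * Query the block layer's logical block metadata capabilities via
 * FS_IOC_GETLBMD_CAP and translate them into the nvme_data fields used
 * by the PI helpers (LBA size, metadata size, PI type and guard type).
 */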
1630static int fio_get_pi_info(struct fio_file *f, struct nvme_data *data)
1631{
1632 struct logical_block_metadata_cap md_cap;
1633 int ret;
1634 int fd, err = 0;
1635
1636 fd = open(f->file_name, O_RDONLY);
1637 if (fd < 0)
1638 return -errno;
1639
1640 ret = ioctl(fd, FS_IOC_GETLBMD_CAP, &md_cap);
1641 if (ret < 0) {
1642 err = -errno;
1643 log_err("%s: failed to query protection information capabilities; error %d\n", f->file_name, errno);
1644 goto out;
1645 }
1646
1647 if (!(md_cap.lbmd_flags & LBMD_PI_CAP_INTEGRITY)) {
1648 log_err("%s: Protection information not supported\n", f->file_name);
1649 err = -ENOTSUP;
1650 goto out;
1651 }
1652
1653 /* Currently we don't support storage tags */
1654 if (md_cap.lbmd_storage_tag_size) {
1655 log_err("%s: Storage tag not supported\n", f->file_name);
1656 err = -ENOTSUP;
1657 goto out;
1658 }
1659
1660 data->lba_size = md_cap.lbmd_interval;
1661 data->lba_shift = ilog2(data->lba_size);
1662 data->ms = md_cap.lbmd_size;
1663 data->pi_size = md_cap.lbmd_pi_size;
1664 data->pi_loc = !(md_cap.lbmd_pi_offset);
1665
1666 /* Assume Type 1 PI if reference tags supported */
1667 if (md_cap.lbmd_flags & LBMD_PI_CAP_REFTAG)
1668 data->pi_type = NVME_NS_DPS_PI_TYPE1;
1669 else
1670 data->pi_type = NVME_NS_DPS_PI_TYPE3;
1671
1672 switch (md_cap.lbmd_guard_tag_type) {
1673 case LBMD_PI_CSUM_CRC16_T10DIF:
1674 data->guard_type = NVME_NVM_NS_16B_GUARD;
1675 break;
1676 case LBMD_PI_CSUM_CRC64_NVME:
1677 data->guard_type = NVME_NVM_NS_64B_GUARD;
1678 break;
1679 default:
1680 log_err("%s: unsupported checksum type %d\n", f->file_name,
1681 md_cap.lbmd_guard_tag_type);
1682 err = -ENOTSUP;
1683 goto out;
1684 }
1685
1686out:
1687 close(fd);
1688 return err;
1689}
1690
1691static inline int fio_ioring_open_file_md(struct thread_data *td, struct fio_file *f)
1692{
1693 int ret = 0;
1694 struct nvme_data *data = NULL;
1695
1696 data = FILE_ENG_DATA(f);
1697 if (data == NULL) {
1698 data = calloc(1, sizeof(struct nvme_data));
1699 ret = fio_get_pi_info(f, data);
1700 if (ret) {
1701 free(data);
1702 return ret;
1703 }
1704
1705 FILE_SET_ENG_DATA(f, data);
1706 }
1707
1708 return ret;
1709}
1710
1711static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
1712{
1713 struct ioring_data *ld = td->io_ops_data;
1714 struct ioring_options *o = td->eo;
1715
1716 if (o->md_per_io_size) {
1717 /*
1718 * This will be a no-op when called by the io_uring_cmd
1719 * ioengine because engine data has already been collected by
1720 * the time this call is made
1721 */
1722 int ret = fio_ioring_open_file_md(td, f);
1723 if (ret)
1724 return ret;
1725 }
1726
1727 if (!ld || !o->registerfiles)
1728 return generic_open_file(td, f);
1729
1730 f->fd = ld->fds[f->engine_pos];
1731 return 0;
1732}
1733
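/*
 * Sanity-check job parameters against the device format: block sizes
 * must be a multiple of the (extended) LBA size, and md_per_io_size
 * must be large enough to hold the metadata for the largest block size.
 */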
1734static int verify_params(struct thread_data *td, struct nvme_data *data,
1735 struct fio_file *f, enum fio_ddir ddir)
1736{
1737 struct ioring_options *o = td->eo;
1738 unsigned int lba_size;
1739
1740 lba_size = data->lba_ext ? data->lba_ext : data->lba_size;
1741 if (td->o.min_bs[ddir] % lba_size || td->o.max_bs[ddir] % lba_size) {
1742 if (data->lba_ext) {
1743 log_err("%s: block size must be a multiple of %u "
1744 "(LBA data size + Metadata size)\n", f->file_name, lba_size);
1745 if (td->o.min_bs[ddir] == td->o.max_bs[ddir] &&
1746 !(td->o.min_bs[ddir] % data->lba_size)) {
1747 /* fixed block size is actually a multiple of LBA data size */
1748 unsigned long long suggestion = lba_size *
1749 (td->o.min_bs[ddir] / data->lba_size);
1750 log_err("Did you mean to use a block size of %llu?\n", suggestion);
1751 }
1752 } else {
1753 log_err("%s: block size must be a multiple of LBA data size\n",
1754 f->file_name);
1755 }
1756 td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
1757 return 1;
1758 }
1759 if (data->ms && !data->lba_ext && ddir != DDIR_TRIM &&
1760 (o->md_per_io_size < ((td->o.max_bs[ddir] / data->lba_size) * data->ms))) {
1761 log_err("%s: md_per_io_size should be at least %llu bytes\n",
1762 f->file_name,
1763 ((td->o.max_bs[ddir] / data->lba_size) * data->ms));
1764 td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
1765 return 1;
1766 }
1767
1768 return 0;
1769}
1770
1771static int fio_ioring_open_nvme(struct thread_data *td, struct fio_file *f)
1772{
1773 struct ioring_options *o = td->eo;
1774 struct nvme_data *data = NULL;
1775 __u64 nlba = 0;
1776 int ret;
1777
1778 /* Store the namespace-id and lba size. */
1779 data = FILE_ENG_DATA(f);
1780 if (data == NULL) {
1781 data = calloc(1, sizeof(struct nvme_data));
1782 ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
1783 if (ret) {
1784 free(data);
1785 return ret;
1786 }
1787
1788 FILE_SET_ENG_DATA(f, data);
1789 }
1790
1791 for_each_rw_ddir(ddir) {
1792 ret = verify_params(td, data, f, ddir);
1793 if (ret)
1794 return ret;
1795 }
1796
1797 /*
1798 * For extended logical block sizes we cannot use verify when
1799 * end-to-end data protection checks are enabled, as the PI
1800 * section of the data buffer conflicts with verify.
1801 */
1802 if (data->ms && data->pi_type && data->lba_ext &&
1803 td->o.verify != VERIFY_NONE) {
1804 log_err("%s: for extended LBA, verify cannot be used when E2E "
1805 "data protection is enabled\n", f->file_name);
1806 td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
1807 return 1;
1808 }
1809
1810 if (o->write_mode != FIO_URING_CMD_WMODE_WRITE && !td_write(td)) {
1811 log_err("%s: 'readwrite=|rw=' has no write\n", f->file_name);
1812 td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
1813 return 1;
1814 }
1815
1816 return 0;
1817}
1818
1819static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
1820{
1821 struct ioring_options *o = td->eo;
1822
1823 if (o->cmd_type == FIO_URING_CMD_NVME) {
1824 int ret;
1825
1826 ret = fio_ioring_open_nvme(td, f);
1827 if (ret)
1828 return ret;
1829 }
1830
1831 return fio_ioring_open_file(td, f);
1832}
1833
1834static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
1835{
1836 struct ioring_data *ld = td->io_ops_data;
1837 struct ioring_options *o = td->eo;
1838
1839 if (!ld || !o->registerfiles)
1840 return generic_close_file(td, f);
1841
1842 f->fd = -1;
1843 return 0;
1844}
1845
1846static int fio_ioring_cmd_close_file(struct thread_data *td,
1847 struct fio_file *f)
1848{
1849 struct ioring_options *o = td->eo;
1850
1851 if (o->cmd_type == FIO_URING_CMD_NVME) {
1852 struct nvme_data *data = FILE_ENG_DATA(f);
1853
1854 FILE_SET_ENG_DATA(f, NULL);
1855 free(data);
1856 }
1857
1858 return fio_ioring_close_file(td, f);
1859}
1860
1861static int fio_ioring_cmd_get_file_size(struct thread_data *td,
1862 struct fio_file *f)
1863{
1864 struct ioring_options *o = td->eo;
1865
1866 if (fio_file_size_known(f))
1867 return 0;
1868
1869 if (o->cmd_type == FIO_URING_CMD_NVME) {
1870 struct nvme_data *data = NULL;
1871 __u64 nlba = 0;
1872 int ret;
1873
1874 data = calloc(1, sizeof(struct nvme_data));
1875 ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
1876 if (ret) {
1877 free(data);
1878 return ret;
1879 }
1880
1881 if (data->lba_ext)
1882 f->real_file_size = data->lba_ext * nlba;
1883 else
1884 f->real_file_size = data->lba_size * nlba;
1885 fio_file_set_size_known(f);
1886
1887 FILE_SET_ENG_DATA(f, data);
1888 return 0;
1889 }
1890 return generic_get_file_size(td, f);
1891}
1892
1893static int fio_ioring_cmd_get_zoned_model(struct thread_data *td,
1894 struct fio_file *f,
1895 enum zbd_zoned_model *model)
1896{
1897 return fio_nvme_get_zoned_model(td, f, model);
1898}
1899
1900static int fio_ioring_cmd_report_zones(struct thread_data *td,
1901 struct fio_file *f, uint64_t offset,
1902 struct zbd_zone *zbdz,
1903 unsigned int nr_zones)
1904{
1905 return fio_nvme_report_zones(td, f, offset, zbdz, nr_zones);
1906}
1907
1908static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f,
1909 uint64_t offset, uint64_t length)
1910{
1911 return fio_nvme_reset_wp(td, f, offset, length);
1912}
1913
1914static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
1915 struct fio_file *f,
1916 unsigned int *max_open_zones)
1917{
1918 return fio_nvme_get_max_open_zones(td, f, max_open_zones);
1919}
1920
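/*
 * Fetch the FDP reclaim unit handle status for this file and copy the
 * placement identifiers into fruhs_info for fio's FDP support.
 */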
1921static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1922 struct fio_ruhs_info *fruhs_info)
1923{
1924 struct nvme_fdp_ruh_status *ruhs;
1925 int bytes, nr_ruhs, ret, i;
1926
1927 nr_ruhs = fruhs_info->nr_ruhs;
1928 bytes = sizeof(*ruhs) + fruhs_info->nr_ruhs * sizeof(struct nvme_fdp_ruh_status_desc);
1929
1930 ruhs = calloc(1, bytes);
1931 if (!ruhs)
1932 return -ENOMEM;
1933
1934 ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes);
1935 if (ret)
1936 goto free;
1937
1938 fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd);
1939 for (i = 0; i < nr_ruhs; i++)
1940 fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
1941free:
1942 free(ruhs);
1943 return ret;
1944}
1945
1946static struct ioengine_ops ioengine_uring = {
1947 .name = "io_uring",
1948 .version = FIO_IOOPS_VERSION,
1949 .flags = FIO_NO_OFFLOAD | FIO_ASYNCIO_SETS_ISSUE_TIME |
1950 FIO_ATOMICWRITES,
1951 .init = fio_ioring_init,
1952 .post_init = fio_ioring_post_init,
1953 .io_u_init = fio_ioring_io_u_init,
1954 .io_u_free = fio_ioring_io_u_free,
1955 .prep = fio_ioring_prep,
1956 .queue = fio_ioring_queue,
1957 .commit = fio_ioring_commit,
1958 .getevents = fio_ioring_getevents,
1959 .event = fio_ioring_event,
1960 .cleanup = fio_ioring_cleanup,
1961 .open_file = fio_ioring_open_file,
1962 .close_file = fio_ioring_close_file,
1963 .get_file_size = generic_get_file_size,
1964 .options = options,
1965 .option_struct_size = sizeof(struct ioring_options),
1966};
1967
1968static struct ioengine_ops ioengine_uring_cmd = {
1969 .name = "io_uring_cmd",
1970 .version = FIO_IOOPS_VERSION,
1971 .flags = FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO |
1972 FIO_ASYNCIO_SETS_ISSUE_TIME |
1973 FIO_MULTI_RANGE_TRIM,
1974 .init = fio_ioring_init,
1975 .post_init = fio_ioring_cmd_post_init,
1976 .io_u_init = fio_ioring_io_u_init,
1977 .io_u_free = fio_ioring_io_u_free,
1978 .prep = fio_ioring_cmd_prep,
1979 .queue = fio_ioring_queue,
1980 .commit = fio_ioring_commit,
1981 .getevents = fio_ioring_getevents,
1982 .event = fio_ioring_cmd_event,
1983 .errdetails = fio_ioring_cmd_errdetails,
1984 .cleanup = fio_ioring_cleanup,
1985 .open_file = fio_ioring_cmd_open_file,
1986 .close_file = fio_ioring_cmd_close_file,
1987 .get_file_size = fio_ioring_cmd_get_file_size,
1988 .get_zoned_model = fio_ioring_cmd_get_zoned_model,
1989 .report_zones = fio_ioring_cmd_report_zones,
1990 .reset_wp = fio_ioring_cmd_reset_wp,
1991 .get_max_open_zones = fio_ioring_cmd_get_max_open_zones,
1992 .options = options,
1993 .option_struct_size = sizeof(struct ioring_options),
1994 .fdp_fetch_ruhs = fio_ioring_cmd_fetch_ruhs,
1995};
1996
1997static void fio_init fio_ioring_register(void)
1998{
1999 register_ioengine(&ioengine_uring);
2000 register_ioengine(&ioengine_uring_cmd);
2001}
2002
2003static void fio_exit fio_ioring_unregister(void)
2004{
2005 unregister_ioengine(&ioengine_uring);
2006 unregister_ioengine(&ioengine_uring_cmd);
2007}
2008#endif