/*
 * io_uring engine
 *
 * IO engine using the new native Linux aio io_uring interface. See:
 *
 * http://git.kernel.dk/cgit/linux-block/log/?h=io_uring
 *
 */
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/time.h>
#include <sys/resource.h>

#include "../fio.h"
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
#include "../lib/fls.h"
#include "../lib/roundup.h"
#include "../verify.h"

#ifdef ARCH_HAVE_IOURING

#include "../lib/types.h"
#include "../os/linux/io_uring.h"
#include "cmdprio.h"
#include "zbd.h"
#include "nvme.h"

#include <sys/stat.h>

#ifndef IO_INTEGRITY_CHK_GUARD
/* flags for integrity meta */
#define IO_INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
#define IO_INTEGRITY_CHK_REFTAG (1U << 1) /* enforce ref check */
#define IO_INTEGRITY_CHK_APPTAG (1U << 2) /* enforce app check */
#endif /* IO_INTEGRITY_CHK_GUARD */

#ifndef FS_IOC_GETLBMD_CAP
/* Protection info capability flags */
#define LBMD_PI_CAP_INTEGRITY (1 << 0)
#define LBMD_PI_CAP_REFTAG (1 << 1)

/* Checksum types for Protection Information */
#define LBMD_PI_CSUM_NONE 0
#define LBMD_PI_CSUM_IP 1
#define LBMD_PI_CSUM_CRC16_T10DIF 2
#define LBMD_PI_CSUM_CRC64_NVME 4

/*
 * Logical block metadata capability descriptor
 * If the device does not support metadata, all the fields will be zero.
 * Applications must check lbmd_flags to determine whether metadata is
 * supported or not.
 */
struct logical_block_metadata_cap {
	/* Bitmask of logical block metadata capability flags */
	__u32 lbmd_flags;
	/*
	 * The amount of data described by each unit of logical block
	 * metadata
	 */
	__u16 lbmd_interval;
	/*
	 * Size in bytes of the logical block metadata associated with each
	 * interval
	 */
	__u8 lbmd_size;
	/*
	 * Size in bytes of the opaque block tag associated with each
	 * interval
	 */
	__u8 lbmd_opaque_size;
	/*
	 * Offset in bytes of the opaque block tag within the logical block
	 * metadata
	 */
	__u8 lbmd_opaque_offset;
	/* Size in bytes of the T10 PI tuple associated with each interval */
	__u8 lbmd_pi_size;
	/* Offset in bytes of T10 PI tuple within the logical block metadata */
	__u8 lbmd_pi_offset;
	/* T10 PI guard tag type */
	__u8 lbmd_guard_tag_type;
	/* Size in bytes of the T10 PI application tag */
	__u8 lbmd_app_tag_size;
	/* Size in bytes of the T10 PI reference tag */
	__u8 lbmd_ref_tag_size;
	/* Size in bytes of the T10 PI storage tag */
	__u8 lbmd_storage_tag_size;
	__u8 pad;
};

#define FS_IOC_GETLBMD_CAP _IOWR(0x15, 2, struct logical_block_metadata_cap)
#endif /* FS_IOC_GETLBMD_CAP */

enum uring_cmd_type {
	FIO_URING_CMD_NVME = 1,
};

enum uring_cmd_write_mode {
	FIO_URING_CMD_WMODE_WRITE = 1,
	FIO_URING_CMD_WMODE_UNCOR,
	FIO_URING_CMD_WMODE_ZEROES,
	FIO_URING_CMD_WMODE_VERIFY,
};

enum uring_cmd_verify_mode {
	FIO_URING_CMD_VMODE_READ = 1,
	FIO_URING_CMD_VMODE_COMPARE,
};

struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

struct ioring_mmap {
	void *ptr;
	size_t len;
};

struct ioring_data {
	int ring_fd;

	struct io_u **io_u_index;
	char *md_buf;
	char *pi_attr;

	int *fds;

	struct io_sq_ring sq_ring;
	struct io_uring_sqe *sqes;
	struct iovec *iovecs;
	unsigned sq_ring_mask;

	struct io_cq_ring cq_ring;
	unsigned cq_ring_mask;

	int async_trim_fail;
	int queued;
	int cq_ring_off;
	unsigned iodepth;
	int prepped;

	struct ioring_mmap mmap[3];

	struct cmdprio cmdprio;

	struct nvme_dsm *dsm;
	uint32_t cdw12_flags[DDIR_RWDIR_CNT];
	uint8_t write_opcode;

	bool is_uring_cmd_eng;

	struct nvme_cmd_ext_io_opts ext_opts;
};

struct ioring_options {
	struct thread_data *td;
	unsigned int hipri;
	unsigned int readfua;
	unsigned int writefua;
	unsigned int deac;
	unsigned int write_mode;
	unsigned int verify_mode;
	struct cmdprio_options cmdprio_options;
	unsigned int fixedbufs;
	unsigned int registerfiles;
	unsigned int sqpoll_thread;
	unsigned int sqpoll_set;
	unsigned int sqpoll_cpu;
	unsigned int nonvectored;
	unsigned int uncached;
	unsigned int nowait;
	unsigned int force_async;
	unsigned int md_per_io_size;
	unsigned int pi_act;
	unsigned int apptag;
	unsigned int apptag_mask;
	unsigned int prchk;
	char *pi_chk;
	enum uring_cmd_type cmd_type;
};

static const int ddir_to_op[2][2] = {
	{ IORING_OP_READV, IORING_OP_READ },
	{ IORING_OP_WRITEV, IORING_OP_WRITE }
};

static const int fixed_ddir_to_op[2] = {
	IORING_OP_READ_FIXED,
	IORING_OP_WRITE_FIXED
};

static int fio_ioring_sqpoll_cb(void *data, unsigned long long *val)
{
	struct ioring_options *o = data;

	o->sqpoll_cpu = *val;
	o->sqpoll_set = 1;
	return 0;
}

static struct fio_option options[] = {
	{
		.name = "hipri",
		.lname = "High Priority",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct ioring_options, hipri),
		.help = "Use polled IO completions",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "readfua",
		.lname = "Read fua flag support",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct ioring_options, readfua),
		.help = "Set FUA flag (force unit access) for all Read operations",
		.def = "0",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "writefua",
		.lname = "Write fua flag support",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct ioring_options, writefua),
		.help = "Set FUA flag (force unit access) for all Write operations",
		.def = "0",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "write_mode",
		.lname = "Additional Write commands support (Write Uncorrectable, Write Zeroes)",
		.type = FIO_OPT_STR,
		.off1 = offsetof(struct ioring_options, write_mode),
		.help = "Issue Write Uncorrectable or Zeroes command instead of Write command",
		.def = "write",
		.posval = {
			{ .ival = "write",
			  .oval = FIO_URING_CMD_WMODE_WRITE,
			  .help = "Issue Write commands for write operations"
			},
			{ .ival = "uncor",
			  .oval = FIO_URING_CMD_WMODE_UNCOR,
			  .help = "Issue Write Uncorrectable commands for write operations"
			},
			{ .ival = "zeroes",
			  .oval = FIO_URING_CMD_WMODE_ZEROES,
			  .help = "Issue Write Zeroes commands for write operations"
			},
			{ .ival = "verify",
			  .oval = FIO_URING_CMD_WMODE_VERIFY,
			  .help = "Issue Verify commands for write operations"
			},
		},
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "verify_mode",
		.lname = "Do verify based on the configured command (e.g., Read or Compare command)",
		.type = FIO_OPT_STR,
		.off1 = offsetof(struct ioring_options, verify_mode),
		.help = "Issue Read or Compare command in the verification phase",
		.def = "read",
		.posval = {
			{ .ival = "read",
			  .oval = FIO_URING_CMD_VMODE_READ,
			  .help = "Issue Read commands in the verification phase"
			},
			{ .ival = "compare",
			  .oval = FIO_URING_CMD_VMODE_COMPARE,
			  .help = "Issue Compare commands in the verification phase"
			},
		},
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "fixedbufs",
		.lname = "Fixed (pre-mapped) IO buffers",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct ioring_options, fixedbufs),
		.help = "Pre map IO buffers",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "registerfiles",
		.lname = "Register file set",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct ioring_options, registerfiles),
		.help = "Pre-open/register files",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "sqthread_poll",
		.lname = "Kernel SQ thread polling",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct ioring_options, sqpoll_thread),
		.help = "Offload submission/completion to kernel thread",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "sqthread_poll_cpu",
		.lname = "SQ Thread Poll CPU",
		.type = FIO_OPT_INT,
		.cb = fio_ioring_sqpoll_cb,
		.help = "What CPU to run SQ thread polling on",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "nonvectored",
		.lname = "Non-vectored",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, nonvectored),
		.def = "-1",
		.help = "Use non-vectored read/write commands",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "uncached",
		.lname = "Uncached",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, uncached),
		.help = "Use RWF_DONTCACHE for buffered read/writes",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "nowait",
		.lname = "RWF_NOWAIT",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct ioring_options, nowait),
		.help = "Use RWF_NOWAIT for reads/writes",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "force_async",
		.lname = "Force async",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, force_async),
		.help = "Set IOSQE_ASYNC every N requests",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "cmd_type",
		.lname = "Uring cmd type",
		.type = FIO_OPT_STR,
		.off1 = offsetof(struct ioring_options, cmd_type),
		.help = "Specify uring-cmd type",
		.def = "nvme",
		.posval = {
			{ .ival = "nvme",
			  .oval = FIO_URING_CMD_NVME,
			  .help = "Issue nvme-uring-cmd",
			},
		},
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	CMDPRIO_OPTIONS(struct ioring_options, FIO_OPT_G_IOURING),
	{
		.name = "md_per_io_size",
		.lname = "Separate Metadata Buffer Size per I/O",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, md_per_io_size),
		.def = "0",
		.help = "Size of separate metadata buffer per I/O (Default: 0)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "pi_act",
		.lname = "Protection Information Action",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct ioring_options, pi_act),
		.def = "1",
		.help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "pi_chk",
		.lname = "Protection Information Check",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct ioring_options, pi_chk),
		.def = NULL,
		.help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "apptag",
		.lname = "Application Tag used in Protection Information",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, apptag),
		.def = "0x1234",
		.help = "Application Tag used in Protection Information field (Default: 0x1234)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "apptag_mask",
		.lname = "Application Tag Mask",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct ioring_options, apptag_mask),
		.def = "0xffff",
		.help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = "deac",
		.lname = "Deallocate bit for write zeroes command",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct ioring_options, deac),
		.help = "Set DEAC (deallocate) flag for write zeroes command",
		.def = "0",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_IOURING,
	},
	{
		.name = NULL,
	},
};

static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
			  unsigned int min_complete, unsigned int flags)
{
#ifdef FIO_ARCH_HAS_SYSCALL
	return __do_syscall6(__NR_io_uring_enter, ld->ring_fd, to_submit,
				min_complete, flags, NULL, 0);
#else
	return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
			min_complete, flags, NULL, 0);
#endif
}

#ifndef BLOCK_URING_CMD_DISCARD
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
#endif

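/*
 * Hook the per-io metadata buffer up to the SQE through the PI attribute,
 * and seed the reference tag with the starting LBA when reftag checking
 * is enabled.
 */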
static void fio_ioring_prep_md(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_uring_attr_pi *pi_attr = io_u->pi_attr;
	struct nvme_data *data = FILE_ENG_DATA(io_u->file);
	struct io_uring_sqe *sqe;

	sqe = &ld->sqes[io_u->index];

	sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;
	sqe->attr_ptr = (__u64)(uintptr_t)pi_attr;
	pi_attr->addr = (__u64)(uintptr_t)io_u->mmap_data;

	if (pi_attr->flags & IO_INTEGRITY_CHK_REFTAG) {
		__u64 slba = get_slba(data, io_u->offset);
		pi_attr->seed = (__u32)slba;
	}
}

static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct fio_file *f = io_u->file;
	struct io_uring_sqe *sqe;

	sqe = &ld->sqes[io_u->index];

	if (o->registerfiles) {
		sqe->fd = f->engine_pos;
		sqe->flags = IOSQE_FIXED_FILE;
	} else {
		sqe->fd = f->fd;
		sqe->flags = 0;
	}

	if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
		if (o->fixedbufs) {
			sqe->opcode = fixed_ddir_to_op[io_u->ddir];
			sqe->addr = (unsigned long) io_u->xfer_buf;
			sqe->len = io_u->xfer_buflen;
			sqe->buf_index = io_u->index;
		} else {
			struct iovec *iov = &ld->iovecs[io_u->index];

			/*
			 * Update based on actual io_u, requeue could have
			 * adjusted these
			 */
			iov->iov_base = io_u->xfer_buf;
			iov->iov_len = io_u->xfer_buflen;

			sqe->opcode = ddir_to_op[io_u->ddir][!!o->nonvectored];
			if (o->nonvectored) {
				sqe->addr = (unsigned long) iov->iov_base;
				sqe->len = iov->iov_len;
			} else {
				sqe->addr = (unsigned long) iov;
				sqe->len = 1;
			}
		}
		if (o->md_per_io_size)
			fio_ioring_prep_md(td, io_u);
		sqe->rw_flags = 0;
		if (!td->o.odirect && o->uncached)
			sqe->rw_flags |= RWF_DONTCACHE;
		if (o->nowait)
			sqe->rw_flags |= RWF_NOWAIT;
		if (td->o.oatomic && io_u->ddir == DDIR_WRITE)
			sqe->rw_flags |= RWF_ATOMIC;

		/*
		 * Since io_uring can have a submission context (sqthread_poll)
		 * that is different from the process context, we cannot rely on
		 * the IO priority set by ioprio_set() (options prio, prioclass,
		 * and priohint) to be inherited.
		 * td->ioprio will have the value of the "default prio", so set
		 * this unconditionally. This value might get overridden by
		 * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or
		 * cmdprio_bssplit is used.
		 */
		sqe->ioprio = td->ioprio;
		sqe->off = io_u->offset;
	} else if (ddir_sync(io_u->ddir)) {
		sqe->ioprio = 0;
		if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
			sqe->off = f->first_write;
			sqe->len = f->last_write - f->first_write;
			sqe->sync_range_flags = td->o.sync_file_range;
			sqe->opcode = IORING_OP_SYNC_FILE_RANGE;
		} else {
			sqe->off = 0;
			sqe->addr = 0;
			sqe->len = 0;
			if (io_u->ddir == DDIR_DATASYNC)
				sqe->fsync_flags |= IORING_FSYNC_DATASYNC;
			sqe->opcode = IORING_OP_FSYNC;
		}
	} else if (io_u->ddir == DDIR_TRIM) {
		sqe->opcode = IORING_OP_URING_CMD;
		sqe->addr = io_u->offset;
		sqe->addr3 = io_u->xfer_buflen;
		sqe->rw_flags = 0;
		sqe->len = sqe->off = 0;
		sqe->ioprio = 0;
		sqe->cmd_op = BLOCK_URING_CMD_DISCARD;
		sqe->__pad1 = 0;
		sqe->file_index = 0;
	}

	if (o->force_async && ++ld->prepped == o->force_async) {
		ld->prepped = 0;
		sqe->flags |= IOSQE_ASYNC;
	}

	sqe->user_data = (unsigned long) io_u;
	return 0;
}

static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct fio_file *f = io_u->file;
	struct nvme_uring_cmd *cmd;
	struct io_uring_sqe *sqe;
	struct nvme_dsm *dsm;
	void *ptr = ld->dsm;
	unsigned int dsm_size;
	uint8_t read_opcode = nvme_cmd_read;

	/* only supports nvme_uring_cmd */
	if (o->cmd_type != FIO_URING_CMD_NVME)
		return -EINVAL;

	if (io_u->ddir == DDIR_TRIM && td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)
		return 0;

	sqe = &ld->sqes[(io_u->index) << 1];

	if (o->registerfiles) {
		sqe->fd = f->engine_pos;
		sqe->flags = IOSQE_FIXED_FILE;
	} else {
		sqe->fd = f->fd;
	}
	sqe->rw_flags = 0;
	if (!td->o.odirect && o->uncached)
		sqe->rw_flags |= RWF_DONTCACHE;
	if (o->nowait)
		sqe->rw_flags |= RWF_NOWAIT;

	sqe->opcode = IORING_OP_URING_CMD;
	sqe->user_data = (unsigned long) io_u;
	if (o->nonvectored)
		sqe->cmd_op = NVME_URING_CMD_IO;
	else
		sqe->cmd_op = NVME_URING_CMD_IO_VEC;
	if (o->force_async && ++ld->prepped == o->force_async) {
		ld->prepped = 0;
		sqe->flags |= IOSQE_ASYNC;
	}
	if (o->fixedbufs) {
		sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
		sqe->buf_index = io_u->index;
	}

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	dsm_size = sizeof(*ld->dsm) + td->o.num_range * sizeof(struct nvme_dsm_range);
	ptr += io_u->index * dsm_size;
	dsm = (struct nvme_dsm *)ptr;

	/*
	 * If READ command belongs to the verification phase and the
	 * verify_mode=compare, convert READ to COMPARE command.
	 */
	if (io_u->flags & IO_U_F_VER_LIST && io_u->ddir == DDIR_READ &&
	    o->verify_mode == FIO_URING_CMD_VMODE_COMPARE) {
		populate_verify_io_u(td, io_u);
		read_opcode = nvme_cmd_compare;
		io_u_set(td, io_u, IO_U_F_VER_IN_DEV);
	}

	return fio_nvme_uring_cmd_prep(cmd, io_u,
			o->nonvectored ? NULL : &ld->iovecs[io_u->index],
			dsm, read_opcode, ld->write_opcode,
			ld->cdw12_flags[io_u->ddir]);
}

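/* Verify protection information of a completed read in software when pi_act=0 */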
static void fio_ioring_validate_md(struct thread_data *td, struct io_u *io_u)
{
	struct nvme_data *data;
	struct ioring_options *o = td->eo;
	int ret;

	data = FILE_ENG_DATA(io_u->file);
	if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) {
		ret = fio_nvme_pi_verify(data, io_u);
		if (ret)
			io_u->error = -ret;
	}

	return;
}

static struct io_u *fio_ioring_event(struct thread_data *td, int event)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_uring_cqe *cqe;
	struct io_u *io_u;
	unsigned index;

	index = (event + ld->cq_ring_off) & ld->cq_ring_mask;

	cqe = &ld->cq_ring.cqes[index];
	io_u = (struct io_u *) (uintptr_t) cqe->user_data;

	/* trim returns 0 on success */
	if (cqe->res == io_u->xfer_buflen ||
	    (io_u->ddir == DDIR_TRIM && !cqe->res)) {
		io_u->error = 0;
		return io_u;
	}

	if (cqe->res != io_u->xfer_buflen) {
		if (io_u->ddir == DDIR_TRIM) {
			ld->async_trim_fail = 1;
			cqe->res = 0;
		}
		if (cqe->res > io_u->xfer_buflen)
			io_u->error = -cqe->res;
		else
			io_u->resid = io_u->xfer_buflen - cqe->res;

		return io_u;
	}

	if (o->md_per_io_size)
		fio_ioring_validate_md(td, io_u);

	return io_u;
}

static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_uring_cqe *cqe;
	struct io_u *io_u;
	struct nvme_data *data;
	unsigned index;
	int ret;

	index = (event + ld->cq_ring_off) & ld->cq_ring_mask;
	if (o->cmd_type == FIO_URING_CMD_NVME)
		index <<= 1;

	cqe = &ld->cq_ring.cqes[index];
	io_u = (struct io_u *) (uintptr_t) cqe->user_data;

	io_u->error = cqe->res;
	if (io_u->error != 0)
		goto ret;

	if (o->cmd_type == FIO_URING_CMD_NVME) {
		data = FILE_ENG_DATA(io_u->file);
		if (data->pi_type && (io_u->ddir == DDIR_READ) && !o->pi_act) {
			ret = fio_nvme_pi_verify(data, io_u);
			if (ret)
				io_u->error = ret;
		}
	}

ret:
	/*
	 * If IO_U_F_DEVICE_ERROR is not set, io_u->error will be parsed as an
	 * errno, otherwise device-specific error value (status value in CQE).
	 */
	if ((int)io_u->error > 0)
		io_u_set(td, io_u, IO_U_F_DEVICE_ERROR);
	else
		io_u_clear(td, io_u, IO_U_F_DEVICE_ERROR);
	io_u->error = abs((int)io_u->error);
	return io_u;
}

static char *fio_ioring_cmd_errdetails(struct thread_data *td,
				       struct io_u *io_u)
{
	struct ioring_options *o = td->eo;
	unsigned int sct = (io_u->error >> 8) & 0x7;
	unsigned int sc = io_u->error & 0xff;
#define MAXERRDETAIL 1024
#define MAXMSGCHUNK 128
	char *msg, msgchunk[MAXMSGCHUNK];

	if (!(io_u->flags & IO_U_F_DEVICE_ERROR))
		return NULL;

	msg = calloc(1, MAXERRDETAIL);
	strcpy(msg, "io_uring_cmd: ");

	snprintf(msgchunk, MAXMSGCHUNK, "%s: ", io_u->file->file_name);
	strlcat(msg, msgchunk, MAXERRDETAIL);

	if (o->cmd_type == FIO_URING_CMD_NVME) {
		strlcat(msg, "cq entry status (", MAXERRDETAIL);

		snprintf(msgchunk, MAXMSGCHUNK, "sct=0x%02x; ", sct);
		strlcat(msg, msgchunk, MAXERRDETAIL);

		snprintf(msgchunk, MAXMSGCHUNK, "sc=0x%02x)", sc);
		strlcat(msg, msgchunk, MAXERRDETAIL);
	} else {
		/* Print status code in generic */
		snprintf(msgchunk, MAXMSGCHUNK, "status=0x%x", io_u->error);
		strlcat(msg, msgchunk, MAXERRDETAIL);
	}

	return msg;
}

static unsigned fio_ioring_cqring_reap(struct thread_data *td, unsigned int max)
{
	struct ioring_data *ld = td->io_ops_data;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned head = *ring->head;
	unsigned available = atomic_load_acquire(ring->tail) - head;

	if (!available)
		return 0;

	available = min(available, max);
	/*
	 * The CQ consumer index is advanced before the CQEs are actually read.
	 * This is generally unsafe, as it lets the kernel reuse the CQE slots.
	 * However, the CQ is sized large enough for the maximum iodepth and a
	 * new SQE won't be submitted until the CQE is processed, so the CQE
	 * slot won't actually be reused until it has been processed.
	 */
	atomic_store_relaxed(ring->head, head + available);
	return available;
}

static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
				unsigned int max, const struct timespec *t)
{
	struct ioring_data *ld = td->io_ops_data;
	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
	struct ioring_options *o = td->eo;
	struct io_cq_ring *ring = &ld->cq_ring;
	unsigned events = 0;
	int r;

	ld->cq_ring_off = *ring->head;
	for (;;) {
		r = fio_ioring_cqring_reap(td, max - events);
		if (r) {
			events += r;
			if (events >= min)
				return events;

			if (actual_min != 0)
				actual_min -= r;
		}

		if (!o->sqpoll_thread) {
			r = io_uring_enter(ld, 0, actual_min,
						IORING_ENTER_GETEVENTS);
			if (r < 0) {
				if (errno == EAGAIN || errno == EINTR)
					continue;
				r = -errno;
				td_verror(td, errno, "io_uring_enter");
				return r;
			}
		}
	}
}

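/*
 * Fill the protection information fields of the NVMe uring command for
 * reads and writes; trims carry no PI.
 */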
static inline void fio_ioring_cmd_nvme_pi(struct thread_data *td,
					  struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct nvme_uring_cmd *cmd;
	struct io_uring_sqe *sqe;

	if (io_u->ddir == DDIR_TRIM)
		return;

	sqe = &ld->sqes[(io_u->index) << 1];
	cmd = (struct nvme_uring_cmd *)sqe->cmd;

	fio_nvme_pi_fill(cmd, io_u, &ld->ext_opts);
}

static inline void fio_ioring_setup_pi(struct thread_data *td,
				       struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;

	if (io_u->ddir == DDIR_TRIM)
		return;

	fio_nvme_generate_guard(io_u, &ld->ext_opts);
}

static inline void fio_ioring_cmdprio_prep(struct thread_data *td,
					   struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct cmdprio *cmdprio = &ld->cmdprio;

	if (fio_cmdprio_set_ioprio(td, cmdprio, io_u))
		ld->sqes[io_u->index].ioprio = io_u->ioprio;
}

static enum fio_q_status fio_ioring_queue(struct thread_data *td,
					  struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_sq_ring *ring = &ld->sq_ring;
	unsigned tail;

	fio_ro_check(td, io_u);

	/* should not hit... */
	if (ld->queued == td->o.iodepth)
		return FIO_Q_BUSY;

	/* if async trim has been tried and failed, punt to sync */
	if (io_u->ddir == DDIR_TRIM && ld->async_trim_fail) {
		if (ld->queued)
			return FIO_Q_BUSY;

		do_io_u_trim(td, io_u);

		io_u_mark_submit(td, 1);
		io_u_mark_complete(td, 1);
		return FIO_Q_COMPLETED;
	}

	if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
		fio_ioring_cmdprio_prep(td, io_u);

	if (o->cmd_type == FIO_URING_CMD_NVME && ld->is_uring_cmd_eng)
		fio_ioring_cmd_nvme_pi(td, io_u);
	else if (o->md_per_io_size)
		fio_ioring_setup_pi(td, io_u);

	tail = *ring->tail;
	ring->array[tail & ld->sq_ring_mask] = io_u->index;
	atomic_store_release(ring->tail, tail + 1);

	ld->queued++;
	return FIO_Q_QUEUED;
}

static void fio_ioring_queued(struct thread_data *td, int start, int nr)
{
	struct ioring_data *ld = td->io_ops_data;
	struct timespec now;

	if (!fio_fill_issue_time(td))
		return;

	fio_gettime(&now, NULL);

	while (nr--) {
		struct io_sq_ring *ring = &ld->sq_ring;
		int index = ring->array[start & ld->sq_ring_mask];
		struct io_u *io_u = ld->io_u_index[index];

		memcpy(&io_u->issue_time, &now, sizeof(now));
		io_u_queued(td, io_u);

		start++;
	}

	/*
	 * only used for iolog
	 */
	if (td->o.read_iolog_file)
		memcpy(&td->last_issue, &now, sizeof(now));
}

static int fio_ioring_commit(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int ret;

	if (!ld->queued)
		return 0;

	/*
	 * Kernel side does submission. just need to check if the ring is
	 * flagged as needing a kick, if so, call io_uring_enter(). This
	 * only happens if we've been idle too long.
	 */
	if (o->sqpoll_thread) {
		struct io_sq_ring *ring = &ld->sq_ring;
		unsigned start = *ld->sq_ring.tail - ld->queued;
		unsigned flags;

		flags = atomic_load_relaxed(ring->flags);
		if (flags & IORING_SQ_NEED_WAKEUP)
			io_uring_enter(ld, ld->queued, 0,
					IORING_ENTER_SQ_WAKEUP);
		fio_ioring_queued(td, start, ld->queued);
		io_u_mark_submit(td, ld->queued);

		ld->queued = 0;
		return 0;
	}

	do {
		unsigned start = *ld->sq_ring.head;
		long nr = ld->queued;

		ret = io_uring_enter(ld, nr, 0, IORING_ENTER_GETEVENTS);
		if (ret > 0) {
			fio_ioring_queued(td, start, ret);
			io_u_mark_submit(td, ret);

			ld->queued -= ret;
			ret = 0;
		} else if (!ret) {
			io_u_mark_submit(td, ret);
			continue;
		} else {
			if (errno == EAGAIN || errno == EINTR) {
				ret = fio_ioring_cqring_reap(td, ld->queued);
				if (ret)
					continue;
				/* Shouldn't happen */
				usleep(1);
				continue;
			}
			ret = -errno;
			td_verror(td, errno, "io_uring_enter submit");
			break;
		}
	} while (ld->queued);

	return ret;
}

static void fio_ioring_unmap(struct ioring_data *ld)
{
	int i;

	for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
		munmap(ld->mmap[i].ptr, ld->mmap[i].len);
	close(ld->ring_fd);
}

static void fio_ioring_cleanup(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;

	if (ld) {
		if (!(td->flags & TD_F_CHILD))
			fio_ioring_unmap(ld);

		fio_cmdprio_cleanup(&ld->cmdprio);
		free(ld->io_u_index);
		free(ld->md_buf);
		free(ld->pi_attr);
		free(ld->iovecs);
		free(ld->fds);
		free(ld->dsm);
		free(ld);
	}
}

static int fio_ioring_mmap(struct ioring_data *ld, struct io_uring_params *p)
{
	struct io_sq_ring *sring = &ld->sq_ring;
	struct io_cq_ring *cring = &ld->cq_ring;
	void *ptr;

	ld->mmap[0].len = p->sq_off.array + p->sq_entries * sizeof(__u32);
	ptr = mmap(0, ld->mmap[0].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_SQ_RING);
	ld->mmap[0].ptr = ptr;
	sring->head = ptr + p->sq_off.head;
	sring->tail = ptr + p->sq_off.tail;
	sring->ring_mask = ptr + p->sq_off.ring_mask;
	sring->ring_entries = ptr + p->sq_off.ring_entries;
	sring->flags = ptr + p->sq_off.flags;
	sring->array = ptr + p->sq_off.array;
	ld->sq_ring_mask = *sring->ring_mask;

	if (p->flags & IORING_SETUP_SQE128)
		ld->mmap[1].len = 2 * p->sq_entries * sizeof(struct io_uring_sqe);
	else
		ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
	ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_SQES);
	ld->mmap[1].ptr = ld->sqes;

	if (p->flags & IORING_SETUP_CQE32) {
		ld->mmap[2].len = p->cq_off.cqes +
			2 * p->cq_entries * sizeof(struct io_uring_cqe);
	} else {
		ld->mmap[2].len = p->cq_off.cqes +
			p->cq_entries * sizeof(struct io_uring_cqe);
	}
	ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, ld->ring_fd,
			IORING_OFF_CQ_RING);
	ld->mmap[2].ptr = ptr;
	cring->head = ptr + p->cq_off.head;
	cring->tail = ptr + p->cq_off.tail;
	cring->ring_mask = ptr + p->cq_off.ring_mask;
	cring->ring_entries = ptr + p->cq_off.ring_entries;
	cring->cqes = ptr + p->cq_off.cqes;
	ld->cq_ring_mask = *cring->ring_mask;
	return 0;
}

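/*
 * Probe the ring for non-vectored read/write support, and enable it only
 * when the nonvectored option was left at its default of -1.
 */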
static void fio_ioring_probe(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_uring_probe *p;
	int ret;

	/* already set by user, don't touch */
	if (o->nonvectored != -1)
		return;

	/* default to off, as that's always safe */
	o->nonvectored = 0;

	p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
	if (!p)
		return;

	ret = syscall(__NR_io_uring_register, ld->ring_fd,
			IORING_REGISTER_PROBE, p, 256);
	if (ret < 0)
		goto out;

	if (IORING_OP_WRITE > p->ops_len)
		goto out;

	if ((p->ops[IORING_OP_READ].flags & IO_URING_OP_SUPPORTED) &&
	    (p->ops[IORING_OP_WRITE].flags & IO_URING_OP_SUPPORTED))
		o->nonvectored = 1;
out:
	free(p);
}

static int fio_ioring_queue_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int depth = ld->iodepth;
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));

	if (o->hipri)
		p.flags |= IORING_SETUP_IOPOLL;
	if (o->sqpoll_thread) {
		p.flags |= IORING_SETUP_SQPOLL;
		if (o->sqpoll_set) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = o->sqpoll_cpu;
		}

		/*
		 * Submission latency for sqpoll_thread is just the time it
		 * takes to fill in the SQ ring entries, and any syscall if
		 * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time
		 * separately.
		 */
		td->o.disable_slat = 1;
	}

	/*
	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
	 * than that.
	 */
	p.flags |= IORING_SETUP_CQSIZE;
	p.cq_entries = depth;

	/*
	 * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
	 * completing IO operations.
	 */
	p.flags |= IORING_SETUP_COOP_TASKRUN;

	/*
	 * io_uring is always a single issuer, and we can defer task_work
	 * runs until we reap events.
	 */
	p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;

retry:
	ret = syscall(__NR_io_uring_setup, depth, &p);
	if (ret < 0) {
		if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
			p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
			p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
			goto retry;
		}
		if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
			p.flags &= ~IORING_SETUP_COOP_TASKRUN;
			goto retry;
		}
		if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
			p.flags &= ~IORING_SETUP_CQSIZE;
			goto retry;
		}
		return ret;
	}

	ld->ring_fd = ret;

	fio_ioring_probe(td);

	if (o->fixedbufs) {
		ret = syscall(__NR_io_uring_register, ld->ring_fd,
				IORING_REGISTER_BUFFERS, ld->iovecs, depth);
		if (ret < 0)
			return ret;
	}

	return fio_ioring_mmap(ld, &p);
}

static int fio_ioring_cmd_queue_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	int depth = ld->iodepth;
	struct io_uring_params p;
	int ret;

	memset(&p, 0, sizeof(p));

	if (o->hipri)
		p.flags |= IORING_SETUP_IOPOLL;
	if (o->sqpoll_thread) {
		p.flags |= IORING_SETUP_SQPOLL;
		if (o->sqpoll_set) {
			p.flags |= IORING_SETUP_SQ_AFF;
			p.sq_thread_cpu = o->sqpoll_cpu;
		}

		/*
		 * Submission latency for sqpoll_thread is just the time it
		 * takes to fill in the SQ ring entries, and any syscall if
		 * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time
		 * separately.
		 */
		td->o.disable_slat = 1;
	}
	if (o->cmd_type == FIO_URING_CMD_NVME) {
		p.flags |= IORING_SETUP_SQE128;
		p.flags |= IORING_SETUP_CQE32;
	}

	/*
	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
	 * than that.
	 */
	p.flags |= IORING_SETUP_CQSIZE;
	p.cq_entries = depth;

	/*
	 * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
	 * completing IO operations.
	 */
	p.flags |= IORING_SETUP_COOP_TASKRUN;

	/*
	 * io_uring is always a single issuer, and we can defer task_work
	 * runs until we reap events.
	 */
	p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;

retry:
	ret = syscall(__NR_io_uring_setup, depth, &p);
	if (ret < 0) {
		if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
			p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
			p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
			goto retry;
		}
		if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
			p.flags &= ~IORING_SETUP_COOP_TASKRUN;
			goto retry;
		}
		if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
			p.flags &= ~IORING_SETUP_CQSIZE;
			goto retry;
		}
		return ret;
	}

	ld->ring_fd = ret;

	fio_ioring_probe(td);

	if (o->fixedbufs) {
		ret = syscall(__NR_io_uring_register, ld->ring_fd,
				IORING_REGISTER_BUFFERS, ld->iovecs, depth);
		if (ret < 0)
			return ret;
	}

	return fio_ioring_mmap(ld, &p);
}

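/*
 * Open all files up front and register their descriptors with the ring,
 * so SQEs can reference them via IOSQE_FIXED_FILE.
 */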
static int fio_ioring_register_files(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct fio_file *f;
	unsigned int i;
	int ret;

	ld->fds = calloc(td->o.nr_files, sizeof(int));

	for_each_file(td, f, i) {
		ret = generic_open_file(td, f);
		if (ret)
			goto err;
		ld->fds[i] = f->fd;
		f->engine_pos = i;
	}

	ret = syscall(__NR_io_uring_register, ld->ring_fd,
			IORING_REGISTER_FILES, ld->fds, td->o.nr_files);
	if (ret) {
err:
		free(ld->fds);
		ld->fds = NULL;
	}

	/*
	 * Pretend the file is closed again, and really close it if we hit
	 * an error.
	 */
	for_each_file(td, f, i) {
		if (ret) {
			int fio_unused ret2;
			ret2 = generic_close_file(td, f);
		} else
			f->fd = -1;
	}

	return ret;
}

static int fio_ioring_post_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_u *io_u;
	int err, i;

	for (i = 0; i < td->o.iodepth; i++) {
		struct iovec *iov = &ld->iovecs[i];

		io_u = ld->io_u_index[i];
		iov->iov_base = io_u->buf;
		iov->iov_len = td_max_bs(td);
	}

	err = fio_ioring_queue_init(td);
	if (err) {
		int init_err = errno;

		if (init_err == ENOSYS)
			log_err("fio: your kernel doesn't support io_uring\n");
		td_verror(td, init_err, "io_queue_init");
		return 1;
	}

	for (i = 0; i < ld->iodepth; i++) {
		struct io_uring_sqe *sqe;

		sqe = &ld->sqes[i];
		memset(sqe, 0, sizeof(*sqe));
	}

	if (o->registerfiles) {
		err = fio_ioring_register_files(td);
		if (err) {
			td_verror(td, errno, "ioring_register_files");
			return 1;
		}
	}

	return 0;
}

static int fio_ioring_cmd_post_init(struct thread_data *td)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct io_u *io_u;
	int err, i;

	for (i = 0; i < td->o.iodepth; i++) {
		struct iovec *iov = &ld->iovecs[i];

		io_u = ld->io_u_index[i];
		iov->iov_base = io_u->buf;
		iov->iov_len = td_max_bs(td);
	}

	err = fio_ioring_cmd_queue_init(td);
	if (err) {
		int init_err = errno;

		td_verror(td, init_err, "io_queue_init");
		return 1;
	}

	for (i = 0; i < ld->iodepth; i++) {
		struct io_uring_sqe *sqe;

		if (o->cmd_type == FIO_URING_CMD_NVME) {
			sqe = &ld->sqes[i << 1];
			memset(sqe, 0, 2 * sizeof(*sqe));
		} else {
			sqe = &ld->sqes[i];
			memset(sqe, 0, sizeof(*sqe));
		}
	}

	if (o->registerfiles) {
		err = fio_ioring_register_files(td);
		if (err) {
			td_verror(td, errno, "ioring_register_files");
			return 1;
		}
	}

	return 0;
}

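/* Translate the pi_chk option string into NVME_IO_PRINFO_PRCHK_* flags */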
static void parse_prchk_flags(struct ioring_options *o)
{
	if (!o->pi_chk)
		return;

	if (strstr(o->pi_chk, "GUARD") != NULL)
		o->prchk = NVME_IO_PRINFO_PRCHK_GUARD;
	if (strstr(o->pi_chk, "REFTAG") != NULL)
		o->prchk |= NVME_IO_PRINFO_PRCHK_REF;
	if (strstr(o->pi_chk, "APPTAG") != NULL)
		o->prchk |= NVME_IO_PRINFO_PRCHK_APP;
}

static int fio_ioring_cmd_init(struct thread_data *td, struct ioring_data *ld)
{
	struct ioring_options *o = td->eo;

	if (td_write(td)) {
		switch (o->write_mode) {
		case FIO_URING_CMD_WMODE_UNCOR:
			ld->write_opcode = nvme_cmd_write_uncor;
			break;
		case FIO_URING_CMD_WMODE_ZEROES:
			ld->write_opcode = nvme_cmd_write_zeroes;
			if (o->deac)
				ld->cdw12_flags[DDIR_WRITE] = 1 << 25;
			break;
		case FIO_URING_CMD_WMODE_VERIFY:
			ld->write_opcode = nvme_cmd_verify;
			break;
		default:
			ld->write_opcode = nvme_cmd_write;
			break;
		}
	}

	if (o->readfua)
		ld->cdw12_flags[DDIR_READ] = 1 << 30;
	if (o->writefua)
		ld->cdw12_flags[DDIR_WRITE] = 1 << 30;

	return 0;
}

static int fio_ioring_init(struct thread_data *td)
{
	struct ioring_options *o = td->eo;
	struct ioring_data *ld;
	struct nvme_dsm *dsm;
	void *ptr;
	unsigned int dsm_size;
	unsigned long long md_size;
	int ret, i;
	struct nvme_cmd_ext_io_opts *ext_opts;

	/* sqthread submission requires registered files */
	if (o->sqpoll_thread)
		o->registerfiles = 1;

	if (o->registerfiles && td->o.nr_files != td->o.open_files) {
		log_err("fio: io_uring registered files require nr_files to "
			"be identical to open_files\n");
		return 1;
	}

	ld = calloc(1, sizeof(*ld));

	ld->is_uring_cmd_eng = (td->io_ops->prep == fio_ioring_cmd_prep);

	/*
	 * The internal io_uring queue depth must be a power-of-2, as that's
	 * how the ring interface works. So round that up, in case the user
	 * set iodepth isn't a power-of-2. Leave the fio depth the same, as
	 * not to be driving too much of an iodepth, if we did round up.
	 */
	ld->iodepth = roundup_pow2(td->o.iodepth);

	/* io_u index */
	ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));

	if (!ld->is_uring_cmd_eng && o->md_per_io_size) {
		if (o->apptag_mask != 0xffff) {
			log_err("fio: io_uring with metadata requires an apptag_mask of 0xffff\n");
			free(ld->io_u_index);
			free(ld);
			return 1;
		}
	}

	/*
	 * metadata buffer
	 * We are only supporting iomem=malloc / mem=malloc as of now.
	 */
	if (o->md_per_io_size && (!ld->is_uring_cmd_eng ||
	    (ld->is_uring_cmd_eng && o->cmd_type == FIO_URING_CMD_NVME))) {
		md_size = (unsigned long long) o->md_per_io_size
				* (unsigned long long) td->o.iodepth;
		md_size += page_mask + td->o.mem_align;
		if (td->o.mem_align && td->o.mem_align > page_size)
			md_size += td->o.mem_align - page_size;
		ld->md_buf = malloc(md_size);
		if (!ld->md_buf) {
			free(ld->io_u_index);
			free(ld);
			return 1;
		}

		if (!ld->is_uring_cmd_eng) {
			ld->pi_attr = calloc(ld->iodepth, sizeof(struct io_uring_attr_pi));
			if (!ld->pi_attr) {
				free(ld->io_u_index);
				free(ld->md_buf);
				free(ld);
				return 1;
			}
		}

	}
	parse_prchk_flags(o);
	ext_opts = &ld->ext_opts;
	if (o->pi_act)
		ext_opts->io_flags |= NVME_IO_PRINFO_PRACT;
	ext_opts->io_flags |= o->prchk;
	ext_opts->apptag = o->apptag;
	ext_opts->apptag_mask = o->apptag_mask;

	ld->iovecs = calloc(ld->iodepth, sizeof(struct iovec));

	td->io_ops_data = ld;

	ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
	if (ret) {
		td_verror(td, EINVAL, "fio_ioring_init");
		return 1;
	}

	/*
	 * For io_uring_cmd, trims are async operations unless we are operating
	 * in zbd mode where trim means zone reset.
	 */
	if (td_trim(td) && td->o.zone_mode == ZONE_MODE_ZBD &&
	    ld->is_uring_cmd_eng) {
		td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
	} else {
		dsm_size = sizeof(*ld->dsm);
		dsm_size += td->o.num_range * sizeof(struct nvme_dsm_range);
		ld->dsm = calloc(td->o.iodepth, dsm_size);
		ptr = ld->dsm;
		for (i = 0; i < td->o.iodepth; i++) {
			dsm = (struct nvme_dsm *)ptr;
			dsm->nr_ranges = td->o.num_range;
			ptr += dsm_size;
		}
	}

	if (ld->is_uring_cmd_eng)
		return fio_ioring_cmd_init(td, ld);
	return 0;
}

static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;
	struct nvme_pi_data *pi_data;
	char *p, *q;

	ld->io_u_index[io_u->index] = io_u;

	p = PTR_ALIGN(ld->md_buf, page_mask) + td->o.mem_align;
	p += o->md_per_io_size * io_u->index;
	io_u->mmap_data = p;

	if (ld->pi_attr) {
		struct io_uring_attr_pi *pi_attr;

		q = ld->pi_attr;
		q += (sizeof(struct io_uring_attr_pi) * io_u->index);
		io_u->pi_attr = q;

		pi_attr = io_u->pi_attr;
		pi_attr->len = o->md_per_io_size;
		pi_attr->app_tag = o->apptag;
		pi_attr->flags = 0;
		if (strstr(o->pi_chk, "GUARD") != NULL)
			pi_attr->flags |= IO_INTEGRITY_CHK_GUARD;
		if (strstr(o->pi_chk, "REFTAG") != NULL)
			pi_attr->flags |= IO_INTEGRITY_CHK_REFTAG;
		if (strstr(o->pi_chk, "APPTAG") != NULL)
			pi_attr->flags |= IO_INTEGRITY_CHK_APPTAG;
	}

	if (!o->pi_act) {
		pi_data = calloc(1, sizeof(*pi_data));
		pi_data->io_flags |= o->prchk;
		pi_data->apptag_mask = o->apptag_mask;
		pi_data->apptag = o->apptag;
		io_u->engine_data = pi_data;
	}

	return 0;
}

static void fio_ioring_io_u_free(struct thread_data *td, struct io_u *io_u)
{
	struct nvme_pi *pi = io_u->engine_data;

	free(pi);
	io_u->engine_data = NULL;
}

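/*
 * Query protection information capabilities through the FS_IOC_GETLBMD_CAP
 * ioctl and map them to the nvme_data fields used by the PI helpers.
 */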
static int fio_get_pi_info(struct fio_file *f, struct nvme_data *data)
{
	struct logical_block_metadata_cap md_cap;
	int ret;
	int fd, err = 0;

	fd = open(f->file_name, O_RDONLY);
	if (fd < 0)
		return -errno;

	ret = ioctl(fd, FS_IOC_GETLBMD_CAP, &md_cap);
	if (ret < 0) {
		err = -errno;
		log_err("%s: failed to query protection information capabilities; error %d\n", f->file_name, errno);
		goto out;
	}

	if (!(md_cap.lbmd_flags & LBMD_PI_CAP_INTEGRITY)) {
		log_err("%s: Protection information not supported\n", f->file_name);
		err = -ENOTSUP;
		goto out;
	}

	/* Currently we don't support storage tags */
	if (md_cap.lbmd_storage_tag_size) {
		log_err("%s: Storage tag not supported\n", f->file_name);
		err = -ENOTSUP;
		goto out;
	}

	data->lba_size = md_cap.lbmd_interval;
	data->lba_shift = ilog2(data->lba_size);
	data->ms = md_cap.lbmd_size;
	data->pi_size = md_cap.lbmd_pi_size;
	data->pi_loc = !(md_cap.lbmd_pi_offset);

	/* Assume Type 1 PI if reference tags supported */
	if (md_cap.lbmd_flags & LBMD_PI_CAP_REFTAG)
		data->pi_type = NVME_NS_DPS_PI_TYPE1;
	else
		data->pi_type = NVME_NS_DPS_PI_TYPE3;

	switch (md_cap.lbmd_guard_tag_type) {
	case LBMD_PI_CSUM_CRC16_T10DIF:
		data->guard_type = NVME_NVM_NS_16B_GUARD;
		break;
	case LBMD_PI_CSUM_CRC64_NVME:
		data->guard_type = NVME_NVM_NS_64B_GUARD;
		break;
	default:
		log_err("%s: unsupported checksum type %d\n", f->file_name,
			md_cap.lbmd_guard_tag_type);
		err = -ENOTSUP;
		goto out;
	}

out:
	close(fd);
	return err;
}

static inline int fio_ioring_open_file_md(struct thread_data *td, struct fio_file *f)
{
	int ret = 0;
	struct nvme_data *data = NULL;

	data = FILE_ENG_DATA(f);
	if (data == NULL) {
		data = calloc(1, sizeof(struct nvme_data));
		ret = fio_get_pi_info(f, data);
		if (ret) {
			free(data);
			return ret;
		}

		FILE_SET_ENG_DATA(f, data);
	}

	return ret;
}

static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;

	if (o->md_per_io_size) {
		/*
		 * This will be a no-op when called by the io_uring_cmd
		 * ioengine because engine data has already been collected by
		 * the time this call is made
		 */
		int ret = fio_ioring_open_file_md(td, f);
		if (ret)
			return ret;
	}

	if (!ld || !o->registerfiles)
		return generic_open_file(td, f);

	f->fd = ld->fds[f->engine_pos];
	return 0;
}

static int verify_params(struct thread_data *td, struct nvme_data *data,
			 struct fio_file *f, enum fio_ddir ddir)
{
	struct ioring_options *o = td->eo;
	unsigned int lba_size;

	lba_size = data->lba_ext ? data->lba_ext : data->lba_size;
	if (td->o.min_bs[ddir] % lba_size || td->o.max_bs[ddir] % lba_size) {
		if (data->lba_ext) {
			log_err("%s: block size must be a multiple of %u "
				"(LBA data size + Metadata size)\n", f->file_name, lba_size);
			if (td->o.min_bs[ddir] == td->o.max_bs[ddir] &&
			    !(td->o.min_bs[ddir] % data->lba_size)) {
				/* fixed block size is actually a multiple of LBA data size */
				unsigned long long suggestion = lba_size *
					(td->o.min_bs[ddir] / data->lba_size);
				log_err("Did you mean to use a block size of %llu?\n", suggestion);
			}
		} else {
			log_err("%s: block size must be a multiple of LBA data size\n",
				f->file_name);
		}
		td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
		return 1;
	}
	if (data->ms && !data->lba_ext && ddir != DDIR_TRIM &&
	    (o->md_per_io_size < ((td->o.max_bs[ddir] / data->lba_size) * data->ms))) {
		log_err("%s: md_per_io_size should be at least %llu bytes\n",
			f->file_name,
			((td->o.max_bs[ddir] / data->lba_size) * data->ms));
		td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
		return 1;
	}

	return 0;
}

static int fio_ioring_open_nvme(struct thread_data *td, struct fio_file *f)
{
	struct ioring_options *o = td->eo;
	struct nvme_data *data = NULL;
	__u64 nlba = 0;
	int ret;

	/* Store the namespace-id and lba size. */
	data = FILE_ENG_DATA(f);
	if (data == NULL) {
		data = calloc(1, sizeof(struct nvme_data));
		ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
		if (ret) {
			free(data);
			return ret;
		}

		FILE_SET_ENG_DATA(f, data);
	}

	for_each_rw_ddir(ddir) {
		ret = verify_params(td, data, f, ddir);
		if (ret)
			return ret;
	}

	/*
	 * For extended logical block sizes we cannot use verify when
	 * end to end data protection checks are enabled, as the PI
	 * section of data buffer conflicts with verify.
	 */
	if (data->ms && data->pi_type && data->lba_ext &&
	    td->o.verify != VERIFY_NONE) {
		log_err("%s: for extended LBA, verify cannot be used when E2E "
			"data protection is enabled\n", f->file_name);
		td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
		return 1;
	}

	if (o->write_mode != FIO_URING_CMD_WMODE_WRITE && !td_write(td)) {
		log_err("%s: 'readwrite=|rw=' has no write\n", f->file_name);
		td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
		return 1;
	}

	return 0;
}

static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
{
	struct ioring_options *o = td->eo;

	if (o->cmd_type == FIO_URING_CMD_NVME) {
		int ret;

		ret = fio_ioring_open_nvme(td, f);
		if (ret)
			return ret;
	}

	return fio_ioring_open_file(td, f);
}

static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
{
	struct ioring_data *ld = td->io_ops_data;
	struct ioring_options *o = td->eo;

	if (!ld || !o->registerfiles)
		return generic_close_file(td, f);

	f->fd = -1;
	return 0;
}

static int fio_ioring_cmd_close_file(struct thread_data *td,
				     struct fio_file *f)
{
	struct ioring_options *o = td->eo;

	if (o->cmd_type == FIO_URING_CMD_NVME) {
		struct nvme_data *data = FILE_ENG_DATA(f);

		FILE_SET_ENG_DATA(f, NULL);
		free(data);
	}

	return fio_ioring_close_file(td, f);
}

static int fio_ioring_cmd_get_file_size(struct thread_data *td,
					struct fio_file *f)
{
	struct ioring_options *o = td->eo;

	if (fio_file_size_known(f))
		return 0;

	if (o->cmd_type == FIO_URING_CMD_NVME) {
		struct nvme_data *data = NULL;
		__u64 nlba = 0;
		int ret;

		data = calloc(1, sizeof(struct nvme_data));
		ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
		if (ret) {
			free(data);
			return ret;
		}

		if (data->lba_ext)
			f->real_file_size = data->lba_ext * nlba;
		else
			f->real_file_size = data->lba_size * nlba;
		fio_file_set_size_known(f);

		FILE_SET_ENG_DATA(f, data);
		return 0;
	}
	return generic_get_file_size(td, f);
}

static int fio_ioring_cmd_get_zoned_model(struct thread_data *td,
					  struct fio_file *f,
					  enum zbd_zoned_model *model)
{
	return fio_nvme_get_zoned_model(td, f, model);
}

static int fio_ioring_cmd_report_zones(struct thread_data *td,
				       struct fio_file *f, uint64_t offset,
				       struct zbd_zone *zbdz,
				       unsigned int nr_zones)
{
	return fio_nvme_report_zones(td, f, offset, zbdz, nr_zones);
}

static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f,
				   uint64_t offset, uint64_t length)
{
	return fio_nvme_reset_wp(td, f, offset, length);
}

static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
					     struct fio_file *f,
					     unsigned int *max_open_zones)
{
	return fio_nvme_get_max_open_zones(td, f, max_open_zones);
}

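/* Fetch reclaim unit handle status and report FDP placement identifiers */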
static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
				     struct fio_ruhs_info *fruhs_info)
{
	struct nvme_fdp_ruh_status *ruhs;
	int bytes, nr_ruhs, ret, i;

	nr_ruhs = fruhs_info->nr_ruhs;
	bytes = sizeof(*ruhs) + fruhs_info->nr_ruhs * sizeof(struct nvme_fdp_ruh_status_desc);

	ruhs = calloc(1, bytes);
	if (!ruhs)
		return -ENOMEM;

	ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes);
	if (ret)
		goto free;

	fruhs_info->nr_ruhs = le16_to_cpu(ruhs->nruhsd);
	for (i = 0; i < nr_ruhs; i++)
		fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
free:
	free(ruhs);
	return ret;
}

static struct ioengine_ops ioengine_uring = {
	.name = "io_uring",
	.version = FIO_IOOPS_VERSION,
	.flags = FIO_NO_OFFLOAD | FIO_ASYNCIO_SETS_ISSUE_TIME |
		 FIO_ATOMICWRITES,
	.init = fio_ioring_init,
	.post_init = fio_ioring_post_init,
	.io_u_init = fio_ioring_io_u_init,
	.io_u_free = fio_ioring_io_u_free,
	.prep = fio_ioring_prep,
	.queue = fio_ioring_queue,
	.commit = fio_ioring_commit,
	.getevents = fio_ioring_getevents,
	.event = fio_ioring_event,
	.cleanup = fio_ioring_cleanup,
	.open_file = fio_ioring_open_file,
	.close_file = fio_ioring_close_file,
	.get_file_size = generic_get_file_size,
	.options = options,
	.option_struct_size = sizeof(struct ioring_options),
};

static struct ioengine_ops ioengine_uring_cmd = {
	.name = "io_uring_cmd",
	.version = FIO_IOOPS_VERSION,
	.flags = FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO |
		 FIO_ASYNCIO_SETS_ISSUE_TIME |
		 FIO_MULTI_RANGE_TRIM,
	.init = fio_ioring_init,
	.post_init = fio_ioring_cmd_post_init,
	.io_u_init = fio_ioring_io_u_init,
	.io_u_free = fio_ioring_io_u_free,
	.prep = fio_ioring_cmd_prep,
	.queue = fio_ioring_queue,
	.commit = fio_ioring_commit,
	.getevents = fio_ioring_getevents,
	.event = fio_ioring_cmd_event,
	.errdetails = fio_ioring_cmd_errdetails,
	.cleanup = fio_ioring_cleanup,
	.open_file = fio_ioring_cmd_open_file,
	.close_file = fio_ioring_cmd_close_file,
	.get_file_size = fio_ioring_cmd_get_file_size,
	.get_zoned_model = fio_ioring_cmd_get_zoned_model,
	.report_zones = fio_ioring_cmd_report_zones,
	.reset_wp = fio_ioring_cmd_reset_wp,
	.get_max_open_zones = fio_ioring_cmd_get_max_open_zones,
	.options = options,
	.option_struct_size = sizeof(struct ioring_options),
	.fdp_fetch_ruhs = fio_ioring_cmd_fetch_ruhs,
};

static void fio_init fio_ioring_register(void)
{
	register_ioengine(&ioengine_uring);
	register_ioengine(&ioengine_uring_cmd);
}

static void fio_exit fio_ioring_unregister(void)
{
	unregister_ioengine(&ioengine_uring);
	unregister_ioengine(&ioengine_uring_cmd);
}
#endif