/*
 * IO engine using the xNVMe C API.
 *
 * See: http://xnvme.io/
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <stdlib.h>
#include <assert.h>
#include <errno.h>
#include <libxnvme.h>
#include "fio.h"
#include "verify.h"
#include "zbd_types.h"
#include "fdp.h"
#include "optgroup.h"
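
/*
 * Example usage (illustrative sketch; the device path and option values are
 * examples, not requirements of the engine):
 *
 *   fio --name=example --ioengine=xnvme --thread=1 --filename=/dev/nvme0n1 \
 *       --xnvme_async=io_uring --rw=randread --bs=4k --iodepth=8
 *
 * The engine requires --thread=1; see xnvme_fioe_init() below.
 */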
static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;
struct xnvme_fioe_fwrap {
	/* fio file representation */
	struct fio_file *fio_file;

	/* xNVMe device handle */
	struct xnvme_dev *dev;
	/* xNVMe device geometry */
	const struct xnvme_geo *geo;

	struct xnvme_queue *queue;

	uint32_t ssw;
	uint32_t lba_nbytes;
	uint32_t md_nbytes;
	uint32_t lba_pow2;

	uint8_t _pad[16];
};
XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")
struct xnvme_fioe_data {
	/* I/O completion queue */
	struct io_u **iocq;

	/* # of iocq entries; incremented via getevents()/cb_pool() */
	uint64_t completed;

	/* # of errors; incremented when observed on completion via getevents()/cb_pool() */
	uint64_t ecount;

	/* Controls which device/file to select */
	int32_t prev;
	int32_t cur;

	/* Number of devices/files for which open() has been called */
	int64_t nopen;
	/* Number of devices/files allocated in files[] */
	uint64_t nallocated;

	struct iovec *iovec;
	struct iovec *md_iovec;

	struct xnvme_fioe_fwrap files[];
};
XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")
struct xnvme_fioe_request {
	/* Context for NVMe PI */
	struct xnvme_pi_ctx pi_ctx;

	/* Separate metadata buffer pointer */
	void *md_buf;
};
struct xnvme_fioe_options {
	void *padding;
	unsigned int hipri;
	unsigned int sqpoll_thread;
	unsigned int xnvme_dev_nsid;
	unsigned int xnvme_iovec;
	unsigned int md_per_io_size;
	unsigned int pi_act;
	unsigned int apptag;
	unsigned int apptag_mask;
	unsigned int prchk;
	char *xnvme_be;
	char *xnvme_mem;
	char *xnvme_async;
	char *xnvme_sync;
	char *xnvme_admin;
	char *xnvme_dev_subnqn;
};
static int str_pi_chk_cb(void *data, const char *str)
{
	struct xnvme_fioe_options *o = data;

	if (strstr(str, "GUARD") != NULL)
		o->prchk = XNVME_PI_FLAGS_GUARD_CHECK;
	if (strstr(str, "REFTAG") != NULL)
		o->prchk |= XNVME_PI_FLAGS_REFTAG_CHECK;
	if (strstr(str, "APPTAG") != NULL)
		o->prchk |= XNVME_PI_FLAGS_APPTAG_CHECK;

	return 0;
}
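
/*
 * Example: with "--pi_chk=GUARD,REFTAG" the callback above receives the
 * string "GUARD,REFTAG" and leaves o->prchk equal to
 * XNVME_PI_FLAGS_GUARD_CHECK | XNVME_PI_FLAGS_REFTAG_CHECK.
 */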
static struct fio_option options[] = {
	{
		.name = "hipri",
		.lname = "High Priority",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct xnvme_fioe_options, hipri),
		.help = "Use polled IO completions",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "sqthread_poll",
		.lname = "Kernel SQ thread polling",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
		.help = "Offload submission/completion to kernel thread",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_be",
		.lname = "xNVMe Backend",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
		.help = "Select xNVMe backend [spdk,linux,fbsd]",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_mem",
		.lname = "xNVMe Memory Backend",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_mem),
		.help = "Select xNVMe memory backend",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_async",
		.lname = "xNVMe Asynchronous command-interface",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
		.help = "Select xNVMe async. interface: "
			"[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_sync",
		.lname = "xNVMe Synchronous command-interface",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
		.help = "Select xNVMe sync. interface: [nvme,psync,block]",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_admin",
		.lname = "xNVMe Admin command-interface",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
		.help = "Select xNVMe admin cmd-interface: [nvme,block]",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_dev_nsid",
		.lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
		.help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_dev_subnqn",
		.lname = "Subsystem NQN for Fabrics",
		.type = FIO_OPT_STR_STORE,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_subnqn),
		.help = "Subsystem NQN for Fabrics",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "xnvme_iovec",
		.lname = "Vectored IOs",
		.type = FIO_OPT_STR_SET,
		.off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
		.help = "Send vectored IOs",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "md_per_io_size",
		.lname = "Separate Metadata Buffer Size per I/O",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct xnvme_fioe_options, md_per_io_size),
		.def = "0",
		.help = "Size of separate metadata buffer per I/O (Default: 0)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "pi_act",
		.lname = "Protection Information Action",
		.type = FIO_OPT_BOOL,
		.off1 = offsetof(struct xnvme_fioe_options, pi_act),
		.def = "1",
		.help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "pi_chk",
		.lname = "Protection Information Check",
		.type = FIO_OPT_STR_STORE,
		.help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
		.cb = str_pi_chk_cb,
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "apptag",
		.lname = "Application Tag used in Protection Information",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct xnvme_fioe_options, apptag),
		.def = "0x1234",
		.help = "Application Tag used in Protection Information field (Default: 0x1234)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = "apptag_mask",
		.lname = "Application Tag Mask",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct xnvme_fioe_options, apptag_mask),
		.def = "0xffff",
		.help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_XNVME,
	},
	{
		.name = NULL,
	},
};
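
/*
 * Example (illustrative sketch): a job exercising the Protection Information
 * options declared above. Values are hypothetical and assume a namespace
 * formatted with end-to-end data protection:
 *
 *   fio --name=pi --ioengine=xnvme --thread=1 --filename=/dev/nvme0n1 \
 *       --md_per_io_size=4096 --pi_act=0 --pi_chk=GUARD,APPTAG \
 *       --apptag=0x1234 --apptag_mask=0xffff
 */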
static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
{
	struct io_u *io_u = cb_arg;
	struct xnvme_fioe_data *xd = io_u->mmap_data;
	struct xnvme_fioe_request *fio_req = io_u->engine_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[io_u->file->fileno];
	bool pi_act = (fio_req->pi_ctx.pi_flags >> 3);
	int err;

	if (xnvme_cmd_ctx_cpl_status(ctx)) {
		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
		xd->ecount += 1;
		io_u->error = EIO;
	}

	if (!io_u->error && fwrap->geo->pi_type && (io_u->ddir == DDIR_READ) && !pi_act) {
		err = xnvme_pi_verify(&fio_req->pi_ctx, io_u->xfer_buf,
				      fio_req->md_buf, io_u->xfer_buflen / fwrap->lba_nbytes);
		if (err) {
			xd->ecount += 1;
			io_u->error = EIO;
		}
	}

	xd->iocq[xd->completed++] = io_u;
	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
}
static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
{
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_opts opts = xnvme_opts_default();

	opts.nsid = o->xnvme_dev_nsid;
	opts.subnqn = o->xnvme_dev_subnqn;
	opts.be = o->xnvme_be;
	opts.mem = o->xnvme_mem;
	opts.async = o->xnvme_async;
	opts.sync = o->xnvme_sync;
	opts.admin = o->xnvme_admin;

	opts.poll_io = o->hipri;
	opts.poll_sq = o->sqpoll_thread;

	opts.direct = td->o.odirect;

	return opts;
}
static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
{
	if (fwrap->dev)
		xnvme_queue_term(fwrap->queue);

	xnvme_dev_close(fwrap->dev);

	memset(fwrap, 0, sizeof(*fwrap));
}
static void xnvme_fioe_cleanup(struct thread_data *td)
{
	struct xnvme_fioe_data *xd = NULL;
	int err;

	if (!td->io_ops_data)
		return;

	xd = td->io_ops_data;

	err = pthread_mutex_lock(&g_serialize);
	if (err)
		log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err);
		/* NOTE: not returning here */

	for (uint64_t i = 0; i < xd->nallocated; ++i)
		_dev_close(td, &xd->files[i]);

	if (!err) {
		err = pthread_mutex_unlock(&g_serialize);
		if (err)
			log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err);
	}

	free(xd->iocq);
	free(xd->iovec);
	free(xd->md_iovec);
	free(xd);
	td->io_ops_data = NULL;
}
static int _verify_options(struct thread_data *td, struct fio_file *f,
			   struct xnvme_fioe_fwrap *fwrap)
{
	struct xnvme_fioe_options *o = td->eo;
	unsigned int correct_md_size;

	for_each_rw_ddir(ddir) {
		if (td->o.min_bs[ddir] % fwrap->lba_nbytes || td->o.max_bs[ddir] % fwrap->lba_nbytes) {
			if (!fwrap->lba_pow2) {
				log_err("ioeng->_verify_options(%s): block size must be a multiple of %u "
					"(LBA data size + Metadata size)\n", f->file_name, fwrap->lba_nbytes);
			} else {
				log_err("ioeng->_verify_options(%s): block size must be a multiple of LBA data size\n",
					f->file_name);
			}
			return 1;
		}
		if (ddir == DDIR_TRIM)
			continue;

		correct_md_size = (td->o.max_bs[ddir] / fwrap->lba_nbytes) * fwrap->md_nbytes;
		if (fwrap->md_nbytes && fwrap->lba_pow2 && (o->md_per_io_size < correct_md_size)) {
			log_err("ioeng->_verify_options(%s): md_per_io_size should be at least %u bytes\n",
				f->file_name, correct_md_size);
			return 1;
		}
	}

	/*
	 * For extended logical block sizes we cannot use verify when
	 * end-to-end data protection checks are enabled, as the PI
	 * section of the data buffer conflicts with verify.
	 */
	if (fwrap->md_nbytes && fwrap->geo->pi_type && !fwrap->lba_pow2 &&
	    td->o.verify != VERIFY_NONE) {
		log_err("ioeng->_verify_options(%s): for extended LBA, verify cannot be used when E2E data protection is enabled\n",
			f->file_name);
		return 1;
	}

	return 0;
}
/**
 * Helper function setting up device handles as addressed by the naming
 * convention of the given `fio_file` filename.
 *
 * Checks thread-options for explicit control of the asynchronous implementation
 * via the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}`` option.
 */
static int _dev_open(struct thread_data *td, struct fio_file *f)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap;
	int flags = 0;
	int err;

	if (f->fileno > (int)xd->nallocated) {
		log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);
		return 1;
	}

	fwrap = &xd->files[f->fileno];

	err = pthread_mutex_lock(&g_serialize);
	if (err) {
		log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
			err);
		return -err;
	}

	fwrap->dev = xnvme_dev_open(f->file_name, &opts);
	if (!fwrap->dev) {
		log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);
		goto failure;
	}
	fwrap->geo = xnvme_dev_get_geo(fwrap->dev);

	if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) {
		log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name);
		goto failure;
	}
	xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);

	fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
	fwrap->lba_nbytes = fwrap->geo->lba_nbytes;
	fwrap->md_nbytes = fwrap->geo->nbytes_oob;

	if (fwrap->geo->lba_extended)
		fwrap->lba_pow2 = 0;
	else
		fwrap->lba_pow2 = 1;

	/*
	 * When PI action is set and the PI size equals the metadata size, the
	 * controller inserts/removes the PI itself; update the LBA data and
	 * metadata sizes accordingly.
	 */
	if (o->pi_act && fwrap->geo->pi_type &&
	    fwrap->geo->nbytes_oob == xnvme_pi_size(fwrap->geo->pi_format)) {
		if (fwrap->geo->lba_extended) {
			fwrap->lba_nbytes -= fwrap->geo->nbytes_oob;
			fwrap->lba_pow2 = 1;
		}
		fwrap->md_nbytes = 0;
	}

	if (_verify_options(td, f, fwrap)) {
		td_verror(td, EINVAL, "_dev_open");
		goto failure;
	}

	fwrap->fio_file = f;
	fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
	fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
	fio_file_set_size_known(fwrap->fio_file);

	err = pthread_mutex_unlock(&g_serialize);
	if (err)
		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
			err);

	return 0;

failure:
	xnvme_queue_term(fwrap->queue);
	xnvme_dev_close(fwrap->dev);

	err = pthread_mutex_unlock(&g_serialize);
	if (err)
		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
			err);

	return 1;
}
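
/*
 * Example of the PI insert/remove adjustment in _dev_open() above
 * (hypothetical geometry): an extended LBA of 4096 + 8 bytes with an 8-byte
 * PI format and pi_act=1 ends up as lba_nbytes=4096, md_nbytes=0, since the
 * controller strips/inserts the PI bytes itself.
 */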
static int xnvme_fioe_init(struct thread_data *td)
{
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_options *o = td->eo;
	struct fio_file *f;
	unsigned int i;

	if (!td->o.use_thread) {
		log_err("ioeng->init(): --thread=1 is required\n");
		return 1;
	}

	/* Allocate xd and iocq */
	xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
	if (!xd) {
		log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
		return 1;
	}

	xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
	if (!xd->iocq) {
		free(xd);
		log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno);
		return 1;
	}

	if (o->xnvme_iovec) {
		xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
		if (!xd->iovec) {
			free(xd->iocq);
			free(xd);
			log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);
			return 1;
		}
	}

	if (o->xnvme_iovec && o->md_per_io_size) {
		xd->md_iovec = calloc(td->o.iodepth, sizeof(*xd->md_iovec));
		if (!xd->md_iovec) {
			free(xd->iocq);
			free(xd->iovec);
			free(xd);
			log_err("ioeng->init(): !calloc(xd->md_iovec), err(%d)\n", errno);
			return 1;
		}
	}

	xd->prev = -1;
	td->io_ops_data = xd;

	for_each_file(td, f, i)
	{
		if (_dev_open(td, f)) {
			/*
			 * Note: We are not freeing xd, iocq, iovec and md_iovec.
			 * This will be done as part of cleanup routine.
			 */
			log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);
			return 1;
		}

		++(xd->nallocated);
	}

	if (xd->nallocated != td->o.nr_files) {
		log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
		return 1;
	}

	return 0;
}
/* NOTE: using the first device for buffer-allocators */
static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[0];

	if (!fwrap->dev) {
		log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");
		return 1;
	}

	td->orig_buffer = xnvme_buf_alloc(fwrap->dev, total_mem);

	return td->orig_buffer == NULL;
}
/* NOTE: using the first device for buffer-allocators */
static void xnvme_fioe_iomem_free(struct thread_data *td)
{
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;

	if (!td->io_ops_data)
		return;

	xd = td->io_ops_data;
	fwrap = &xd->files[0];

	if (!fwrap->dev) {
		log_err("ioeng->iomem_free(): failed; no dev-handle\n");
		return;
	}

	xnvme_buf_free(fwrap->dev, td->orig_buffer);
}
static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
{
	struct xnvme_fioe_request *fio_req;
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[0];

	if (!fwrap->dev) {
		log_err("ioeng->io_u_init(): failed; no dev-handle\n");
		return 1;
	}

	io_u->mmap_data = td->io_ops_data;
	io_u->engine_data = NULL;

	fio_req = calloc(1, sizeof(*fio_req));
	if (!fio_req) {
		log_err("ioeng->io_u_init(): !calloc(fio_req), err(%d)\n", errno);
		return 1;
	}

	if (o->md_per_io_size) {
		fio_req->md_buf = xnvme_buf_alloc(fwrap->dev, o->md_per_io_size);
		if (!fio_req->md_buf) {
			free(fio_req);
			return 1;
		}
	}

	io_u->engine_data = fio_req;

	return 0;
}
static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
{
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	struct xnvme_fioe_request *fio_req = NULL;

	if (!td->io_ops_data)
		return;

	xd = td->io_ops_data;
	fwrap = &xd->files[0];

	if (!fwrap->dev) {
		log_err("ioeng->io_u_free(): failed; no dev-handle\n");
		return;
	}

	fio_req = io_u->engine_data;
	if (fio_req->md_buf)
		xnvme_buf_free(fwrap->dev, fio_req->md_buf);

	free(fio_req);

	io_u->mmap_data = NULL;
}
static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;

	assert(event >= 0);
	assert((unsigned)event < xd->completed);

	return xd->iocq[event];
}
static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
				const struct timespec *t)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	int nfiles = xd->nallocated;
	int err = 0;

	if (xd->prev != -1 && ++xd->prev < nfiles) {
		fwrap = &xd->files[xd->prev];
		xd->cur = xd->prev;
	}

	xd->completed = 0;
	for (;;) {
		if (fwrap == NULL || xd->cur == nfiles) {
			fwrap = &xd->files[0];
			xd->cur = 0;
		}

		while (fwrap != NULL && xd->cur < nfiles && err >= 0) {
			err = xnvme_queue_poke(fwrap->queue, max - xd->completed);
			if (err < 0) {
				switch (err) {
				case -EBUSY:
				case -EAGAIN:
					usleep(1);
					break;

				default:
					log_err("ioeng->getevents(): unhandled IO error\n");
					assert(false);
					return 0;
				}
			}
			if (xd->completed >= min) {
				xd->prev = xd->cur;
				return xd->completed;
			}
			xd->cur++;
			fwrap = &xd->files[xd->cur];

			err = 0;
		}
	}

	xd->cur = 0;

	return xd->completed;
}
static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_fwrap *fwrap;
	struct xnvme_cmd_ctx *ctx;
	struct xnvme_fioe_request *fio_req = io_u->engine_data;
	uint32_t nsid;
	uint64_t slba;
	uint16_t nlb;
	int err;
	bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec;
	uint32_t dir = io_u->dtype;

	fio_ro_check(td, io_u);

	fwrap = &xd->files[io_u->file->fileno];
	nsid = xnvme_dev_get_nsid(fwrap->dev);

	if (fwrap->lba_pow2) {
		slba = io_u->offset >> fwrap->ssw;
		nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;
	} else {
		slba = io_u->offset / fwrap->lba_nbytes;
		nlb = (io_u->xfer_buflen / fwrap->lba_nbytes) - 1;
	}

	ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
	ctx->async.cb_arg = io_u;

	ctx->cmd.common.nsid = nsid;
	ctx->cmd.nvm.slba = slba;
	ctx->cmd.nvm.nlb = nlb;
	if (dir) {
		ctx->cmd.nvm.dtype = io_u->dtype;
		ctx->cmd.nvm.cdw13.dspec = io_u->dspec;
	}

	switch (io_u->ddir) {
	case DDIR_READ:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
		break;

	case DDIR_WRITE:
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
		break;

	default:
		log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

		io_u->error = ENOSYS;
		assert(false);
		return FIO_Q_COMPLETED;
	}

	if (fwrap->geo->pi_type && !o->pi_act) {
		err = xnvme_pi_ctx_init(&fio_req->pi_ctx, fwrap->lba_nbytes,
					fwrap->geo->nbytes_oob, fwrap->geo->lba_extended,
					fwrap->geo->pi_loc, fwrap->geo->pi_type,
					(o->pi_act << 3 | o->prchk), slba, o->apptag_mask,
					o->apptag, fwrap->geo->pi_format);
		if (err) {
			log_err("ioeng->queue(): err: '%d'\n", err);

			xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

			io_u->error = abs(err);
			return FIO_Q_COMPLETED;
		}

		if (io_u->ddir == DDIR_WRITE)
			xnvme_pi_generate(&fio_req->pi_ctx, io_u->xfer_buf, fio_req->md_buf,
					  io_u->xfer_buflen / fwrap->lba_nbytes);
	}

	if (fwrap->geo->pi_type)
		ctx->cmd.nvm.prinfo = (o->pi_act << 3 | o->prchk);

	switch (fwrap->geo->pi_type) {
	case XNVME_PI_TYPE1:
	case XNVME_PI_TYPE2:
		switch (fwrap->geo->pi_format) {
		case XNVME_SPEC_NVM_NS_16B_GUARD:
			if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK)
				ctx->cmd.nvm.ilbrt = (uint32_t)slba;
			break;
		case XNVME_SPEC_NVM_NS_64B_GUARD:
			if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK) {
				ctx->cmd.nvm.ilbrt = (uint32_t)slba;
				ctx->cmd.common.cdw03 = ((slba >> 32) & 0xffff);
			}
			break;
		default:
			break;
		}
		if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) {
			ctx->cmd.nvm.lbat = o->apptag;
			ctx->cmd.nvm.lbatm = o->apptag_mask;
		}
		break;
	case XNVME_PI_TYPE3:
		if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) {
			ctx->cmd.nvm.lbat = o->apptag;
			ctx->cmd.nvm.lbatm = o->apptag_mask;
		}
		break;
	case XNVME_PI_DISABLE:
		break;
	}

	if (vectored_io) {
		xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
		xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;
		if (fwrap->md_nbytes && fwrap->lba_pow2) {
			xd->md_iovec[io_u->index].iov_base = fio_req->md_buf;
			xd->md_iovec[io_u->index].iov_len = fwrap->md_nbytes * (nlb + 1);
			err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
					      &xd->md_iovec[io_u->index], 1,
					      fwrap->md_nbytes * (nlb + 1));
		} else {
			err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
					      NULL, 0, 0);
		}
	} else {
		if (fwrap->md_nbytes && fwrap->lba_pow2)
			err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen,
					     fio_req->md_buf, fwrap->md_nbytes * (nlb + 1));
		else
			err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);
	}
	switch (err) {
	case 0:
		return FIO_Q_QUEUED;

	case -EBUSY:
	case -EAGAIN:
		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
		return FIO_Q_BUSY;

	default:
		log_err("ioeng->queue(): err: '%d'\n", err);

		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

		io_u->error = abs(err);
		assert(false);
		return FIO_Q_COMPLETED;
	}
}
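
/*
 * Worked example for the slba/nlb computation in xnvme_fioe_queue() above
 * (hypothetical values): with a power-of-two 512-byte LBA, ssw=9, an I/O at
 * offset=4096 with xfer_buflen=8192 yields slba = 4096 >> 9 = 8 and
 * nlb = (8192 >> 9) - 1 = 15, i.e. 16 LBAs in NVMe's zero-based count.
 */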
static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;

	dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, xd->nopen);

	--(xd->nopen);

	return 0;
}
static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
{
	struct xnvme_fioe_data *xd = td->io_ops_data;

	dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen);

	if (f->fileno > (int)xd->nallocated) {
		log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n");
		return 1;
	}
	if (xd->files[f->fileno].fio_file != f) {
		log_err("ioeng->open(): fio_file != f; invalid assumption\n");
		return 1;
	}

	++(xd->nopen);

	return 0;
}
static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
{
	/* Consider only doing this with be:spdk */
	return 0;
}
static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
					 unsigned int *max_open_zones)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	const struct xnvme_spec_znd_idfy_ns *zns;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
	    f->filetype != FIO_TYPE_CHAR) {
		log_info("ioeng->get_max_open_zones(): ignoring filetype: %d\n", f->filetype);
		return 0;
	}
	err_lock = pthread_mutex_lock(&g_serialize);
	if (err_lock) {
		log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);
		return -err_lock;
	}

	dev = xnvme_dev_open(f->file_name, &opts);
	if (!dev) {
		log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", errno);
		err = -errno;
		goto exit;
	}
	if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {
		errno = EINVAL;
		err = -errno;
		goto exit;
	}

	zns = (void *)xnvme_dev_get_ns_css(dev);
	if (!zns) {
		log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", errno);
		err = -errno;
		goto exit;
	}

	/*
	 * Intentional overflow: the value is zero-based and NVMe defines
	 * 0xFFFFFFFF as unlimited, so adding one overflows to 0, which is
	 * how fio indicates unlimited; otherwise this just converts to
	 * one-based.
	 */
	*max_open_zones = zns->mor + 1;

exit:
	xnvme_dev_close(dev);
	err_lock = pthread_mutex_unlock(&g_serialize);
	if (err_lock)
		log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
			err_lock);

	return err;
}
/**
 * Currently, this function is called before I/O engine initialization, so we
 * cannot consult the file-wrapping done when 'fioe' initializes.
 * Instead we just open based on the given filename.
 *
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --be option in this usecase
 */
static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
				      enum zbd_zoned_model *model)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
	    f->filetype != FIO_TYPE_CHAR) {
		log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);
		return -EINVAL;
	}

	err = pthread_mutex_lock(&g_serialize);
	if (err) {
		log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);
		return -err;
	}

	dev = xnvme_dev_open(f->file_name, &opts);
	if (!dev) {
		log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
			f->file_name, errno);
		err = -errno;
		goto exit;
	}

	switch (xnvme_dev_get_geo(dev)->type) {
	case XNVME_GEO_UNKNOWN:
		dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);
		*model = ZBD_NONE;
		break;

	case XNVME_GEO_CONVENTIONAL:
		dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);
		*model = ZBD_NONE;
		break;

	case XNVME_GEO_ZONED:
		dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
		*model = ZBD_HOST_MANAGED;
		break;

	default:
		dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);
		*model = ZBD_NONE;
		err = -EINVAL;
		break;
	}

exit:
	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
	if (err_lock)
		log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);

	return err;
}
/**
 * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
 *
 * The implementation converts the NVMe Zoned Command Set log-pages for Zone
 * descriptors into the Linux Kernel Zoned Block Report format.
 *
 * NOTE: This function is called before I/O engine initialization, that is,
 * before ``_dev_open`` has been called and file-wrapping is set up. Thus it
 * has to do the ``_dev_open`` itself, and shut it down again once it is done
 * retrieving the log-pages and converting them to the report format.
 *
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --async option in this usecase
 */
static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
				   struct zbd_zone *zbdz, unsigned int nr_zones)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
	struct xnvme_dev *dev = NULL;
	const struct xnvme_geo *geo = NULL;
	struct xnvme_znd_report *rprt = NULL;
	uint32_t ssw;
	uint64_t slba;
	unsigned int limit = 0;
	int err = 0, err_lock;

	dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,
	       nr_zones);

	err = pthread_mutex_lock(&g_serialize);
	if (err) {
		log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
			err);
		return -err;
	}

	dev = xnvme_dev_open(f->file_name, &opts);
	if (!dev) {
		log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,
			errno);
		goto exit;
	}

	geo = xnvme_dev_get_geo(dev);
	ssw = xnvme_dev_get_ssw(dev);
	lbafe = xnvme_znd_dev_get_lbafe(dev);

	limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;

	dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);

	slba = ((offset >> ssw) / geo->nsect) * geo->nsect;

	rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
	if (!rprt) {
		log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
			f->file_name, errno);
		err = -errno;
		goto exit;
	}
	if (rprt->nentries != limit) {
		log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);
		err = 1;
		goto exit;
	}
	if (offset > geo->tbytes) {
		log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);
		goto exit;
	}

	/* Transform the zone-report */
	for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
		struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);

		zbdz[idx].start = descr->zslba << ssw;
		zbdz[idx].len = lbafe->zsze << ssw;
		zbdz[idx].capacity = descr->zcap << ssw;
		zbdz[idx].wp = descr->wp << ssw;

		switch (descr->zt) {
		case XNVME_SPEC_ZND_TYPE_SEQWR:
			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
			break;

		default:
			log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
				f->file_name, zbdz[idx].start);
			err = -EIO;
			goto exit;
		}

		switch (descr->zs) {
		case XNVME_SPEC_ZND_STATE_EMPTY:
			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
			break;
		case XNVME_SPEC_ZND_STATE_IOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
			break;
		case XNVME_SPEC_ZND_STATE_EOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
			break;
		case XNVME_SPEC_ZND_STATE_CLOSED:
			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
			break;
		case XNVME_SPEC_ZND_STATE_FULL:
			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
			break;

		case XNVME_SPEC_ZND_STATE_RONLY:
		case XNVME_SPEC_ZND_STATE_OFFLINE:
		default:
			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
			break;
		}
	}

exit:
	xnvme_buf_virt_free(rprt);

	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
	if (err_lock)
		log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);

	dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);

	return err ? err : (int)limit;
}
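
/*
 * Example of the LBA-to-byte conversion in the zone-report transform above
 * (hypothetical values): with ssw=9 (512-byte LBAs), a zone with
 * zslba=0x10000 maps to zbdz.start = 0x10000 << 9 = 0x2000000 bytes.
 */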
/**
 * NOTE: This function may get called before I/O engine initialization, that
 * is, before ``_dev_open`` has been called and file-wrapping is set up. In
 * that case it has to do ``_dev_open`` itself, and shut it down again once it
 * is done resetting the write pointer of the zones.
 */
static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
			       uint64_t length)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	struct xnvme_dev *dev = NULL;
	const struct xnvme_geo *geo = NULL;
	uint64_t first, last;
	uint32_t ssw;
	uint32_t nsid;
	int err = 0, err_lock;

	if (td->io_ops_data) {
		xd = td->io_ops_data;
		fwrap = &xd->files[f->fileno];

		assert(fwrap->dev);
		assert(fwrap->geo);

		dev = fwrap->dev;
		geo = fwrap->geo;
		ssw = fwrap->ssw;
	} else {
		err = pthread_mutex_lock(&g_serialize);
		if (err) {
			log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);
			return -err;
		}

		dev = xnvme_dev_open(f->file_name, &opts);
		if (!dev) {
			log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
				f->file_name, errno);
			goto exit;
		}
		geo = xnvme_dev_get_geo(dev);
		ssw = xnvme_dev_get_ssw(dev);
	}

	nsid = xnvme_dev_get_nsid(dev);

	first = ((offset >> ssw) / geo->nsect) * geo->nsect;
	last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
	dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last);

	for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
		struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);

		if (zslba >= (geo->nsect * geo->nzone)) {
			log_err("ioeng->reset_wp(): out-of-bounds\n");
			err = 0;
			break;
		}

		err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
					  XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
		if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
			err = err ? err : -EIO;
			log_err("ioeng->reset_wp(): err(%d), sc(%d)", err, ctx.cpl.status.sc);
			goto exit;
		}
	}

exit:
	if (!td->io_ops_data) {
		xnvme_dev_close(dev);

		err_lock = pthread_mutex_unlock(&g_serialize);
		if (err_lock)
			log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
	}

	return err;
}
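
/*
 * Worked example for the zone-alignment in xnvme_fioe_reset_wp() above
 * (hypothetical values): with ssw=9 and nsect=0x1000 LBAs per zone, an
 * offset of 0x300200 bytes gives ((0x300200 >> 9) / 0x1000) * 0x1000 =
 * 0x1000, i.e. 'first' is rounded down to the start of the containing zone.
 */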
static int xnvme_fioe_fetch_ruhs(struct thread_data *td, struct fio_file *f,
				 struct fio_ruhs_info *fruhs_info)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	struct xnvme_spec_ruhs *ruhs;
	struct xnvme_cmd_ctx ctx;
	uint32_t ruhs_nbytes;
	uint32_t nsid;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_CHAR && f->filetype != FIO_TYPE_FILE) {
		log_err("ioeng->fdp_ruhs(): ignoring filetype: %d\n", f->filetype);
		return -EINVAL;
	}

	err = pthread_mutex_lock(&g_serialize);
	if (err) {
		log_err("ioeng->fdp_ruhs(): pthread_mutex_lock(), err(%d)\n", err);
		return -err;
	}

	dev = xnvme_dev_open(f->file_name, &opts);
	if (!dev) {
		log_err("ioeng->fdp_ruhs(): xnvme_dev_open(%s) failed, errno: %d\n",
			f->file_name, errno);
		err = -errno;
		goto exit;
	}

	ruhs_nbytes = sizeof(*ruhs) + (FDP_MAX_RUHS * sizeof(struct xnvme_spec_ruhs_desc));
	ruhs = xnvme_buf_alloc(dev, ruhs_nbytes);
	if (!ruhs) {
		err = -errno;
		goto exit;
	}
	memset(ruhs, 0, ruhs_nbytes);

	ctx = xnvme_cmd_ctx_from_dev(dev);
	nsid = xnvme_dev_get_nsid(dev);

	err = xnvme_nvm_mgmt_recv(&ctx, nsid, XNVME_SPEC_IO_MGMT_RECV_RUHS, 0, ruhs, ruhs_nbytes);
	if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
		err = err ? err : -EIO;
		log_err("ioeng->fdp_ruhs(): err(%d), sc(%d)", err, ctx.cpl.status.sc);
		goto free_buffer;
	}

	fruhs_info->nr_ruhs = ruhs->nruhsd;
	for (uint32_t idx = 0; idx < fruhs_info->nr_ruhs; ++idx) {
		fruhs_info->plis[idx] = le16_to_cpu(ruhs->desc[idx].pi);
	}

free_buffer:
	xnvme_buf_free(dev, ruhs);
exit:
	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
	if (err_lock)
		log_err("ioeng->fdp_ruhs(): pthread_mutex_unlock(), err(%d)\n", err_lock);

	return err;
}
static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
{
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	int ret = 0, err;

	if (fio_file_size_known(f))
		return 0;

	ret = pthread_mutex_lock(&g_serialize);
	if (ret) {
		log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
		return -ret;
	}

	dev = xnvme_dev_open(f->file_name, &opts);
	if (!dev) {
		log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, errno);
		ret = -errno;
		goto exit;
	}

	f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
	fio_file_set_size_known(f);

	if (td->o.zone_mode == ZONE_MODE_ZBD)
		f->filetype = FIO_TYPE_BLOCK;

exit:
	xnvme_dev_close(dev);
	err = pthread_mutex_unlock(&g_serialize);
	if (err)
		log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);

	return ret;
}
FIO_STATIC struct ioengine_ops ioengine = {
	.name = "xnvme",
	.version = FIO_IOOPS_VERSION,
	.options = options,
	.option_struct_size = sizeof(struct xnvme_fioe_options),
	.flags = FIO_DISKLESSIO | FIO_NODISKUTIL | FIO_NOEXTEND | FIO_MEMALIGN | FIO_RAWIO,

	.cleanup = xnvme_fioe_cleanup,
	.init = xnvme_fioe_init,

	.iomem_free = xnvme_fioe_iomem_free,
	.iomem_alloc = xnvme_fioe_iomem_alloc,

	.io_u_free = xnvme_fioe_io_u_free,
	.io_u_init = xnvme_fioe_io_u_init,

	.event = xnvme_fioe_event,
	.getevents = xnvme_fioe_getevents,
	.queue = xnvme_fioe_queue,

	.close_file = xnvme_fioe_close,
	.open_file = xnvme_fioe_open,
	.get_file_size = xnvme_fioe_get_file_size,

	.invalidate = xnvme_fioe_invalidate,
	.get_max_open_zones = xnvme_fioe_get_max_open_zones,
	.get_zoned_model = xnvme_fioe_get_zoned_model,
	.report_zones = xnvme_fioe_report_zones,
	.reset_wp = xnvme_fioe_reset_wp,

	.fdp_fetch_ruhs = xnvme_fioe_fetch_ruhs,
};
static void fio_init fio_xnvme_register(void)
{
	register_ioengine(&ioengine);
}

static void fio_exit fio_xnvme_unregister(void)
{
	unregister_ioengine(&ioengine);
}