 * IO engine using the xNVMe C API.
 * See: http://xnvme.io/
 * SPDX-License-Identifier: Apache-2.0
#include "zbd_types.h"
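/*
 * Serializes xNVMe device open/close and other setup/teardown paths across
 * fio threads; as seen below, every such path takes this single lock.
 */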
static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;

struct xnvme_fioe_fwrap {
	/* fio file representation */
	struct fio_file *fio_file;

	/* xNVMe device handle */
	struct xnvme_dev *dev;
	/* xNVMe device geometry */
	const struct xnvme_geo *geo;

	struct xnvme_queue *queue;

XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")
struct xnvme_fioe_data {
	/* I/O completion queue */

	/* # of iocq entries; incremented via getevents()/cb_pool() */

	 * # of errors; incremented when observed on completion via
	 * getevents()/cb_pool()

	/* Controls which device/file to select */

	/* Number of devices/files for which open() has been called */

	/* Number of devices/files allocated in files[] */

	struct iovec *md_iovec;

	struct xnvme_fioe_fwrap files[];

XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")
struct xnvme_fioe_request {
	/* Context for NVMe PI */
	struct xnvme_pi_ctx pi_ctx;

	/* Separate metadata buffer pointer */

struct xnvme_fioe_options {
	unsigned int sqpoll_thread;
	unsigned int xnvme_dev_nsid;
	unsigned int xnvme_iovec;
	unsigned int md_per_io_size;
	unsigned int apptag_mask;
	char *xnvme_dev_subnqn;
static int str_pi_chk_cb(void *data, const char *str)
	struct xnvme_fioe_options *o = data;

	if (strstr(str, "GUARD") != NULL)
		o->prchk = XNVME_PI_FLAGS_GUARD_CHECK;
	if (strstr(str, "REFTAG") != NULL)
		o->prchk |= XNVME_PI_FLAGS_REFTAG_CHECK;
	if (strstr(str, "APPTAG") != NULL)
		o->prchk |= XNVME_PI_FLAGS_APPTAG_CHECK;
static struct fio_option options[] = {
	.lname = "High Priority",
	.type = FIO_OPT_STR_SET,
	.off1 = offsetof(struct xnvme_fioe_options, hipri),
	.help = "Use polled IO completions",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "sqthread_poll",
	.lname = "Kernel SQ thread polling",
	.type = FIO_OPT_STR_SET,
	.off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
	.help = "Offload submission/completion to kernel thread",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.lname = "xNVMe Backend",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
	.help = "Select xNVMe backend [spdk,linux,fbsd]",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.lname = "xNVMe Memory Backend",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_mem),
	.help = "Select xNVMe memory backend",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_async",
	.lname = "xNVMe Asynchronous command-interface",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
	.help = "Select xNVMe async. interface: "
		"[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_sync",
	.lname = "xNVMe Synchronous command-interface",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
	.help = "Select xNVMe sync. interface: [nvme,psync,block]",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_admin",
	.lname = "xNVMe Admin command-interface",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
	.help = "Select xNVMe admin cmd-interface: [nvme,block]",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_dev_nsid",
	.lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
	.help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_dev_subnqn",
	.lname = "Subsystem NQN for Fabrics",
	.type = FIO_OPT_STR_STORE,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_subnqn),
	.help = "Subsystem NQN for Fabrics",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "xnvme_iovec",
	.lname = "Vectored IOs",
	.type = FIO_OPT_STR_SET,
	.off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
	.help = "Send vectored IOs",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "md_per_io_size",
	.lname = "Separate Metadata Buffer Size per I/O",
	.off1 = offsetof(struct xnvme_fioe_options, md_per_io_size),
	.help = "Size of separate metadata buffer per I/O (Default: 0)",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.lname = "Protection Information Action",
	.type = FIO_OPT_BOOL,
	.off1 = offsetof(struct xnvme_fioe_options, pi_act),
	.help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.lname = "Protection Information Check",
	.type = FIO_OPT_STR_STORE,
	.help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.lname = "Application Tag used in Protection Information",
	.off1 = offsetof(struct xnvme_fioe_options, apptag),
	.help = "Application Tag used in Protection Information field (Default: 0x1234)",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,

	.name = "apptag_mask",
	.lname = "Application Tag Mask",
	.off1 = offsetof(struct xnvme_fioe_options, apptag_mask),
	.help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
	.category = FIO_OPT_C_ENGINE,
	.group = FIO_OPT_G_XNVME,
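/*
 * Illustrative invocation exercising the options above; the device path and
 * the values are hypothetical examples, not recommendations:
 *
 *   fio --name=job --ioengine=xnvme --thread=1 --filename=/dev/ng0n1 \
 *       --xnvme_async=io_uring_cmd --xnvme_iovec=1 --md_per_io_size=4096 \
 *       --pi_act=0 --pi_chk=GUARD,REFTAG,APPTAG --apptag=0x1234 \
 *       --apptag_mask=0xffff
 */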
static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
	struct io_u *io_u = cb_arg;
	struct xnvme_fioe_data *xd = io_u->mmap_data;
	struct xnvme_fioe_request *fio_req = io_u->engine_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[io_u->file->fileno];
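	/*
	 * Bit 3 of pi_flags carries PRACT (see the (o->pi_act << 3 | o->prchk)
	 * encoding in xnvme_fioe_queue()): when set, the controller inserted
	 * or stripped the PI, so no host-side verification is needed below.
	 */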
	bool pi_act = (fio_req->pi_ctx.pi_flags >> 3);

	if (xnvme_cmd_ctx_cpl_status(ctx)) {
		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);

	if (!io_u->error && fwrap->geo->pi_type && (io_u->ddir == DDIR_READ) && !pi_act) {
		err = xnvme_pi_verify(&fio_req->pi_ctx, io_u->xfer_buf,
				      fio_req->md_buf, io_u->xfer_buflen / fwrap->lba_nbytes);

	xd->iocq[xd->completed++] = io_u;
	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_opts opts = xnvme_opts_default();

	opts.nsid = o->xnvme_dev_nsid;
	opts.subnqn = o->xnvme_dev_subnqn;
	opts.be = o->xnvme_be;
	opts.mem = o->xnvme_mem;
	opts.async = o->xnvme_async;
	opts.sync = o->xnvme_sync;
	opts.admin = o->xnvme_admin;

	opts.poll_io = o->hipri;
	opts.poll_sq = o->sqpoll_thread;

	opts.direct = td->o.odirect;
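	/*
	 * Options the user leaves unset keep the xnvme_opts_default() values;
	 * presumably xNVMe then resolves those to backend-specific defaults.
	 */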
static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
	xnvme_queue_term(fwrap->queue);

	xnvme_dev_close(fwrap->dev);

	memset(fwrap, 0, sizeof(*fwrap));

static void xnvme_fioe_cleanup(struct thread_data *td)
	struct xnvme_fioe_data *xd = NULL;

	if (!td->io_ops_data)

	xd = td->io_ops_data;

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err);
		/* NOTE: not returning here */

	for (uint64_t i = 0; i < xd->nallocated; ++i)
		_dev_close(td, &xd->files[i]);

	err = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err);

	td->io_ops_data = NULL;
 * Helper function setting up device handles as addressed by the naming
 * convention of the given `fio_file` filename.
 * Checks thread-options for explicit control of the asynchronous
 * implementation via the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}``
 * option.
static int _dev_open(struct thread_data *td, struct fio_file *f)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap;

	if (f->fileno > (int)xd->nallocated) {
		log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);

	fwrap = &xd->files[f->fileno];

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,

	fwrap->dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);

	fwrap->geo = xnvme_dev_get_geo(fwrap->dev);

	if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) {
		log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name);

	xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);

	fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
	fwrap->lba_nbytes = fwrap->geo->lba_nbytes;
	fwrap->md_nbytes = fwrap->geo->nbytes_oob;

	if (fwrap->geo->lba_extended)

	 * When PI action is set and PI size is equal to metadata size, the
	 * controller inserts/removes PI. So update the LBA data and metadata
	if (o->pi_act && fwrap->geo->pi_type &&
	    fwrap->geo->nbytes_oob == xnvme_pi_size(fwrap->geo->pi_format)) {
		if (fwrap->geo->lba_extended) {
			fwrap->lba_nbytes -= fwrap->geo->nbytes_oob;

		fwrap->md_nbytes = 0;
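	/*
	 * Example (hypothetical geometry): with pi_act=1 and 8-byte PI filling
	 * the OOB area, an extended 4104-byte LBA is handled as 4096 bytes of
	 * data (lba_nbytes 4104 -> 4096); with separate metadata, md_nbytes
	 * drops to 0 since the controller supplies/strips the PI itself.
	 */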
	fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
	fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
	fio_file_set_size_known(fwrap->fio_file);

	err = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,

	xnvme_queue_term(fwrap->queue);
	xnvme_dev_close(fwrap->dev);

	err = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,

static int xnvme_fioe_init(struct thread_data *td)
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_options *o = td->eo;

	if (!td->o.use_thread) {
		log_err("ioeng->init(): --thread=1 is required\n");

	/* Allocate xd and iocq */
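	/*
	 * files[] is a flexible array member, so a single allocation covers
	 * the engine data plus one fwrap per fio file.
	 */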
	xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
		log_err("ioeng->init(): !calloc(), err(%d)\n", errno);

	xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
		log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno);

	if (o->xnvme_iovec) {
		xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
			log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);

	if (o->xnvme_iovec && o->md_per_io_size) {
		xd->md_iovec = calloc(td->o.iodepth, sizeof(*xd->md_iovec));
			log_err("ioeng->init(): !calloc(xd->md_iovec), err(%d)\n", errno);

	td->io_ops_data = xd;

	for_each_file(td, f, i)
		if (_dev_open(td, f)) {
			 * Note: We are not freeing xd, iocq, iovec and md_iovec.
			 * This will be done as part of the cleanup routine.
			log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);

	if (xd->nallocated != td->o.nr_files) {
		log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
/* NOTE: using the first device for buffer-allocators */
static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[0];

		log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");

	td->orig_buffer = xnvme_buf_alloc(fwrap->dev, total_mem);

	return td->orig_buffer == NULL;
/* NOTE: using the first device for buffer-allocators */
static void xnvme_fioe_iomem_free(struct thread_data *td)
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;

	if (!td->io_ops_data)

	xd = td->io_ops_data;
	fwrap = &xd->files[0];

		log_err("ioeng->iomem_free(): failed; no dev-handle\n");

	xnvme_buf_free(fwrap->dev, td->orig_buffer);
static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
	struct xnvme_fioe_request *fio_req;
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = &xd->files[0];

		log_err("ioeng->io_u_init(): failed; no dev-handle\n");

	io_u->mmap_data = td->io_ops_data;
	io_u->engine_data = NULL;

	fio_req = calloc(1, sizeof(*fio_req));
		log_err("ioeng->io_u_init(): !calloc(fio_req), err(%d)\n", errno);

	if (o->md_per_io_size) {
		fio_req->md_buf = xnvme_buf_alloc(fwrap->dev, o->md_per_io_size);
		if (!fio_req->md_buf) {

	io_u->engine_data = fio_req;
static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	struct xnvme_fioe_request *fio_req = NULL;

	if (!td->io_ops_data)

	xd = td->io_ops_data;
	fwrap = &xd->files[0];

		log_err("ioeng->io_u_free(): failed; no dev-handle\n");

	fio_req = io_u->engine_data;
		xnvme_buf_free(fwrap->dev, fio_req->md_buf);

	io_u->mmap_data = NULL;
static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
	struct xnvme_fioe_data *xd = td->io_ops_data;

	assert((unsigned)event < xd->completed);

	return xd->iocq[event];

static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
				const struct timespec *t)
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	int nfiles = xd->nallocated;
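	/*
	 * Reaping resumes at the file following the one served last time
	 * (xd->prev), then proceeds round-robin over all files until at least
	 * 'min' completions have been gathered.
	 */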
	if (xd->prev != -1 && ++xd->prev < nfiles) {
		fwrap = &xd->files[xd->prev];

	if (fwrap == NULL || xd->cur == nfiles) {
		fwrap = &xd->files[0];

	while (fwrap != NULL && xd->cur < nfiles && err >= 0) {
		err = xnvme_queue_poke(fwrap->queue, max - xd->completed);

			log_err("ioeng->getevents(): unhandled IO error\n");

		if (xd->completed >= min) {
			return xd->completed;

		fwrap = &xd->files[xd->cur];

	return xd->completed;
static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
	struct xnvme_fioe_data *xd = td->io_ops_data;
	struct xnvme_fioe_options *o = td->eo;
	struct xnvme_fioe_fwrap *fwrap;
	struct xnvme_cmd_ctx *ctx;
	struct xnvme_fioe_request *fio_req = io_u->engine_data;
	bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec;
	uint32_t dir = io_u->dtype;

	fio_ro_check(td, io_u);

	fwrap = &xd->files[io_u->file->fileno];
	nsid = xnvme_dev_get_nsid(fwrap->dev);

	if (fwrap->lba_pow2) {
		slba = io_u->offset >> fwrap->ssw;
		nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;

		slba = io_u->offset / fwrap->lba_nbytes;
		nlb = (io_u->xfer_buflen / fwrap->lba_nbytes) - 1;
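	/*
	 * Example: with 512-byte LBAs (ssw = 9), offset 4096 gives slba = 8,
	 * and an 8192-byte transfer gives nlb = 15; nlb is zero-based per the
	 * NVMe spec.
	 */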
	ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
	ctx->async.cb_arg = io_u;

	ctx->cmd.common.nsid = nsid;
	ctx->cmd.nvm.slba = slba;
	ctx->cmd.nvm.nlb = nlb;

	ctx->cmd.nvm.dtype = io_u->dtype;
	ctx->cmd.nvm.cdw13.dspec = io_u->dspec;

	switch (io_u->ddir) {
		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;

		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;

		log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

		io_u->error = ENOSYS;

		return FIO_Q_COMPLETED;
	if (fwrap->geo->pi_type && !o->pi_act) {
		err = xnvme_pi_ctx_init(&fio_req->pi_ctx, fwrap->lba_nbytes,
					fwrap->geo->nbytes_oob, fwrap->geo->lba_extended,
					fwrap->geo->pi_loc, fwrap->geo->pi_type,
					(o->pi_act << 3 | o->prchk), slba, o->apptag_mask,
					o->apptag, fwrap->geo->pi_format);
			log_err("ioeng->queue(): err: '%d'\n", err);

			xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

			io_u->error = abs(err);
			return FIO_Q_COMPLETED;

		if (io_u->ddir == DDIR_WRITE)
			xnvme_pi_generate(&fio_req->pi_ctx, io_u->xfer_buf, fio_req->md_buf,

	if (fwrap->geo->pi_type)
		ctx->cmd.nvm.prinfo = (o->pi_act << 3 | o->prchk);
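	/* PRINFO layout: bit 3 is PRACT, bits 0-2 are the PRCHK mask. */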
	switch (fwrap->geo->pi_type) {
		switch (fwrap->geo->pi_format) {
		case XNVME_SPEC_NVM_NS_16B_GUARD:
			if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK)
				ctx->cmd.nvm.ilbrt = (uint32_t)slba;

		case XNVME_SPEC_NVM_NS_64B_GUARD:
			if (o->prchk & XNVME_PI_FLAGS_REFTAG_CHECK) {
				ctx->cmd.nvm.ilbrt = (uint32_t)slba;
				ctx->cmd.common.cdw03 = ((slba >> 32) & 0xffff);
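				/*
				 * 64b-guard PI carries a 48-bit reference tag:
				 * the low 32 bits go in ILBRT, the next 16
				 * bits in CDW3.
				 */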
		if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) {
			ctx->cmd.nvm.lbat = o->apptag;
			ctx->cmd.nvm.lbatm = o->apptag_mask;

		if (o->prchk & XNVME_PI_FLAGS_APPTAG_CHECK) {
			ctx->cmd.nvm.lbat = o->apptag;
			ctx->cmd.nvm.lbatm = o->apptag_mask;

	case XNVME_PI_DISABLE:
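	/*
	 * Vectored path: hand the fio buffer to xNVMe as an iovec via
	 * xnvme_cmd_passv(), with a separate metadata iovec when a metadata
	 * buffer is in play; otherwise use plain xnvme_cmd_pass() with
	 * contiguous buffers.
	 */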
		xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
		xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;

		if (fwrap->md_nbytes && fwrap->lba_pow2) {
			xd->md_iovec[io_u->index].iov_base = fio_req->md_buf;
			xd->md_iovec[io_u->index].iov_len = fwrap->md_nbytes * (nlb + 1);

			err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
					      &xd->md_iovec[io_u->index], 1,
					      fwrap->md_nbytes * (nlb + 1));

			err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,

		if (fwrap->md_nbytes && fwrap->lba_pow2)
			err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen,
					     fio_req->md_buf, fwrap->md_nbytes * (nlb + 1));

			err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);

		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

		log_err("ioeng->queue(): err: '%d'\n", err);

		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

		io_u->error = abs(err);

		return FIO_Q_COMPLETED;
static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
	struct xnvme_fioe_data *xd = td->io_ops_data;

	dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, xd->nopen);

static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
	struct xnvme_fioe_data *xd = td->io_ops_data;

	dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen);

	if (f->fileno > (int)xd->nallocated) {
		log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n");

	if (xd->files[f->fileno].fio_file != f) {
		log_err("ioeng->open(): fio_file != f; invalid assumption\n");
static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
	/* Consider only doing this with be:spdk */

static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
					 unsigned int *max_open_zones)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	const struct xnvme_spec_znd_idfy_ns *zns;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
	    f->filetype != FIO_TYPE_CHAR) {
		log_info("ioeng->get_max_open_zones(): ignoring filetype: %d\n", f->filetype);
	err_lock = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);

	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", errno);
	if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {

	zns = (void *)xnvme_dev_get_ns_css(dev);
		log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", errno);
	 * Intentional overflow: the value is zero-based and NVMe defines
	 * 0xFFFFFFFF as unlimited, thus overflowing to 0, which is how fio
	 * indicates unlimited; otherwise this just converts to one-based.
	*max_open_zones = zns->mor + 1;
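	/*
	 * e.g. mor == 0xFFFFFFFF (unlimited) wraps to 0, which is fio's
	 * "unlimited"; mor == 13 yields 14 open zones.
	 */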
	xnvme_dev_close(dev);
	err_lock = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
 * Currently, this function is called before I/O engine initialization, so
 * we cannot consult the file-wrapping done when 'fioe' initializes.
 * Instead we just open based on the given filename.
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --be option in this use case
static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
				      enum zbd_zoned_model *model)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
	    f->filetype != FIO_TYPE_CHAR) {
		log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);

	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
			f->file_name, errno);

	switch (xnvme_dev_get_geo(dev)->type) {
	case XNVME_GEO_UNKNOWN:
		dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);

	case XNVME_GEO_CONVENTIONAL:
		dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);

	case XNVME_GEO_ZONED:
		dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
		*model = ZBD_HOST_MANAGED;

		dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);

	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);
 * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
 * The implementation converts the NVMe Zoned Command Set log-pages for Zone
 * descriptors into the Linux Kernel Zoned Block Report format.
 * NOTE: This function is called before I/O engine initialization, that is,
 * before ``_dev_open`` has been called and file-wrapping is set up. Thus it
 * has to do the ``_dev_open`` itself, and shut it down again once it is done
 * retrieving the log-pages and converting them to the report format.
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --async option in this use case
static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
				   struct zbd_zone *zbdz, unsigned int nr_zones)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
	struct xnvme_dev *dev = NULL;
	const struct xnvme_geo *geo = NULL;
	struct xnvme_znd_report *rprt = NULL;
	unsigned int limit = 0;
	int err = 0, err_lock;

	dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,

	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,

	geo = xnvme_dev_get_geo(dev);
	ssw = xnvme_dev_get_ssw(dev);
	lbafe = xnvme_znd_dev_get_lbafe(dev);

	limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;

	dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);

	slba = ((offset >> ssw) / geo->nsect) * geo->nsect;
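	/* Round the byte offset down to the first LBA of its containing zone. */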
	rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
		log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
			f->file_name, errno);

	if (rprt->nentries != limit) {
		log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);

	if (offset > geo->tbytes) {
		log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);

	/* Transform the zone-report */
	for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
		struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);

		zbdz[idx].start = descr->zslba << ssw;
		zbdz[idx].len = lbafe->zsze << ssw;
		zbdz[idx].capacity = descr->zcap << ssw;
		zbdz[idx].wp = descr->wp << ssw;

		switch (descr->zt) {
		case XNVME_SPEC_ZND_TYPE_SEQWR:
			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;

			log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
				f->file_name, zbdz[idx].start);

		switch (descr->zs) {
		case XNVME_SPEC_ZND_STATE_EMPTY:
			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
		case XNVME_SPEC_ZND_STATE_IOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
		case XNVME_SPEC_ZND_STATE_EOPEN:
			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
		case XNVME_SPEC_ZND_STATE_CLOSED:
			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
		case XNVME_SPEC_ZND_STATE_FULL:
			zbdz[idx].cond = ZBD_ZONE_COND_FULL;

		case XNVME_SPEC_ZND_STATE_RONLY:
		case XNVME_SPEC_ZND_STATE_OFFLINE:
			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;

	xnvme_buf_virt_free(rprt);

	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);

	dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);

	return err ? err : (int)limit;
 * NOTE: This function may get called before I/O engine initialization, that is,
 * before ``_dev_open`` has been called and file-wrapping is set up. In such
 * a case it has to do ``_dev_open`` itself, and shut it down again once it is
 * done resetting the write pointer of zones.
static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_fioe_data *xd = NULL;
	struct xnvme_fioe_fwrap *fwrap = NULL;
	struct xnvme_dev *dev = NULL;
	const struct xnvme_geo *geo = NULL;
	uint64_t first, last;
	int err = 0, err_lock;

	if (td->io_ops_data) {
		xd = td->io_ops_data;
		fwrap = &xd->files[f->fileno];

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);

	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
			f->file_name, errno);

	geo = xnvme_dev_get_geo(dev);
	ssw = xnvme_dev_get_ssw(dev);

	nsid = xnvme_dev_get_nsid(dev);

	first = ((offset >> ssw) / geo->nsect) * geo->nsect;
	last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
	dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last);
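	/*
	 * 'first' and 'last' are zone-aligned starting LBAs; each iteration of
	 * the loop below resets the write pointer of one zone.
	 */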
	for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
		struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);

		if (zslba >= (geo->nsect * geo->nzone)) {
			log_err("ioeng->reset_wp(): out-of-bounds\n");

		err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
					  XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
		if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
			err = err ? err : -EIO;
			log_err("ioeng->reset_wp(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc);
	if (!td->io_ops_data) {
		xnvme_dev_close(dev);

		err_lock = pthread_mutex_unlock(&g_serialize);
			log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
static int xnvme_fioe_fetch_ruhs(struct thread_data *td, struct fio_file *f,
				 struct fio_ruhs_info *fruhs_info)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;
	struct xnvme_spec_ruhs *ruhs;
	struct xnvme_cmd_ctx ctx;
	uint32_t ruhs_nbytes;
	int err = 0, err_lock;

	if (f->filetype != FIO_TYPE_CHAR && f->filetype != FIO_TYPE_FILE) {
		log_err("ioeng->fdp_ruhs(): ignoring filetype: %d\n", f->filetype);

	err = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->fdp_ruhs(): pthread_mutex_lock(), err(%d)\n", err);

	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("ioeng->fdp_ruhs(): xnvme_dev_open(%s) failed, errno: %d\n",
			f->file_name, errno);

	ruhs_nbytes = sizeof(*ruhs) + (FDP_MAX_RUHS * sizeof(struct xnvme_spec_ruhs_desc));
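	/*
	 * The buffer is sized for the most RUH descriptors fio can consume
	 * (FDP_MAX_RUHS); the device reports the actual count in ruhs->nruhsd.
	 */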
	ruhs = xnvme_buf_alloc(dev, ruhs_nbytes);

	memset(ruhs, 0, ruhs_nbytes);

	ctx = xnvme_cmd_ctx_from_dev(dev);
	nsid = xnvme_dev_get_nsid(dev);

	err = xnvme_nvm_mgmt_recv(&ctx, nsid, XNVME_SPEC_IO_MGMT_RECV_RUHS, 0, ruhs, ruhs_nbytes);

	if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
		err = err ? err : -EIO;
		log_err("ioeng->fdp_ruhs(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc);
	fruhs_info->nr_ruhs = ruhs->nruhsd;
	for (uint32_t idx = 0; idx < fruhs_info->nr_ruhs; ++idx) {
		fruhs_info->plis[idx] = le16_to_cpu(ruhs->desc[idx].pi);

	xnvme_buf_free(dev, ruhs);

	xnvme_dev_close(dev);

	err_lock = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->fdp_ruhs(): pthread_mutex_unlock(), err(%d)\n", err_lock);

static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
	struct xnvme_dev *dev;

	if (fio_file_size_known(f))

	ret = pthread_mutex_lock(&g_serialize);
		log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
	dev = xnvme_dev_open(f->file_name, &opts);
		log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, errno);

	f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
	fio_file_set_size_known(f);

	if (td->o.zone_mode == ZONE_MODE_ZBD)
		f->filetype = FIO_TYPE_BLOCK;

	xnvme_dev_close(dev);
	err = pthread_mutex_unlock(&g_serialize);
		log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);
FIO_STATIC struct ioengine_ops ioengine = {
	.version = FIO_IOOPS_VERSION,
	.option_struct_size = sizeof(struct xnvme_fioe_options),
	.flags = FIO_DISKLESSIO | FIO_NODISKUTIL | FIO_NOEXTEND | FIO_MEMALIGN | FIO_RAWIO,
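	/*
	 * Diskless/raw operation: xNVMe owns the device handles, so fio skips
	 * fd-based plumbing, disk-utilization accounting, and file extension.
	 */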
	.cleanup = xnvme_fioe_cleanup,
	.init = xnvme_fioe_init,

	.iomem_free = xnvme_fioe_iomem_free,
	.iomem_alloc = xnvme_fioe_iomem_alloc,

	.io_u_free = xnvme_fioe_io_u_free,
	.io_u_init = xnvme_fioe_io_u_init,

	.event = xnvme_fioe_event,
	.getevents = xnvme_fioe_getevents,
	.queue = xnvme_fioe_queue,

	.close_file = xnvme_fioe_close,
	.open_file = xnvme_fioe_open,
	.get_file_size = xnvme_fioe_get_file_size,

	.invalidate = xnvme_fioe_invalidate,
	.get_max_open_zones = xnvme_fioe_get_max_open_zones,
	.get_zoned_model = xnvme_fioe_get_zoned_model,
	.report_zones = xnvme_fioe_report_zones,
	.reset_wp = xnvme_fioe_reset_wp,

	.fdp_fetch_ruhs = xnvme_fioe_fetch_ruhs,

static void fio_init fio_xnvme_register(void)
	register_ioengine(&ioengine);

static void fio_exit fio_xnvme_unregister(void)
	unregister_ioengine(&ioengine);