engines/xnvme: user space vfio based backend
engines/xnvme.c
/*
 * fio xNVMe IO Engine
 *
 * IO engine using the xNVMe C API.
 *
 * See: http://xnvme.io/
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#include <stdlib.h>
#include <assert.h>
#include <libxnvme.h>
#include <libxnvme_libconf.h>
#include <libxnvme_nvm.h>
#include <libxnvme_znd.h>
#include <libxnvme_spec_fs.h>
#include "fio.h"
#include "zbd_types.h"
#include "optgroup.h"

static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;

struct xnvme_fioe_fwrap {
        /* fio file representation */
        struct fio_file *fio_file;

        /* xNVMe device handle */
        struct xnvme_dev *dev;
        /* xNVMe device geometry */
        const struct xnvme_geo *geo;

        struct xnvme_queue *queue;

        uint32_t ssw;
        uint32_t lba_nbytes;

        uint8_t _pad[24];
};
XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")

struct xnvme_fioe_data {
        /* I/O completion queue */
        struct io_u **iocq;

        /* # of iocq entries; incremented via getevents()/cb_pool() */
        uint64_t completed;

        /*
         * # of errors; incremented when observed on completion via
         * getevents()/cb_pool()
         */
        uint64_t ecount;

        /* Controls which device/file to select */
        int32_t prev;
        int32_t cur;

        /* Number of devices/files for which open() has been called */
        int64_t nopen;
        /* Number of devices/files allocated in files[] */
        uint64_t nallocated;

        struct iovec *iovec;

        uint8_t _pad[8];

        struct xnvme_fioe_fwrap files[];
};
XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")

struct xnvme_fioe_options {
        void *padding;
        unsigned int hipri;
        unsigned int sqpoll_thread;
        unsigned int xnvme_dev_nsid;
        unsigned int xnvme_iovec;
        char *xnvme_be;
        char *xnvme_async;
        char *xnvme_sync;
        char *xnvme_admin;
};

static struct fio_option options[] = {
        {
                .name = "hipri",
                .lname = "High Priority",
                .type = FIO_OPT_STR_SET,
                .off1 = offsetof(struct xnvme_fioe_options, hipri),
                .help = "Use polled IO completions",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "sqthread_poll",
                .lname = "Kernel SQ thread polling",
                .type = FIO_OPT_STR_SET,
                .off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
                .help = "Offload submission/completion to kernel thread",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_be",
                .lname = "xNVMe Backend",
                .type = FIO_OPT_STR_STORE,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
                .help = "Select xNVMe backend [spdk,linux,fbsd]",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_async",
                .lname = "xNVMe Asynchronous command-interface",
                .type = FIO_OPT_STR_STORE,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
                .help = "Select xNVMe async. interface: "
                        "[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_sync",
                .lname = "xNVMe Synchronous command-interface",
                .type = FIO_OPT_STR_STORE,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
                .help = "Select xNVMe sync. interface: [nvme,psync,block]",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_admin",
                .lname = "xNVMe Admin command-interface",
                .type = FIO_OPT_STR_STORE,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
                .help = "Select xNVMe admin. cmd-interface: [nvme,block]",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_dev_nsid",
                .lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
                .type = FIO_OPT_INT,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
                .help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },
        {
                .name = "xnvme_iovec",
                .lname = "Vectored IOs",
                .type = FIO_OPT_STR_SET,
                .off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
                .help = "Send vectored IOs",
                .category = FIO_OPT_C_ENGINE,
                .group = FIO_OPT_G_XNVME,
        },

        {
                .name = NULL,
        },
};

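/*
 * Illustrative usage only (not part of the original source): a minimal fio
 * invocation exercising the options declared above. The device path and the
 * chosen values are example placeholders.
 *
 *   fio --name=example --ioengine=xnvme --thread=1 --direct=1 \
 *       --filename=/dev/nvme0n1 --xnvme_async=io_uring \
 *       --rw=randread --bs=4k --iodepth=16
 */
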
static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
{
        struct io_u *io_u = cb_arg;
        struct xnvme_fioe_data *xd = io_u->mmap_data;

        if (xnvme_cmd_ctx_cpl_status(ctx)) {
                xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
                xd->ecount += 1;
                io_u->error = EIO;
        }

        xd->iocq[xd->completed++] = io_u;
        xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
}

static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
{
        struct xnvme_fioe_options *o = td->eo;
        struct xnvme_opts opts = xnvme_opts_default();

        opts.nsid = o->xnvme_dev_nsid;
        opts.be = o->xnvme_be;
        opts.async = o->xnvme_async;
        opts.sync = o->xnvme_sync;
        opts.admin = o->xnvme_admin;

        opts.poll_io = o->hipri;
        opts.poll_sq = o->sqpoll_thread;

        opts.direct = td->o.odirect;

        return opts;
}

static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
{
        if (fwrap->dev)
                xnvme_queue_term(fwrap->queue);

        xnvme_dev_close(fwrap->dev);

        memset(fwrap, 0, sizeof(*fwrap));
}

static void xnvme_fioe_cleanup(struct thread_data *td)
{
        struct xnvme_fioe_data *xd = NULL;
        int err;

        if (!td->io_ops_data)
                return;

        xd = td->io_ops_data;

        err = pthread_mutex_lock(&g_serialize);
        if (err)
                log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err);
                /* NOTE: not returning here */

        for (uint64_t i = 0; i < xd->nallocated; ++i)
                _dev_close(td, &xd->files[i]);

        if (!err) {
                err = pthread_mutex_unlock(&g_serialize);
                if (err)
                        log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err);
        }

        free(xd->iocq);
        free(xd->iovec);
        free(xd);
        td->io_ops_data = NULL;
}

/**
 * Helper function setting up device handles as addressed by the naming
 * convention of the given `fio_file` filename.
 *
 * Checks thread-options for explicit control of the asynchronous implementation
 * via the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}`` option.
 */
static int _dev_open(struct thread_data *td, struct fio_file *f)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        struct xnvme_fioe_data *xd = td->io_ops_data;
        struct xnvme_fioe_fwrap *fwrap;
        int flags = 0;
        int err;

        if (f->fileno > (int)xd->nallocated) {
                log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);
                return 1;
        }

        fwrap = &xd->files[f->fileno];

        err = pthread_mutex_lock(&g_serialize);
        if (err) {
                log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
                        err);
                return -err;
        }

        fwrap->dev = xnvme_dev_open(f->file_name, &opts);
        if (!fwrap->dev) {
                log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);
                goto failure;
        }
        fwrap->geo = xnvme_dev_get_geo(fwrap->dev);

        if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) {
                log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name);
                goto failure;
        }
        xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);

        fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
        fwrap->lba_nbytes = fwrap->geo->lba_nbytes;

        fwrap->fio_file = f;
        fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
        fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
        fio_file_set_size_known(fwrap->fio_file);

        err = pthread_mutex_unlock(&g_serialize);
        if (err)
                log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
                        err);

        return 0;

failure:
        xnvme_queue_term(fwrap->queue);
        xnvme_dev_close(fwrap->dev);

        err = pthread_mutex_unlock(&g_serialize);
        if (err)
                log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
                        err);

        return 1;
}

static int xnvme_fioe_init(struct thread_data *td)
{
        struct xnvme_fioe_data *xd = NULL;
        struct fio_file *f;
        unsigned int i;

        if (!td->o.use_thread) {
                log_err("ioeng->init(): --thread=1 is required\n");
                return 1;
        }

        /* Allocate xd and iocq */
        xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
        if (!xd) {
                log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
                return 1;
        }

        xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
        if (!xd->iocq) {
                free(xd);
                log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno);
                return 1;
        }

        xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
        if (!xd->iovec) {
                free(xd->iocq);
                free(xd);
                log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);
                return 1;
        }

        xd->prev = -1;
        td->io_ops_data = xd;

        for_each_file(td, f, i)
        {
                if (_dev_open(td, f)) {
                        /*
                         * Note: We are not freeing xd, iocq and iovec. This
                         * will be done as part of the cleanup routine.
                         */
                        log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);
                        return 1;
                }

                ++(xd->nallocated);
        }

        if (xd->nallocated != td->o.nr_files) {
                log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
                return 1;
        }

        return 0;
}

/* NOTE: using the first device for buffer-allocators */
static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;
        struct xnvme_fioe_fwrap *fwrap = &xd->files[0];

        if (!fwrap->dev) {
                log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");
                return 1;
        }

        td->orig_buffer = xnvme_buf_alloc(fwrap->dev, total_mem);

        return td->orig_buffer == NULL;
}

/* NOTE: using the first device for buffer-allocators */
static void xnvme_fioe_iomem_free(struct thread_data *td)
{
        struct xnvme_fioe_data *xd = NULL;
        struct xnvme_fioe_fwrap *fwrap = NULL;

        if (!td->io_ops_data)
                return;

        xd = td->io_ops_data;
        fwrap = &xd->files[0];

        if (!fwrap->dev) {
                log_err("ioeng->iomem_free(): failed; no dev-handle\n");
                return;
        }

        xnvme_buf_free(fwrap->dev, td->orig_buffer);
}

static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
{
        io_u->mmap_data = td->io_ops_data;

        return 0;
}

static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
{
        io_u->mmap_data = NULL;
}

static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;

        assert(event >= 0);
        assert((unsigned)event < xd->completed);

        return xd->iocq[event];
}

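/*
 * Completions are reaped by round-robin'ing over the per-file queues: the
 * scan resumes at the file after the one that satisfied the previous call
 * (xd->prev), and each queue is poked via xnvme_queue_poke() until at least
 * 'min' events have been gathered into xd->iocq by cb_pool().
 */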
static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
                                const struct timespec *t)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;
        struct xnvme_fioe_fwrap *fwrap = NULL;
        int nfiles = xd->nallocated;
        int err = 0;

        if (xd->prev != -1 && ++xd->prev < nfiles) {
                fwrap = &xd->files[xd->prev];
                xd->cur = xd->prev;
        }

        xd->completed = 0;
        for (;;) {
                if (fwrap == NULL || xd->cur == nfiles) {
                        fwrap = &xd->files[0];
                        xd->cur = 0;
                }

                while (fwrap != NULL && xd->cur < nfiles && err >= 0) {
                        err = xnvme_queue_poke(fwrap->queue, max - xd->completed);
                        if (err < 0) {
                                switch (err) {
                                case -EBUSY:
                                case -EAGAIN:
                                        usleep(1);
                                        break;

                                default:
                                        log_err("ioeng->getevents(): unhandled IO error\n");
                                        assert(false);
                                        return 0;
                                }
                        }
                        if (xd->completed >= min) {
                                xd->prev = xd->cur;
                                return xd->completed;
                        }
                        xd->cur++;
                        fwrap = &xd->files[xd->cur];

                        if (err < 0) {
                                switch (err) {
                                case -EBUSY:
                                case -EAGAIN:
                                        usleep(1);
                                        break;
                                }
                        }
                }
        }

        xd->cur = 0;

        return xd->completed;
}

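/*
 * Byte-granular offsets and lengths from fio are converted to LBA units using
 * the device's shift-width (ssw): slba = offset >> ssw, and nlb is the
 * zero-based block count, (buflen >> ssw) - 1. For example, with 512-byte LBAs
 * (ssw = 9), a 4096-byte buffer at offset 8192 yields slba = 16 and nlb = 7
 * (i.e. 8 blocks).
 */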
static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;
        struct xnvme_fioe_fwrap *fwrap;
        struct xnvme_cmd_ctx *ctx;
        uint32_t nsid;
        uint64_t slba;
        uint16_t nlb;
        int err;
        bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec;

        fio_ro_check(td, io_u);

        fwrap = &xd->files[io_u->file->fileno];
        nsid = xnvme_dev_get_nsid(fwrap->dev);

        slba = io_u->offset >> fwrap->ssw;
        nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;

        ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
        ctx->async.cb_arg = io_u;

        ctx->cmd.common.nsid = nsid;
        ctx->cmd.nvm.slba = slba;
        ctx->cmd.nvm.nlb = nlb;

        switch (io_u->ddir) {
        case DDIR_READ:
                ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
                break;

        case DDIR_WRITE:
                ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
                break;

        default:
                log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
                xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

                io_u->error = ENOSYS;
                assert(false);
                return FIO_Q_COMPLETED;
        }

        if (vectored_io) {
                xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
                xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;

                err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen, NULL, 0,
                                      0);
        } else {
                err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);
        }
        switch (err) {
        case 0:
                return FIO_Q_QUEUED;

        case -EBUSY:
        case -EAGAIN:
                xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
                return FIO_Q_BUSY;

        default:
                log_err("ioeng->queue(): err: '%d'\n", err);

                xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);

                io_u->error = abs(err);
                assert(false);
                return FIO_Q_COMPLETED;
        }
}

static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;

        dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, xd->nopen);

        --(xd->nopen);

        return 0;
}

static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
{
        struct xnvme_fioe_data *xd = td->io_ops_data;

        dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen);

        if (f->fileno > (int)xd->nallocated) {
                log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n");
                return 1;
        }
        if (xd->files[f->fileno].fio_file != f) {
                log_err("ioeng->open(): fio_file != f; invalid assumption\n");
                return 1;
        }

        ++(xd->nopen);

        return 0;
}

static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
{
        /* Consider only doing this with be:spdk */
        return 0;
}

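/*
 * Reports the namespace's Maximum Open Resources (MOR) as fio's
 * max_open_zones. MOR is zero-based, hence the "+ 1" below; the NVMe
 * "unlimited" value 0xFFFFFFFF deliberately wraps to 0, which is how fio
 * expresses unlimited.
 */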
static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
                                         unsigned int *max_open_zones)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        struct xnvme_dev *dev;
        const struct xnvme_spec_znd_idfy_ns *zns;
        int err = 0, err_lock;

        if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
            f->filetype != FIO_TYPE_CHAR) {
                log_info("ioeng->get_max_open_zones(): ignoring filetype: %d\n", f->filetype);
                return 0;
        }
        err_lock = pthread_mutex_lock(&g_serialize);
        if (err_lock) {
                log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);
                return -err_lock;
        }

        dev = xnvme_dev_open(f->file_name, &opts);
        if (!dev) {
                log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", errno);
                err = -errno;
                goto exit;
        }
        if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {
                errno = EINVAL;
                err = -errno;
                goto exit;
        }

        zns = (void *)xnvme_dev_get_ns_css(dev);
        if (!zns) {
                log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", errno);
                err = -errno;
                goto exit;
        }

        /*
         * Intentional overflow: the value is zero-based and NVMe defines
         * 0xFFFFFFFF as unlimited, thus overflowing to 0, which is how fio
         * indicates unlimited; otherwise this simply converts to one-based.
         */
        *max_open_zones = zns->mor + 1;

exit:
        xnvme_dev_close(dev);
        err_lock = pthread_mutex_unlock(&g_serialize);
        if (err_lock)
                log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
                        err_lock);

        return err;
}

/**
 * Currently, this function is called before I/O engine initialization, so
 * we cannot consult the file-wrapping done when 'fioe' initializes.
 * Instead we just open based on the given filename.
 *
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --be option in this usecase
 */
static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
                                      enum zbd_zoned_model *model)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        struct xnvme_dev *dev;
        int err = 0, err_lock;

        if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
            f->filetype != FIO_TYPE_CHAR) {
                log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);
                return -EINVAL;
        }

        err = pthread_mutex_lock(&g_serialize);
        if (err) {
                log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);
                return -err;
        }

        dev = xnvme_dev_open(f->file_name, &opts);
        if (!dev) {
                log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
                        f->file_name, errno);
                err = -errno;
                goto exit;
        }

        switch (xnvme_dev_get_geo(dev)->type) {
        case XNVME_GEO_UNKNOWN:
                dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);
                *model = ZBD_NONE;
                break;

        case XNVME_GEO_CONVENTIONAL:
                dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);
                *model = ZBD_NONE;
                break;

        case XNVME_GEO_ZONED:
                dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
                *model = ZBD_HOST_MANAGED;
                break;

        default:
                dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);
                *model = ZBD_NONE;
                errno = EINVAL;
                err = -errno;
                break;
        }

exit:
        xnvme_dev_close(dev);

        err_lock = pthread_mutex_unlock(&g_serialize);
        if (err_lock)
                log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);

        return err;
}

/**
 * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
 *
 * The implementation converts the NVMe Zoned Command Set log-pages for Zone
 * descriptors into the Linux Kernel Zoned Block Report format.
 *
 * NOTE: This function is called before I/O engine initialization, that is,
 * before ``_dev_open`` has been called and file-wrapping is set up. Thus it
 * has to do the ``_dev_open`` itself, and shut it down again once it is done
 * retrieving the log-pages and converting them to the report format.
 *
 * TODO: unify the different setup methods, consider keeping the handle around,
 * and consider how to support the --async option in this usecase
 */
static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
                                   struct zbd_zone *zbdz, unsigned int nr_zones)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
        struct xnvme_dev *dev = NULL;
        const struct xnvme_geo *geo = NULL;
        struct xnvme_znd_report *rprt = NULL;
        uint32_t ssw;
        uint64_t slba;
        unsigned int limit = 0;
        int err = 0, err_lock;

        dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,
               nr_zones);

        err = pthread_mutex_lock(&g_serialize);
        if (err) {
                log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
                        err);
                return -err;
        }

        dev = xnvme_dev_open(f->file_name, &opts);
        if (!dev) {
                log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,
                        errno);
                goto exit;
        }

        geo = xnvme_dev_get_geo(dev);
        ssw = xnvme_dev_get_ssw(dev);
        lbafe = xnvme_znd_dev_get_lbafe(dev);

        limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;

        dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);

        slba = ((offset >> ssw) / geo->nsect) * geo->nsect;

        rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
        if (!rprt) {
                log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
                        f->file_name, errno);
                err = -errno;
                goto exit;
        }
        if (rprt->nentries != limit) {
                log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);
                err = 1;
                goto exit;
        }
        if (offset > geo->tbytes) {
                log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);
                goto exit;
        }

        /* Transform the zone-report */
        for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
                struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);

                zbdz[idx].start = descr->zslba << ssw;
                zbdz[idx].len = lbafe->zsze << ssw;
                zbdz[idx].capacity = descr->zcap << ssw;
                zbdz[idx].wp = descr->wp << ssw;

                switch (descr->zt) {
                case XNVME_SPEC_ZND_TYPE_SEQWR:
                        zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
                        break;

                default:
                        log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
                                f->file_name, zbdz[idx].start);
                        err = -EIO;
                        goto exit;
                }

                switch (descr->zs) {
                case XNVME_SPEC_ZND_STATE_EMPTY:
                        zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
                        break;
                case XNVME_SPEC_ZND_STATE_IOPEN:
                        zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
                        break;
                case XNVME_SPEC_ZND_STATE_EOPEN:
                        zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
                        break;
                case XNVME_SPEC_ZND_STATE_CLOSED:
                        zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
                        break;
                case XNVME_SPEC_ZND_STATE_FULL:
                        zbdz[idx].cond = ZBD_ZONE_COND_FULL;
                        break;

                case XNVME_SPEC_ZND_STATE_RONLY:
                case XNVME_SPEC_ZND_STATE_OFFLINE:
                default:
                        zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
                        break;
                }
        }

exit:
        xnvme_buf_virt_free(rprt);

        xnvme_dev_close(dev);

        err_lock = pthread_mutex_unlock(&g_serialize);
        if (err_lock)
                log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);

        dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);

        return err ? err : (int)limit;
}

/**
 * NOTE: This function may get called before I/O engine initialization, that is,
 * before ``_dev_open`` has been called and file-wrapping is set up. In such a
 * case it has to do ``_dev_open`` itself, and shut it down again once it is
 * done resetting the write pointer of the zones.
 */
static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
                               uint64_t length)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        struct xnvme_fioe_data *xd = NULL;
        struct xnvme_fioe_fwrap *fwrap = NULL;
        struct xnvme_dev *dev = NULL;
        const struct xnvme_geo *geo = NULL;
        uint64_t first, last;
        uint32_t ssw;
        uint32_t nsid;
        int err = 0, err_lock;

        if (td->io_ops_data) {
                xd = td->io_ops_data;
                fwrap = &xd->files[f->fileno];

                assert(fwrap->dev);
                assert(fwrap->geo);

                dev = fwrap->dev;
                geo = fwrap->geo;
                ssw = fwrap->ssw;
        } else {
                err = pthread_mutex_lock(&g_serialize);
                if (err) {
                        log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);
                        return -err;
                }

                dev = xnvme_dev_open(f->file_name, &opts);
                if (!dev) {
                        log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
                                f->file_name, errno);
                        goto exit;
                }
                geo = xnvme_dev_get_geo(dev);
                ssw = xnvme_dev_get_ssw(dev);
        }

        nsid = xnvme_dev_get_nsid(dev);

        first = ((offset >> ssw) / geo->nsect) * geo->nsect;
        last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
        dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last);

        for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
                struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);

                if (zslba >= (geo->nsect * geo->nzone)) {
                        log_err("ioeng->reset_wp(): out-of-bounds\n");
                        err = 0;
                        break;
                }

                err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
                                          XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
                if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
                        err = err ? err : -EIO;
                        log_err("ioeng->reset_wp(): err(%d), sc(%d)", err, ctx.cpl.status.sc);
                        goto exit;
                }
        }

exit:
        if (!td->io_ops_data) {
                xnvme_dev_close(dev);

                err_lock = pthread_mutex_unlock(&g_serialize);
                if (err_lock)
                        log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
        }

        return err;
}

static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
{
        struct xnvme_opts opts = xnvme_opts_from_fioe(td);
        struct xnvme_dev *dev;
        int ret = 0, err;

        if (fio_file_size_known(f))
                return 0;

        ret = pthread_mutex_lock(&g_serialize);
        if (ret) {
                log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
                return -ret;
        }

        dev = xnvme_dev_open(f->file_name, &opts);
        if (!dev) {
                log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, errno);
                ret = -errno;
                goto exit;
        }

        f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
        fio_file_set_size_known(f);
        f->filetype = FIO_TYPE_BLOCK;

exit:
        xnvme_dev_close(dev);
        err = pthread_mutex_unlock(&g_serialize);
        if (err)
                log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);

        return ret;
}

FIO_STATIC struct ioengine_ops ioengine = {
        .name = "xnvme",
        .version = FIO_IOOPS_VERSION,
        .options = options,
        .option_struct_size = sizeof(struct xnvme_fioe_options),
        .flags = FIO_DISKLESSIO | FIO_NODISKUTIL | FIO_NOEXTEND | FIO_MEMALIGN | FIO_RAWIO,

        .cleanup = xnvme_fioe_cleanup,
        .init = xnvme_fioe_init,

        .iomem_free = xnvme_fioe_iomem_free,
        .iomem_alloc = xnvme_fioe_iomem_alloc,

        .io_u_free = xnvme_fioe_io_u_free,
        .io_u_init = xnvme_fioe_io_u_init,

        .event = xnvme_fioe_event,
        .getevents = xnvme_fioe_getevents,
        .queue = xnvme_fioe_queue,

        .close_file = xnvme_fioe_close,
        .open_file = xnvme_fioe_open,
        .get_file_size = xnvme_fioe_get_file_size,

        .invalidate = xnvme_fioe_invalidate,
        .get_max_open_zones = xnvme_fioe_get_max_open_zones,
        .get_zoned_model = xnvme_fioe_get_zoned_model,
        .report_zones = xnvme_fioe_report_zones,
        .reset_wp = xnvme_fioe_reset_wp,
};

static void fio_init fio_xnvme_register(void)
{
        register_ioengine(&ioengine);
}

static void fio_exit fio_xnvme_unregister(void)
{
        unregister_ioengine(&ioengine);
}