engines/xnvme: add support for metadata
[fio.git] / engines / xnvme.c
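A minimal job illustrating the separate-metadata path added here (hypothetical device path and sizes; assumes a namespace formatted with out-of-band metadata, e.g. 8 bytes per 4 KiB block, and an xNVMe build with this support):

    fio --name=md-test --ioengine=xnvme --thread=1 --filename=/dev/ng0n1 \
        --xnvme_async=io_uring_cmd --md_per_io_size=64 --bs=4k --rw=randwrite --iodepth=4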
1/*
2 * fio xNVMe IO Engine
3 *
4 * IO engine using the xNVMe C API.
5 *
6 * See: http://xnvme.io/
7 *
8 * SPDX-License-Identifier: Apache-2.0
9 */
10#include <stdlib.h>
11#include <assert.h>
12#include <libxnvme.h>
13#include "fio.h"
14#include "zbd_types.h"
15#include "fdp.h"
16#include "optgroup.h"
17
18static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;
19
20struct xnvme_fioe_fwrap {
21 /* fio file representation */
22 struct fio_file *fio_file;
23
24 /* xNVMe device handle */
25 struct xnvme_dev *dev;
26 /* xNVMe device geometry */
27 const struct xnvme_geo *geo;
28
29 struct xnvme_queue *queue;
30
31 uint32_t ssw;
32 uint32_t lba_nbytes;
33 uint32_t md_nbytes;
34 uint32_t lba_pow2;
35
36 uint8_t _pad[16];
37};
38XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")
39
40struct xnvme_fioe_data {
41 /* I/O completion queue */
42 struct io_u **iocq;
43
44 /* # of iocq entries; incremented via getevents()/cb_pool() */
45 uint64_t completed;
46
47 /*
48 * # of errors; incremented when observed on completion via
49 * getevents()/cb_pool()
50 */
51 uint64_t ecount;
52
53 /* Controls which device/file to select */
54 int32_t prev;
55 int32_t cur;
56
57 /* Number of devices/files for which open() has been called */
58 int64_t nopen;
59 /* Number of devices/files allocated in files[] */
60 uint64_t nallocated;
61
62 struct iovec *iovec;
63 struct iovec *md_iovec;
64
65 struct xnvme_fioe_fwrap files[];
66};
67XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")
68
69struct xnvme_fioe_request {
70 /* Separate metadata buffer pointer */
71 void *md_buf;
72};
73
74struct xnvme_fioe_options {
75 void *padding;
76 unsigned int hipri;
77 unsigned int sqpoll_thread;
78 unsigned int xnvme_dev_nsid;
79 unsigned int xnvme_iovec;
80 unsigned int md_per_io_size;
81 char *xnvme_be;
82 char *xnvme_mem;
83 char *xnvme_async;
84 char *xnvme_sync;
85 char *xnvme_admin;
86 char *xnvme_dev_subnqn;
87};
88
89static struct fio_option options[] = {
90 {
91 .name = "hipri",
92 .lname = "High Priority",
93 .type = FIO_OPT_STR_SET,
94 .off1 = offsetof(struct xnvme_fioe_options, hipri),
95 .help = "Use polled IO completions",
96 .category = FIO_OPT_C_ENGINE,
97 .group = FIO_OPT_G_XNVME,
98 },
99 {
100 .name = "sqthread_poll",
101 .lname = "Kernel SQ thread polling",
102 .type = FIO_OPT_STR_SET,
103 .off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
104 .help = "Offload submission/completion to kernel thread",
105 .category = FIO_OPT_C_ENGINE,
106 .group = FIO_OPT_G_XNVME,
107 },
108 {
109 .name = "xnvme_be",
110 .lname = "xNVMe Backend",
111 .type = FIO_OPT_STR_STORE,
112 .off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
113 .help = "Select xNVMe backend [spdk,linux,fbsd]",
114 .category = FIO_OPT_C_ENGINE,
115 .group = FIO_OPT_G_XNVME,
116 },
117 {
118 .name = "xnvme_mem",
119 .lname = "xNVMe Memory Backend",
120 .type = FIO_OPT_STR_STORE,
121 .off1 = offsetof(struct xnvme_fioe_options, xnvme_mem),
122 .help = "Select xNVMe memory backend",
123 .category = FIO_OPT_C_ENGINE,
124 .group = FIO_OPT_G_XNVME,
125 },
126 {
127 .name = "xnvme_async",
128 .lname = "xNVMe Asynchronous command-interface",
129 .type = FIO_OPT_STR_STORE,
130 .off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
131 .help = "Select xNVMe async. interface: "
132 "[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]",
133 .category = FIO_OPT_C_ENGINE,
134 .group = FIO_OPT_G_XNVME,
135 },
136 {
137 .name = "xnvme_sync",
138 .lname = "xNVMe Synchronous command-interface",
139 .type = FIO_OPT_STR_STORE,
140 .off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
141 .help = "Select xNVMe sync. interface: [nvme,psync,block]",
142 .category = FIO_OPT_C_ENGINE,
143 .group = FIO_OPT_G_XNVME,
144 },
145 {
146 .name = "xnvme_admin",
147 .lname = "xNVMe Admin command-interface",
148 .type = FIO_OPT_STR_STORE,
149 .off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
150 .help = "Select xNVMe admin. cmd-interface: [nvme,block]",
151 .category = FIO_OPT_C_ENGINE,
152 .group = FIO_OPT_G_XNVME,
153 },
154 {
155 .name = "xnvme_dev_nsid",
156 .lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
157 .type = FIO_OPT_INT,
158 .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
159 .help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
160 .category = FIO_OPT_C_ENGINE,
161 .group = FIO_OPT_G_XNVME,
162 },
163 {
164 .name = "xnvme_dev_subnqn",
165 .lname = "Subsystem nqn for Fabrics",
166 .type = FIO_OPT_STR_STORE,
167 .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_subnqn),
168 .help = "Subsystem NQN for Fabrics",
169 .category = FIO_OPT_C_ENGINE,
170 .group = FIO_OPT_G_XNVME,
171 },
172 {
173 .name = "xnvme_iovec",
174 .lname = "Vectored IOs",
175 .type = FIO_OPT_STR_SET,
176 .off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
177 .help = "Send vectored IOs",
178 .category = FIO_OPT_C_ENGINE,
179 .group = FIO_OPT_G_XNVME,
180 },
181 {
182 .name = "md_per_io_size",
183 .lname = "Separate Metadata Buffer Size per I/O",
184 .type = FIO_OPT_INT,
185 .off1 = offsetof(struct xnvme_fioe_options, md_per_io_size),
186 .def = "0",
187 .help = "Size of separate metadata buffer per I/O (Default: 0)",
188 .category = FIO_OPT_C_ENGINE,
189 .group = FIO_OPT_G_XNVME,
190 },
191
192 {
193 .name = NULL,
194 },
195};
196
197static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
198{
199 struct io_u *io_u = cb_arg;
200 struct xnvme_fioe_data *xd = io_u->mmap_data;
201
202 if (xnvme_cmd_ctx_cpl_status(ctx)) {
203 xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
204 xd->ecount += 1;
205 io_u->error = EIO;
206 }
207
208 xd->iocq[xd->completed++] = io_u;
209 xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
210}
211
212static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
213{
214 struct xnvme_fioe_options *o = td->eo;
215 struct xnvme_opts opts = xnvme_opts_default();
216
217 opts.nsid = o->xnvme_dev_nsid;
218 opts.subnqn = o->xnvme_dev_subnqn;
219 opts.be = o->xnvme_be;
220 opts.mem = o->xnvme_mem;
221 opts.async = o->xnvme_async;
222 opts.sync = o->xnvme_sync;
223 opts.admin = o->xnvme_admin;
224
225 opts.poll_io = o->hipri;
226 opts.poll_sq = o->sqpoll_thread;
227
228 opts.direct = td->o.odirect;
229
230 return opts;
231}
232
233static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
234{
235 if (fwrap->dev)
236 xnvme_queue_term(fwrap->queue);
237
238 xnvme_dev_close(fwrap->dev);
239
240 memset(fwrap, 0, sizeof(*fwrap));
241}
242
243static void xnvme_fioe_cleanup(struct thread_data *td)
244{
245 struct xnvme_fioe_data *xd = NULL;
246 int err;
247
248 if (!td->io_ops_data)
249 return;
250
251 xd = td->io_ops_data;
252
253 err = pthread_mutex_lock(&g_serialize);
254 if (err)
255 log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", err);
256 /* NOTE: not returning here */
257
258 for (uint64_t i = 0; i < xd->nallocated; ++i)
259 _dev_close(td, &xd->files[i]);
260
261 if (!err) {
262 err = pthread_mutex_unlock(&g_serialize);
263 if (err)
264 log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", err);
265 }
266
267 free(xd->iocq);
268 free(xd->iovec);
269 free(xd->md_iovec);
270 free(xd);
271 td->io_ops_data = NULL;
272}
273
274/**
275 * Helper function setting up device handles as addressed by the naming
276 * convention of the given `fio_file` filename.
277 *
278 * Checks thread-options for explicit control of asynchronous implementation via
279 * the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}``.
280 */
281static int _dev_open(struct thread_data *td, struct fio_file *f)
282{
283 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
284 struct xnvme_fioe_data *xd = td->io_ops_data;
285 struct xnvme_fioe_fwrap *fwrap;
286 int flags = 0;
287 int err;
288
289 if (f->fileno > (int)xd->nallocated) {
290 log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);
291 return 1;
292 }
293
294 fwrap = &xd->files[f->fileno];
295
296 err = pthread_mutex_lock(&g_serialize);
297 if (err) {
298 log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
299 err);
300 return -err;
301 }
302
303 fwrap->dev = xnvme_dev_open(f->file_name, &opts);
304 if (!fwrap->dev) {
305 log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);
306 goto failure;
307 }
308 fwrap->geo = xnvme_dev_get_geo(fwrap->dev);
309
310 if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) {
311 log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name);
312 goto failure;
313 }
314 xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);
315
316 fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
317 fwrap->lba_nbytes = fwrap->geo->lba_nbytes;
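 /* Out-of-band (metadata) bytes per LBA for this namespace format; 0 when the format has no separate metadata */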
318 fwrap->md_nbytes = fwrap->geo->nbytes_oob;
319
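 /*
  * Extended-LBA formats carry the metadata inline with each data block, so
  * byte offsets/lengths cannot be converted to LBAs with shifts; lba_pow2 == 0
  * selects the division path in queue() instead.
  */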
320 if (fwrap->geo->lba_extended)
321 fwrap->lba_pow2 = 0;
322 else
323 fwrap->lba_pow2 = 1;
324
325 fwrap->fio_file = f;
326 fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
327 fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
328 fio_file_set_size_known(fwrap->fio_file);
329
330 err = pthread_mutex_unlock(&g_serialize);
331 if (err)
332 log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
333 err);
334
335 return 0;
336
337failure:
338 xnvme_queue_term(fwrap->queue);
339 xnvme_dev_close(fwrap->dev);
340
341 err = pthread_mutex_unlock(&g_serialize);
342 if (err)
343 log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
344 err);
345
346 return 1;
347}
348
349static int xnvme_fioe_init(struct thread_data *td)
350{
351 struct xnvme_fioe_data *xd = NULL;
352 struct xnvme_fioe_options *o = td->eo;
353 struct fio_file *f;
354 unsigned int i;
355
356 if (!td->o.use_thread) {
357 log_err("ioeng->init(): --thread=1 is required\n");
358 return 1;
359 }
360
361 /* Allocate xd and iocq */
362 xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
363 if (!xd) {
364 log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
365 return 1;
366 }
367
368 xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
369 if (!xd->iocq) {
370 free(xd);
371 log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno);
372 return 1;
373 }
374
375 if (o->xnvme_iovec) {
376 xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
377 if (!xd->iovec) {
378 free(xd->iocq);
379 free(xd);
380 log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);
381 return 1;
382 }
383 }
384
385 if (o->xnvme_iovec && o->md_per_io_size) {
386 xd->md_iovec = calloc(td->o.iodepth, sizeof(*xd->md_iovec));
387 if (!xd->md_iovec) {
388 free(xd->iocq);
389 free(xd->iovec);
390 free(xd);
391 log_err("ioeng->init(): !calloc(xd->md_iovec), err(%d)\n", errno);
392 return 1;
393 }
394 }
395
396 xd->prev = -1;
397 td->io_ops_data = xd;
398
399 for_each_file(td, f, i)
400 {
401 if (_dev_open(td, f)) {
402 /*
403 * Note: We are not freeing xd, iocq, iovec and md_iovec.
404 * This will be done as part of cleanup routine.
405 */
406 log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);
407 return 1;
408 }
409
410 ++(xd->nallocated);
411 }
412
413 if (xd->nallocated != td->o.nr_files) {
414 log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
415 return 1;
416 }
417
418 return 0;
419}
420
421/* NOTE: using the first device for buffer-allocators */
422static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
423{
424 struct xnvme_fioe_data *xd = td->io_ops_data;
425 struct xnvme_fioe_fwrap *fwrap = &xd->files[0];
426
427 if (!fwrap->dev) {
428 log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");
429 return 1;
430 }
431
432 td->orig_buffer = xnvme_buf_alloc(fwrap->dev, total_mem);
433
434 return td->orig_buffer == NULL;
435}
436
437/* NOTE: using the first device for buffer-allocators */
438static void xnvme_fioe_iomem_free(struct thread_data *td)
439{
440 struct xnvme_fioe_data *xd = NULL;
441 struct xnvme_fioe_fwrap *fwrap = NULL;
442
443 if (!td->io_ops_data)
444 return;
445
446 xd = td->io_ops_data;
447 fwrap = &xd->files[0];
448
449 if (!fwrap->dev) {
450 log_err("ioeng->iomem_free(): failed; no dev-handle\n");
451 return;
452 }
453
454 xnvme_buf_free(fwrap->dev, td->orig_buffer);
455}
456
457static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
458{
459 struct xnvme_fioe_request *fio_req;
460 struct xnvme_fioe_options *o = td->eo;
461 struct xnvme_fioe_data *xd = td->io_ops_data;
462 struct xnvme_fioe_fwrap *fwrap = &xd->files[0];
463
464 if (!fwrap->dev) {
465 log_err("ioeng->io_u_init(): failed; no dev-handle\n");
466 return 1;
467 }
468
469 io_u->mmap_data = td->io_ops_data;
470 io_u->engine_data = NULL;
471
472 fio_req = calloc(1, sizeof(*fio_req));
473 if (!fio_req) {
474 log_err("ioeng->io_u_init(): !calloc(fio_req), err(%d)\n", errno);
475 return 1;
476 }
477
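 /*
  * When md_per_io_size is set, give each io_u its own separate metadata
  * buffer. It is allocated with xnvme_buf_alloc() so backends that need
  * registered / DMA-able memory (e.g. SPDK) can use it directly.
  */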
478 if (o->md_per_io_size) {
479 fio_req->md_buf = xnvme_buf_alloc(fwrap->dev, o->md_per_io_size);
480 if (!fio_req->md_buf) {
481 free(fio_req);
482 return 1;
483 }
484 }
485
486 io_u->engine_data = fio_req;
487
488 return 0;
489}
490
491static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
492{
493 struct xnvme_fioe_data *xd = NULL;
494 struct xnvme_fioe_fwrap *fwrap = NULL;
495 struct xnvme_fioe_request *fio_req = NULL;
496
497 if (!td->io_ops_data)
498 return;
499
500 xd = td->io_ops_data;
501 fwrap = &xd->files[0];
502
503 if (!fwrap->dev) {
504 log_err("ioeng->io_u_free(): failed; no dev-handle\n");
505 return;
506 }
507
508 fio_req = io_u->engine_data;
509 if (fio_req->md_buf)
510 xnvme_buf_free(fwrap->dev, fio_req->md_buf);
511
512 free(fio_req);
513
514 io_u->mmap_data = NULL;
515}
516
517static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
518{
519 struct xnvme_fioe_data *xd = td->io_ops_data;
520
521 assert(event >= 0);
522 assert((unsigned)event < xd->completed);
523
524 return xd->iocq[event];
525}
526
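/*
 * Reap completions by polling every open device's queue, resuming from the
 * device after the one that last produced completions (xd->prev) and
 * round-robining through files[] until at least 'min' events are gathered.
 */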
527static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
528 const struct timespec *t)
529{
530 struct xnvme_fioe_data *xd = td->io_ops_data;
531 struct xnvme_fioe_fwrap *fwrap = NULL;
532 int nfiles = xd->nallocated;
533 int err = 0;
534
535 if (xd->prev != -1 && ++xd->prev < nfiles) {
536 fwrap = &xd->files[xd->prev];
537 xd->cur = xd->prev;
538 }
539
540 xd->completed = 0;
541 for (;;) {
542 if (fwrap == NULL || xd->cur == nfiles) {
543 fwrap = &xd->files[0];
544 xd->cur = 0;
545 }
546
547 while (fwrap != NULL && xd->cur < nfiles && err >= 0) {
548 err = xnvme_queue_poke(fwrap->queue, max - xd->completed);
549 if (err < 0) {
550 switch (err) {
551 case -EBUSY:
552 case -EAGAIN:
553 usleep(1);
554 break;
555
556 default:
557 log_err("ioeng->getevents(): unhandled IO error\n");
558 assert(false);
559 return 0;
560 }
561 }
562 if (xd->completed >= min) {
563 xd->prev = xd->cur;
564 return xd->completed;
565 }
566 xd->cur++;
567 fwrap = &xd->files[xd->cur];
568
569 if (err < 0) {
570 switch (err) {
571 case -EBUSY:
572 case -EAGAIN:
573 usleep(1);
574 break;
575 }
576 }
577 }
578 }
579
580 xd->cur = 0;
581
582 return xd->completed;
583}
584
585static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
586{
587 struct xnvme_fioe_data *xd = td->io_ops_data;
588 struct xnvme_fioe_fwrap *fwrap;
589 struct xnvme_cmd_ctx *ctx;
590 struct xnvme_fioe_request *fio_req = io_u->engine_data;
591 uint32_t nsid;
592 uint64_t slba;
593 uint16_t nlb;
594 int err;
595 bool vectored_io = ((struct xnvme_fioe_options *)td->eo)->xnvme_iovec;
596 uint32_t dir = io_u->dtype;
597
598 fio_ro_check(td, io_u);
599
600 fwrap = &xd->files[io_u->file->fileno];
601 nsid = xnvme_dev_get_nsid(fwrap->dev);
602
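 /*
  * Convert the byte offset/length into LBA units; nlb is zero-based per the
  * NVMe spec. E.g. with 4096-byte LBAs (ssw = 12), offset 1 MiB gives
  * slba = 256 and an 8 KiB transfer gives nlb = 1.
  */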
603 if (fwrap->lba_pow2) {
604 slba = io_u->offset >> fwrap->ssw;
605 nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;
606 } else {
607 slba = io_u->offset / fwrap->lba_nbytes;
608 nlb = (io_u->xfer_buflen / fwrap->lba_nbytes) - 1;
609 }
610
611 ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
612 ctx->async.cb_arg = io_u;
613
614 ctx->cmd.common.nsid = nsid;
615 ctx->cmd.nvm.slba = slba;
616 ctx->cmd.nvm.nlb = nlb;
617 if (dir) {
618 ctx->cmd.nvm.dtype = io_u->dtype;
619 ctx->cmd.nvm.cdw13.dspec = io_u->dspec;
620 }
621
622 switch (io_u->ddir) {
623 case DDIR_READ:
624 ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
625 break;
626
627 case DDIR_WRITE:
628 ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
629 break;
630
631 default:
632 log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
633 xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
634
635 io_u->error = ENOSYS;
636 assert(false);
637 return FIO_Q_COMPLETED;
638 }
639
640 if (vectored_io) {
641 xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
642 xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;
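 /*
  * Namespaces with separate metadata (md_nbytes != 0) and power-of-two LBAs
  * get the per-io_u metadata buffer attached alongside the data;
  * md_nbytes * (nlb + 1) bytes are transferred, so md_per_io_size should be
  * at least that large. Extended-LBA formats carry metadata inline and take
  * the data-only path.
  */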
643 if (fwrap->md_nbytes && fwrap->lba_pow2) {
644 xd->md_iovec[io_u->index].iov_base = fio_req->md_buf;
645 xd->md_iovec[io_u->index].iov_len = fwrap->md_nbytes * (nlb + 1);
646 err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
647 &xd->md_iovec[io_u->index], 1,
648 fwrap->md_nbytes * (nlb + 1));
649 } else {
650 err = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
651 NULL, 0, 0);
652 }
653 } else {
654 if (fwrap->md_nbytes && fwrap->lba_pow2)
655 err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen,
656 fio_req->md_buf, fwrap->md_nbytes * (nlb + 1));
657 else
658 err = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);
659 }
660 switch (err) {
661 case 0:
662 return FIO_Q_QUEUED;
663
664 case -EBUSY:
665 case -EAGAIN:
666 xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
667 return FIO_Q_BUSY;
668
669 default:
670 log_err("ioeng->queue(): err: '%d'\n", err);
671
672 xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
673
674 io_u->error = abs(err);
675 assert(false);
676 return FIO_Q_COMPLETED;
677 }
678}
679
680static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
681{
682 struct xnvme_fioe_data *xd = td->io_ops_data;
683
684 dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, xd->nopen);
685
686 --(xd->nopen);
687
688 return 0;
689}
690
691static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
692{
693 struct xnvme_fioe_data *xd = td->io_ops_data;
694
695 dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen);
696
697 if (f->fileno > (int)xd->nallocated) {
698 log_err("ioeng->open(): f->fileno > xd->nallocated; invalid assumption\n");
699 return 1;
700 }
701 if (xd->files[f->fileno].fio_file != f) {
702 log_err("ioeng->open(): fio_file != f; invalid assumption\n");
703 return 1;
704 }
705
706 ++(xd->nopen);
707
708 return 0;
709}
710
711static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
712{
713 /* Consider only doing this with be:spdk */
714 return 0;
715}
716
717static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
718 unsigned int *max_open_zones)
719{
720 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
721 struct xnvme_dev *dev;
722 const struct xnvme_spec_znd_idfy_ns *zns;
723 int err = 0, err_lock;
724
725 if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
726 f->filetype != FIO_TYPE_CHAR) {
727 log_info("ioeng->get_max_open_zones(): ignoring filetype: %d\n", f->filetype);
728 return 0;
729 }
730 err_lock = pthread_mutex_lock(&g_serialize);
731 if (err_lock) {
732 log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);
733 return -err_lock;
734 }
735
736 dev = xnvme_dev_open(f->file_name, &opts);
737 if (!dev) {
738 log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", errno);
739 err = -errno;
740 goto exit;
741 }
742 if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {
743 errno = EINVAL;
744 err = -errno;
745 goto exit;
746 }
747
748 zns = (void *)xnvme_dev_get_ns_css(dev);
749 if (!zns) {
750 log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", errno);
751 err = -errno;
752 goto exit;
753 }
754
755 /*
756 * Intentional overflow: the value is zero-based and NVMe defines
757 * 0xFFFFFFFF as unlimited, so adding one overflows to 0, which is how
758 * fio indicates unlimited; otherwise this simply converts the value
759 * to one-based (e.g. mor == 0 reports 1 max open zone).
760 */
761 *max_open_zones = zns->mor + 1;
762
763exit:
764 xnvme_dev_close(dev);
765 err_lock = pthread_mutex_unlock(&g_serialize);
766 if (err_lock)
767 log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
768 err_lock);
769
770 return err;
771}
772
773/**
774 * Currently, this function is called before I/O engine initialization, so
775 * we cannot consult the file-wrapping done when 'fioe' initializes.
776 * Instead we just open based on the given filename.
777 *
778 * TODO: unify the different setup methods, consider keeping the handle around,
779 * and consider how to support the --be option in this usecase
780 */
781static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
782 enum zbd_zoned_model *model)
783{
784 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
785 struct xnvme_dev *dev;
786 int err = 0, err_lock;
787
788 if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
789 f->filetype != FIO_TYPE_CHAR) {
790 log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);
791 return -EINVAL;
792 }
793
794 err = pthread_mutex_lock(&g_serialize);
795 if (err) {
796 log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);
797 return -err;
798 }
799
800 dev = xnvme_dev_open(f->file_name, &opts);
801 if (!dev) {
802 log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
803 f->file_name, errno);
804 err = -errno;
805 goto exit;
806 }
807
808 switch (xnvme_dev_get_geo(dev)->type) {
809 case XNVME_GEO_UNKNOWN:
810 dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);
811 *model = ZBD_NONE;
812 break;
813
814 case XNVME_GEO_CONVENTIONAL:
815 dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);
816 *model = ZBD_NONE;
817 break;
818
819 case XNVME_GEO_ZONED:
820 dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
821 *model = ZBD_HOST_MANAGED;
822 break;
823
824 default:
825 dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);
826 *model = ZBD_NONE;
827 errno = EINVAL;
828 err = -errno;
829 break;
830 }
831
832exit:
833 xnvme_dev_close(dev);
834
835 err_lock = pthread_mutex_unlock(&g_serialize);
836 if (err_lock)
837 log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);
838
839 return err;
840}
841
842/**
843 * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
844 *
845 * The implementation converts the NVMe Zoned Command Set log-pages for Zone
846 * descriptors into the Linux Kernel Zoned Block Report format.
847 *
848 * NOTE: This function is called before I/O engine initialization, that is,
849 * before ``_dev_open`` has been called and file-wrapping is set up. Thus it has
850 * to do the ``_dev_open`` itself, and shut it down again once it is done
851 * retrieving the log-pages and converting them to the report format.
852 *
853 * TODO: unify the different setup methods, consider keeping the handle around,
854 * and consider how to support the --async option in this usecase
855 */
856static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
857 struct zbd_zone *zbdz, unsigned int nr_zones)
858{
859 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
860 const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
861 struct xnvme_dev *dev = NULL;
862 const struct xnvme_geo *geo = NULL;
863 struct xnvme_znd_report *rprt = NULL;
864 uint32_t ssw;
865 uint64_t slba;
866 unsigned int limit = 0;
867 int err = 0, err_lock;
868
869 dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,
870 nr_zones);
871
872 err = pthread_mutex_lock(&g_serialize);
873 if (err) {
874 log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
875 err);
876 return -err;
877 }
878
879 dev = xnvme_dev_open(f->file_name, &opts);
880 if (!dev) {
881 log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,
882 errno);
883 goto exit;
884 }
885
886 geo = xnvme_dev_get_geo(dev);
887 ssw = xnvme_dev_get_ssw(dev);
888 lbafe = xnvme_znd_dev_get_lbafe(dev);
889
890 limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;
891
892 dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);
893
894 slba = ((offset >> ssw) / geo->nsect) * geo->nsect;
895
896 rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
897 if (!rprt) {
898 log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
899 f->file_name, errno);
900 err = -errno;
901 goto exit;
902 }
903 if (rprt->nentries != limit) {
904 log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);
905 err = 1;
906 goto exit;
907 }
908 if (offset > geo->tbytes) {
909 log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);
910 goto exit;
911 }
912
913 /* Transform the zone-report */
914 for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
915 struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);
916
917 zbdz[idx].start = descr->zslba << ssw;
918 zbdz[idx].len = lbafe->zsze << ssw;
919 zbdz[idx].capacity = descr->zcap << ssw;
920 zbdz[idx].wp = descr->wp << ssw;
921
922 switch (descr->zt) {
923 case XNVME_SPEC_ZND_TYPE_SEQWR:
924 zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
925 break;
926
927 default:
928 log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
929 f->file_name, zbdz[idx].start);
930 err = -EIO;
931 goto exit;
932 }
933
934 switch (descr->zs) {
935 case XNVME_SPEC_ZND_STATE_EMPTY:
936 zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
937 break;
938 case XNVME_SPEC_ZND_STATE_IOPEN:
939 zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
940 break;
941 case XNVME_SPEC_ZND_STATE_EOPEN:
942 zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
943 break;
944 case XNVME_SPEC_ZND_STATE_CLOSED:
945 zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
946 break;
947 case XNVME_SPEC_ZND_STATE_FULL:
948 zbdz[idx].cond = ZBD_ZONE_COND_FULL;
949 break;
950
951 case XNVME_SPEC_ZND_STATE_RONLY:
952 case XNVME_SPEC_ZND_STATE_OFFLINE:
953 default:
954 zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
955 break;
956 }
957 }
958
959exit:
960 xnvme_buf_virt_free(rprt);
961
962 xnvme_dev_close(dev);
963
964 err_lock = pthread_mutex_unlock(&g_serialize);
965 if (err_lock)
966 log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);
967
968 dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);
969
970 return err ? err : (int)limit;
971}
972
973/**
974 * NOTE: This function may get called before I/O engine initialization, that is,
975 * before ``_dev_open`` has been called and file-wrapping is set up. In that
976 * case it has to do ``_dev_open`` itself, and shut it down again once it is
977 * done resetting the write pointer of zones.
978 */
979static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
980 uint64_t length)
981{
982 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
983 struct xnvme_fioe_data *xd = NULL;
984 struct xnvme_fioe_fwrap *fwrap = NULL;
985 struct xnvme_dev *dev = NULL;
986 const struct xnvme_geo *geo = NULL;
987 uint64_t first, last;
988 uint32_t ssw;
989 uint32_t nsid;
990 int err = 0, err_lock;
991
992 if (td->io_ops_data) {
993 xd = td->io_ops_data;
994 fwrap = &xd->files[f->fileno];
995
996 assert(fwrap->dev);
997 assert(fwrap->geo);
998
999 dev = fwrap->dev;
1000 geo = fwrap->geo;
1001 ssw = fwrap->ssw;
1002 } else {
1003 err = pthread_mutex_lock(&g_serialize);
1004 if (err) {
1005 log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);
1006 return -err;
1007 }
1008
1009 dev = xnvme_dev_open(f->file_name, &opts);
1010 if (!dev) {
1011 log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
1012 f->file_name, errno);
1013 goto exit;
1014 }
1015 geo = xnvme_dev_get_geo(dev);
1016 ssw = xnvme_dev_get_ssw(dev);
1017 }
1018
1019 nsid = xnvme_dev_get_nsid(dev);
1020
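 /* Align the byte range to zone boundaries; a zone spans geo->nsect LBAs */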
1021 first = ((offset >> ssw) / geo->nsect) * geo->nsect;
1022 last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
1023 dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last);
1024
1025 for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
1026 struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);
1027
1028 if (zslba >= (geo->nsect * geo->nzone)) {
1029 log_err("ioeng->reset_wp(): out-of-bounds\n");
1030 err = 0;
1031 break;
1032 }
1033
1034 err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
1035 XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
1036 if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
1037 err = err ? err : -EIO;
1038 log_err("ioeng->reset_wp(): err(%d), sc(%d)", err, ctx.cpl.status.sc);
1039 goto exit;
1040 }
1041 }
1042
1043exit:
1044 if (!td->io_ops_data) {
1045 xnvme_dev_close(dev);
1046
1047 err_lock = pthread_mutex_unlock(&g_serialize);
1048 if (err_lock)
1049 log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
1050 }
1051
1052 return err;
1053}
1054
1055static int xnvme_fioe_fetch_ruhs(struct thread_data *td, struct fio_file *f,
1056 struct fio_ruhs_info *fruhs_info)
1057{
1058 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
1059 struct xnvme_dev *dev;
1060 struct xnvme_spec_ruhs *ruhs;
1061 struct xnvme_cmd_ctx ctx;
1062 uint32_t ruhs_nbytes;
1063 uint32_t nsid;
1064 int err = 0, err_lock;
1065
1066 if (f->filetype != FIO_TYPE_CHAR && f->filetype != FIO_TYPE_FILE) {
1067 log_err("ioeng->fdp_ruhs(): ignoring filetype: %d\n", f->filetype);
1068 return -EINVAL;
1069 }
1070
1071 err = pthread_mutex_lock(&g_serialize);
1072 if (err) {
1073 log_err("ioeng->fdp_ruhs(): pthread_mutex_lock(), err(%d)\n", err);
1074 return -err;
1075 }
1076
1077 dev = xnvme_dev_open(f->file_name, &opts);
1078 if (!dev) {
1079 log_err("ioeng->fdp_ruhs(): xnvme_dev_open(%s) failed, errno: %d\n",
1080 f->file_name, errno);
1081 err = -errno;
1082 goto exit;
1083 }
1084
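 /*
  * Allocate a device buffer sized for up to FDP_MAX_RUHS reclaim unit handle
  * descriptors and fetch the Reclaim Unit Handle Status via an NVMe I/O
  * Management Receive command.
  */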
1085 ruhs_nbytes = sizeof(*ruhs) + (FDP_MAX_RUHS * sizeof(struct xnvme_spec_ruhs_desc));
1086 ruhs = xnvme_buf_alloc(dev, ruhs_nbytes);
1087 if (!ruhs) {
1088 err = -errno;
1089 goto exit;
1090 }
1091 memset(ruhs, 0, ruhs_nbytes);
1092
1093 ctx = xnvme_cmd_ctx_from_dev(dev);
1094 nsid = xnvme_dev_get_nsid(dev);
1095
1096 err = xnvme_nvm_mgmt_recv(&ctx, nsid, XNVME_SPEC_IO_MGMT_RECV_RUHS, 0, ruhs, ruhs_nbytes);
1097
1098 if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
1099 err = err ? err : -EIO;
1100 log_err("ioeng->fdp_ruhs(): err(%d), sc(%d)", err, ctx.cpl.status.sc);
1101 goto free_buffer;
1102 }
1103
1104 fruhs_info->nr_ruhs = ruhs->nruhsd;
1105 for (uint32_t idx = 0; idx < fruhs_info->nr_ruhs; ++idx) {
1106 fruhs_info->plis[idx] = le16_to_cpu(ruhs->desc[idx].pi);
1107 }
1108
1109free_buffer:
1110 xnvme_buf_free(dev, ruhs);
1111exit:
1112 xnvme_dev_close(dev);
1113
1114 err_lock = pthread_mutex_unlock(&g_serialize);
1115 if (err_lock)
1116 log_err("ioeng->fdp_ruhs(): pthread_mutex_unlock(), err(%d)\n", err_lock);
1117
1118 return err;
1119}
1120
1121static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
1122{
1123 struct xnvme_opts opts = xnvme_opts_from_fioe(td);
1124 struct xnvme_dev *dev;
1125 int ret = 0, err;
1126
1127 if (fio_file_size_known(f))
1128 return 0;
1129
1130 ret = pthread_mutex_lock(&g_serialize);
1131 if (ret) {
1132 log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
1133 return -ret;
1134 }
1135
1136 dev = xnvme_dev_open(f->file_name, &opts);
1137 if (!dev) {
1138 log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, errno);
1139 ret = -errno;
1140 goto exit;
1141 }
1142
1143 f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
1144 fio_file_set_size_known(f);
1145
1146 if (td->o.zone_mode == ZONE_MODE_ZBD)
1147 f->filetype = FIO_TYPE_BLOCK;
1148
1149exit:
1150 xnvme_dev_close(dev);
1151 err = pthread_mutex_unlock(&g_serialize);
1152 if (err)
1153 log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);
1154
1155 return ret;
1156}
1157
1158FIO_STATIC struct ioengine_ops ioengine = {
1159 .name = "xnvme",
1160 .version = FIO_IOOPS_VERSION,
1161 .options = options,
1162 .option_struct_size = sizeof(struct xnvme_fioe_options),
1163 .flags = FIO_DISKLESSIO | FIO_NODISKUTIL | FIO_NOEXTEND | FIO_MEMALIGN | FIO_RAWIO,
1164
1165 .cleanup = xnvme_fioe_cleanup,
1166 .init = xnvme_fioe_init,
1167
1168 .iomem_free = xnvme_fioe_iomem_free,
1169 .iomem_alloc = xnvme_fioe_iomem_alloc,
1170
1171 .io_u_free = xnvme_fioe_io_u_free,
1172 .io_u_init = xnvme_fioe_io_u_init,
1173
1174 .event = xnvme_fioe_event,
1175 .getevents = xnvme_fioe_getevents,
1176 .queue = xnvme_fioe_queue,
1177
1178 .close_file = xnvme_fioe_close,
1179 .open_file = xnvme_fioe_open,
1180 .get_file_size = xnvme_fioe_get_file_size,
1181
1182 .invalidate = xnvme_fioe_invalidate,
1183 .get_max_open_zones = xnvme_fioe_get_max_open_zones,
1184 .get_zoned_model = xnvme_fioe_get_zoned_model,
1185 .report_zones = xnvme_fioe_report_zones,
1186 .reset_wp = xnvme_fioe_reset_wp,
1187
1188 .fdp_fetch_ruhs = xnvme_fioe_fetch_ruhs,
1189};
1190
1191static void fio_init fio_xnvme_register(void)
1192{
1193 register_ioengine(&ioengine);
1194}
1195
1196static void fio_exit fio_xnvme_unregister(void)
1197{
1198 unregister_ioengine(&ioengine);
1199}