fio: add libpmem engine
authorTeruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Fri, 17 Nov 2017 02:54:26 +0000 (11:54 +0900)
committerTeruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Fri, 17 Nov 2017 04:23:04 +0000 (13:23 +0900)
Add an ioengine that accesses persistent memory as memory, via libpmem,
through a memory-mapped file on a DAX filesystem.

It's very similar to the mmap engine and the dev-dax engine.
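
For reference, a minimal job file that exercises the new engine could
look like this (the /mnt/pmem0 mount point is only an example; see
examples/libpmem.fio added by this patch for a fuller job):

  [global]
  ioengine=libpmem
  directory=/mnt/pmem0
  bs=4k
  size=1g
  direct=1

  [seq-write]
  rw=write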

Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Signed-off-by: Takashi Menjo <menjo.takashi@lab.ntt.co.jp>
HOWTO
Makefile
configure
engines/libpmem.c [new file with mode: 0644]
examples/libpmem.fio [new file with mode: 0644]
fio.1
options.c

diff --git a/HOWTO b/HOWTO
index 419fa7373daef7a01f9807c03a70c27c0fbb1a5d..dce96bcd8103f075d56b21173659f3061f6600b1 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1820,6 +1820,11 @@ I/O engine
                        set  `filesize` so that all the accounting still occurs, but no
                        actual I/O will be done other than creating the file.
 
+               **libpmem**
+                       Read and write using mmap I/O to a file on a filesystem
+                       mounted with DAX on a persistent memory device through the NVML
+                       libpmem library.
+
 I/O engine specific parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/Makefile b/Makefile
index 2893348a71e63c35f7cfc2a0e1766f163424b4a4..3ce606460114f0f85817d4efeb5577538f554d27 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -135,6 +135,9 @@ endif
 ifdef CONFIG_LINUX_DEVDAX
   SOURCE += engines/dev-dax.c
 endif
+ifdef CONFIG_LIBPMEM
+  SOURCE += engines/libpmem.c
+endif
 
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
diff --git a/configure b/configure
index d34c000656bf7f6970d6a2eb917b8715761bbc69..31ba82299993f9e10523c326d9156bee03206349 100755 (executable)
--- a/configure
+++ b/configure
@@ -142,6 +142,7 @@ gfio_check="no"
 libhdfs="no"
 pmemblk="no"
 devdax="no"
+pmem="no"
 disable_lex=""
 disable_pmem="no"
 prefix=/usr/local
@@ -1845,6 +1846,7 @@ print_config "libpmemblk" "$libpmemblk"
 
 # Choose the ioengines
 if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
+  pmem="yes"
   devdax="yes"
   if test "$libpmemblk" = "yes"; then
     pmemblk="yes"
@@ -1859,6 +1861,10 @@ print_config "NVML pmemblk engine" "$pmemblk"
 # Report whether dev-dax engine is enabled
 print_config "NVML dev-dax engine" "$devdax"
 
+##########################################
+# Report whether libpmem engine is enabled
+print_config "NVML libpmem engine" "$pmem"
+
 ##########################################
 # Check if we have lex/yacc available
 yacc="no"
@@ -2300,6 +2306,9 @@ fi
 if test "$devdax" = "yes" ; then
   output_sym "CONFIG_LINUX_DEVDAX"
 fi
+if test "$pmem" = "yes" ; then
+  output_sym "CONFIG_LIBPMEM"
+fi
 if test "$arith" = "yes" ; then
   output_sym "CONFIG_ARITHMETIC"
   if test "$yacc_is_bison" = "yes" ; then
diff --git a/engines/libpmem.c b/engines/libpmem.c
new file mode 100644 (file)
index 0000000..3ba3bfe
--- /dev/null
@@ -0,0 +1,595 @@
+/*
+ * libpmem: IO engine that uses NVML libpmem to read and write data
+ *
+ * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation..
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * libpmem engine
+ *
+ * IO engine that uses libpmem to read and write data
+ *
+ * To use:
+ *   ioengine=libpmem
+ *
+ * Other relevant settings:
+ *   iodepth=1
+ *   direct=1
+ *   directory=/mnt/pmem0/
+ *   bs=4k
+ *
+ *   direct=1 means that pmem_drain() is executed for each write operation.
+ *   In contrast, direct=0 means that pmem_drain() is not executed.
+ *
+ *   The pmem device must have a DAX-capable filesystem and be mounted
+ *   with DAX enabled. 'directory' must point to the mount point of the
+ *   DAX filesystem.
+ *
+ *   Example:
+ *     mkfs.xfs /dev/pmem0
+ *     mkdir /mnt/pmem0
+ *     mount -o dax /dev/pmem0 /mnt/pmem0
+ *
+ *
+ * See examples/libpmem.fio for more.
+ *
+ *
+ * libpmem.so
+ *   By default, the libpmem engine will let the system find the libpmem.so
+ *   that it uses. You can use an alternative libpmem by setting the
+ *   FIO_PMEM_LIB environment variable to the full path to the desired
+ *   libpmem.so.
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <libgen.h>
+#include <libpmem.h>
+
+#include "../fio.h"
+#include "../verify.h"
+
+/*
+ * Limits us to 1GiB of mapped files in total to model after
+ * libpmem engine behavior
+ */
+#define MMAP_TOTAL_SZ   (1 * 1024 * 1024 * 1024UL)
+
+struct fio_libpmem_data {
+       void *libpmem_ptr;
+       size_t libpmem_sz;
+       off_t libpmem_off;
+};
+
+#define MEGABYTE ((uintptr_t)1 << 20)
+#define GIGABYTE ((uintptr_t)1 << 30)
+#define PROCMAXLEN 2048 /* maximum expected line length in /proc files */
+#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))
+
+static int Mmap_no_random;
+static void *Mmap_hint;
+static unsigned long long Mmap_align;
+static unsigned long long Pagesize = 0;
+
+/*
+ * util_map_hint_align -- choose the desired mapping alignment
+ *
+ * Use 2MB/1GB page alignment only if the mapping length is at least
+ * twice as big as the page size.
+ */
+static inline size_t
+util_map_hint_align(size_t len, size_t req_align)
+{
+       size_t align = 0;
+
+       dprint(FD_IO, "DEBUG util_map_hint_align\n" );
+#ifndef WIN32
+       Mmap_align = Pagesize;
+#else
+       if (Mmap_align == 0) {
+               SYSTEM_INFO si;
+               GetSystemInfo(&si);
+               Mmap_align = si.dwAllocationGranularity;
+       }
+#endif
+
+       align = Mmap_align;
+
+       if (req_align)
+               align = req_align;
+       else if (len >= 2 * GIGABYTE)
+               align = GIGABYTE;
+       else if (len >= 4 * MEGABYTE)
+               align = 2 * MEGABYTE;
+
+       dprint(FD_IO, "align=%d\n", (int)align);
+       return align;
+}
+
+#ifdef __FreeBSD__
+static const char *sscanf_os = "%p %p";
+#define MAP_NORESERVE 0
+#define OS_MAPFILE "/proc/curproc/map"
+#else
+static const char *sscanf_os = "%p-%p";
+#define OS_MAPFILE "/proc/self/maps"
+#endif
+
+/*
+ * util_map_hint_unused -- use /proc to determine a hint address for mmap()
+ *
+ * This is a helper function for util_map_hint().
+ * It opens up /proc/self/maps and looks for the first unused address
+ * in the process address space that is:
+ * - greater or equal 'minaddr' argument,
+ * - large enough to hold range of given length,
+ * - aligned to the specified unit.
+ *
+ * Asking for aligned address like this will allow the DAX code to use large
+ * mappings.  It is not an error if mmap() ignores the hint and chooses
+ * different address.
+ */
+static char * util_map_hint_unused(void *minaddr, size_t len, size_t align)
+{
+       char *lo = NULL;        /* beginning of current range in maps file */
+       char *hi = NULL;        /* end of current range in maps file */
+       char *raddr = minaddr;  /* ignore regions below 'minaddr' */
+
+#ifdef WIN32
+       MEMORY_BASIC_INFORMATION mi;
+#else
+       FILE *fp;
+       char line[PROCMAXLEN];  /* for fgets() */
+#endif
+
+       dprint(FD_IO, "DEBUG util_map_hint_unused\n");
+       assert(align > 0);
+
+       /* XXX - replace sysconf() with util_get_sys_xxx() */
+       Pagesize = (unsigned long) sysconf(_SC_PAGESIZE);
+
+       if (raddr == NULL)
+               raddr += Pagesize;
+
+       raddr = (char *)roundup((uintptr_t)raddr, align);
+
+#ifdef WIN32
+       while ((uintptr_t)raddr < UINTPTR_MAX - len) {
+               size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
+               if (ret == 0) {
+                       ERR("VirtualQuery %p", raddr);
+                       return MAP_FAILED;
+               }
+               dprint(FD_IO, "addr %p len %zu state %d",
+                               mi.BaseAddress, mi.RegionSize, mi.State);
+
+               if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
+                       raddr = (char *)mi.BaseAddress + mi.RegionSize;
+                       raddr = (char *)roundup((uintptr_t)raddr, align);
+                       dprint(FD_IO, "nearest aligned addr %p", raddr);
+               } else {
+                       dprint(FD_IO, "unused region of size %zu found at %p",
+                                       mi.RegionSize, mi.BaseAddress);
+                       return mi.BaseAddress;
+               }
+       }
+
+       dprint(FD_IO, "end of address space reached");
+       return MAP_FAILED;
+#else
+       if ((fp = fopen(OS_MAPFILE, "r")) == NULL) {
+               log_err("!%s\n", OS_MAPFILE);
+               return MAP_FAILED;
+       }
+
+       while (fgets(line, PROCMAXLEN, fp) != NULL) {
+               /* check for range line */
+               if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
+                       dprint(FD_IO, "%p-%p\n", lo, hi);
+                       if (lo > raddr) {
+                               if ((uintptr_t)(lo - raddr) >= len) {
+                                       dprint(FD_IO, "unused region of size "
+                                                       "%zu found at %p\n",
+                                                       lo - raddr, raddr);
+                                       break;
+                               } else {
+                                       dprint(FD_IO, "region is too small: "
+                                                       "%zu < %zu\n",
+                                                       lo - raddr, len);
+                               }
+                       }
+
+                       if (hi > raddr) {
+                               raddr = (char *)roundup((uintptr_t)hi, align);
+                               dprint(FD_IO, "nearest aligned addr %p\n",
+                                               raddr);
+                       }
+
+                       if (raddr == 0) {
+                               dprint(FD_IO, "end of address space reached\n");
+                               break;
+                       }
+               }
+       }
+
+       /*
+        * Check for a case when this is the last unused range in the address
+        * space, but is not large enough. (very unlikely)
+        */
+       if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
+               dprint(FD_IO, "end of address space reached");
+               raddr = MAP_FAILED;
+       }
+
+       fclose(fp);
+
+       dprint(FD_IO, "returning %p", raddr);
+       return raddr;
+#endif
+}
+
+/*
+ * util_map_hint -- determine hint address for mmap()
+ *
+ * If the PMEM_MMAP_HINT environment variable is not set, we let the system
+ * pick a randomized mapping address.  Otherwise, a user-defined hint address
+ * is used.
+ *
+ * Windows Environment:
+ *   XXX - Windows doesn't support large DAX pages yet, so there is
+ *   no point in aligning for the same.
+ *
+ * Except for Windows Environment:
+ *   ASLR in the 64-bit Linux kernel uses 28 bits of randomness for mmap
+ *   (bit positions 12-39), which means the base mapping address is randomized
+ *   within [0..1024GB] range, with 4KB granularity.  Assuming additional
+ *   1GB alignment, it results in 1024 possible locations.
+ *
+ *   Configuring the hint address via the PMEM_MMAP_HINT environment variable
+ *   disables address randomization.  In that case, the function will search for
+ *   the first unused, properly aligned region of given size, above the
+ *   specified address.
+ */
+static char * util_map_hint(size_t len, size_t req_align)
+{
+       char *addr;
+       size_t align = 0;
+       char *e = NULL;
+
+       dprint(FD_IO, "DEBUG util_map_hint\n");
+       dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);
+
+       /* choose the desired alignment based on the requested length */
+       align = util_map_hint_align(len, req_align);
+
+       e = getenv("PMEM_MMAP_HINT");
+       if (e) {
+               char *endp;
+               unsigned long long val = 0;
+
+               errno = 0;
+
+               val = strtoull(e, &endp, 16);
+               if (errno || endp == e) {
+                       dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
+               } else {
+                       Mmap_hint = (void *)val;
+                       Mmap_no_random = 1;
+                       dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
+               }
+       }
+
+       if (Mmap_no_random) {
+               dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
+               addr = util_map_hint_unused((void *)Mmap_hint, len, align);
+       } else {
+               /*
+                * Create a dummy mapping to find an unused region of the given
+                * size.  Request an increased size for later address alignment.
+                *
+                * Windows Environment: 
+                *   Use MAP_NORESERVE flag to only reserve the range of pages
+                *   rather than commit.  We don't want the pages to be actually
+                *   backed by the operating system paging file, as the swap
+                *   file is usually too small to handle terabyte pools.
+                *
+                * Except for Windows Environment:
+                *   Use MAP_PRIVATE with read-only access to simulate
+                *   zero cost for overcommit accounting.  Note: MAP_NORESERVE
+                *   flag is ignored if overcommit is disabled (mode 2).
+                */
+#ifndef WIN32
+               addr = mmap(NULL, len + align, PROT_READ,
+                               MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+#else
+               addr = mmap(NULL, len + align, PROT_READ,
+                               MAP_PRIVATE|MAP_ANONYMOUS|MAP_NORESERVE, -1, 0);
+#endif
+               if (addr != MAP_FAILED) {
+                       dprint(FD_IO, "system choice %p\n", addr);
+                       munmap(addr, len + align);
+                       addr = (char *)roundup((uintptr_t)addr, align);
+               }
+       }
+
+       dprint(FD_IO, "hint %p\n", addr);
+
+       return addr;
+}
+
+/*
+ * This is the mmap execution function
+ */
+static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
+                           size_t length, off_t off)
+{
+       struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+       int flags = 0;
+       void *addr = NULL;
+
+       dprint(FD_IO, "DEBUG fio_libpmem_file\n");
+
+       if (td_rw(td))
+               flags = PROT_READ | PROT_WRITE;
+       else if (td_write(td)) {
+               flags = PROT_WRITE;
+
+               if (td->o.verify != VERIFY_NONE)
+                       flags |= PROT_READ;
+       } else
+               flags = PROT_READ;
+
+       dprint(FD_IO, "f->file_name = %s  td->o.verify = %d \n", f->file_name,
+                       td->o.verify);
+       dprint(FD_IO, "length = %ld  flags = %d  f->fd = %d off = %ld \n",
+                       length, flags, f->fd,off);
+
+       addr = util_map_hint(length, 0);
+
+       fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
+       if (fdd->libpmem_ptr == MAP_FAILED) {
+               fdd->libpmem_ptr = NULL;
+               td_verror(td, errno, "mmap");
+       }
+
+       if (td->error && fdd->libpmem_ptr)
+               munmap(fdd->libpmem_ptr, length);
+
+       return td->error;
+}
+
+/*
+ * XXX Just mmap an appropriate portion, we cannot mmap the full extent
+ */
+static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
+{
+       struct fio_file *f = io_u->file;
+       struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+
+       dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n" );
+
+       if (io_u->buflen > f->real_file_size) {
+               log_err("libpmem: bs too big for libpmem engine\n");
+               return EIO;
+       }
+
+       fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
+       if (fdd->libpmem_sz > f->io_size)
+               fdd->libpmem_sz = f->io_size;
+
+       fdd->libpmem_off = io_u->offset;
+
+       return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
+}
+
+/*
+ * Attempt to mmap the entire file
+ */
+static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
+{
+       struct fio_file *f = io_u->file;
+       struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+       int ret;
+
+       dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n" );
+
+       if (fio_file_partial_mmap(f))
+               return EINVAL;
+
+       dprint(FD_IO," f->io_size %ld : io_u->offset %lld \n",
+                       f->io_size, io_u->offset);
+
+       if (io_u->offset != (size_t) io_u->offset ||
+                       f->io_size != (size_t) f->io_size) {
+               fio_file_set_partial_mmap(f);
+               return EINVAL;
+       }
+       fdd->libpmem_sz = f->io_size;
+       fdd->libpmem_off = 0;
+
+       ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
+       if (ret)
+               fio_file_set_partial_mmap(f);
+
+       return ret;
+}
+
+static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
+{
+       struct fio_file *f = io_u->file;
+       struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+       int ret;
+
+       dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
+       /*
+        * It fits within existing mapping, use it
+        */
+       dprint(FD_IO," io_u->offset %lld : fdd->libpmem_off %ld : "
+                       "io_u->buflen %ld : fdd->libpmem_sz %ld\n",
+                       io_u->offset, fdd->libpmem_off,
+                       io_u->buflen, fdd->libpmem_sz);
+
+       if (io_u->offset >= fdd->libpmem_off &&
+                       io_u->offset + io_u->buflen <
+                       fdd->libpmem_off + fdd->libpmem_sz)
+               goto done;
+
+       /*
+        * unmap any existing mapping
+        */
+       if (fdd->libpmem_ptr) {
+               dprint(FD_IO,"munmap \n");
+               if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
+                       return errno;
+               fdd->libpmem_ptr = NULL;
+       }
+
+       if (fio_libpmem_prep_full(td, io_u)) {
+               td_clear_error(td);
+               ret = fio_libpmem_prep_limited(td, io_u);
+               if (ret)
+                       return ret;
+       }
+
+done:
+       io_u->mmap_data = fdd->libpmem_ptr + io_u->offset - fdd->libpmem_off
+               - f->file_offset;
+       return 0;
+}
+
+static int fio_libpmem_queue(struct thread_data *td, struct io_u *io_u)
+{
+       fio_ro_check(td, io_u);
+       io_u->error = 0;
+
+       dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
+
+       switch (io_u->ddir) {
+               case DDIR_READ:
+                       memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
+                       break;
+               case DDIR_WRITE:
+                       dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
+                                       io_u->mmap_data, io_u->xfer_buf );
+                       dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
+                       if(td->o.odirect == 1){
+                               pmem_memcpy_persist(io_u->mmap_data,
+                                               io_u->xfer_buf,
+                                               io_u->xfer_buflen);
+                       } else {
+                               pmem_memcpy_nodrain(io_u->mmap_data,
+                                               io_u->xfer_buf,
+                                               io_u->xfer_buflen);
+                       }
+                       break;
+               case DDIR_SYNC:
+               case DDIR_DATASYNC:
+               case DDIR_SYNC_FILE_RANGE:
+                       break;
+               default:
+                       io_u->error = EINVAL;
+                       break;
+       }
+
+       return FIO_Q_COMPLETED;
+}
+
+static int fio_libpmem_init(struct thread_data *td)
+{
+       struct thread_options *o = &td->o;
+
+       dprint(FD_IO,"o->rw_min_bs %d \n o->fsync_blocks %d \n o->fdatasync_blocks %d \n",
+                       o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
+       dprint(FD_IO, "DEBUG fio_libpmem_init\n");
+
+       if ((o->rw_min_bs & page_mask) &&
+                       (o->fsync_blocks || o->fdatasync_blocks)) {
+               log_err("libpmem: mmap options dictate a minimum block size of "
+                               "%llu bytes\n", (unsigned long long) page_size);
+               return 1;
+       }
+       return 0;
+}
+
+static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
+{
+       struct fio_libpmem_data *fdd;
+       int ret;
+
+       dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
+       dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
+       dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
+       dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
+       dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);
+
+       ret = generic_open_file(td, f);
+       if (ret)
+               return ret;
+
+       fdd = calloc(1, sizeof(*fdd));
+       if (!fdd) {
+               int fio_unused __ret;
+               __ret = generic_close_file(td, f);
+               return 1;
+       }
+
+       FILE_SET_ENG_DATA(f, fdd);
+
+       return 0;
+}
+
+static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
+{
+       struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
+
+       dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
+       dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
+
+       if (td->o.odirect != 1) {
+               dprint(FD_IO,"pmem_drain\n");
+               pmem_drain();
+       }
+
+       FILE_SET_ENG_DATA(f, NULL);
+       free(fdd);
+       fio_file_clear_partial_mmap(f);
+
+       return generic_close_file(td, f);
+}
+
+static struct ioengine_ops ioengine = {
+       .name           = "libpmem",
+       .version        = FIO_IOOPS_VERSION,
+       .init           = fio_libpmem_init,
+       .prep           = fio_libpmem_prep,
+       .queue          = fio_libpmem_queue,
+       .open_file      = fio_libpmem_open_file,
+       .close_file     = fio_libpmem_close_file,
+       .get_file_size  = generic_get_file_size,
+       .flags          = FIO_SYNCIO |FIO_NOEXTEND,
+};
+
+static void fio_init fio_libpmem_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_libpmem_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
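
For readers unfamiliar with libpmem, the engine's write path boils down
to the pattern sketched below (a standalone illustration, not part of
the patch; error handling omitted). With direct=1 every write is made
durable immediately; with direct=0 the drain is deferred until the
file is closed.

  #include <libpmem.h>
  #include <stddef.h>

  /* Copy a write buffer into a pmem mapping, mirroring fio_libpmem_queue() */
  static void copy_to_pmem(void *dst, const void *src, size_t len, int direct)
  {
          if (direct) {
                  /* direct=1: copy, flush and drain in one call */
                  pmem_memcpy_persist(dst, src, len);
          } else {
                  /* direct=0: copy and flush, but defer the drain;
                   * the engine calls pmem_drain() once at close time */
                  pmem_memcpy_nodrain(dst, src, len);
          }
  }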
diff --git a/examples/libpmem.fio b/examples/libpmem.fio
new file mode 100644 (file)
index 0000000..17946cb
--- /dev/null
@@ -0,0 +1,73 @@
+[global]
+bs=4k
+size=8g
+ioengine=libpmem
+norandommap
+time_based=1
+group_reporting
+invalidate=1
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+
+iodepth=1
+iodepth_batch=1
+thread=1
+numjobs=1
+
+#
+# With 'scramble_buffers=1', the source buffer
+# is rewritten with a random value on every write operation.
+#
+# But when 'scramble_buffers=0', the source buffer isn't rewritten,
+# so it is likely to stay in the CPU cache, which tends to give
+# higher performance.
+#
+scramble_buffers=0
+
+#
+# direct=0:
+#   Using pmem_memcpy_nodrain() for write operation
+#
+# direct=1:
+#   Using pmem_memcpy_persist() for write operation
+#
+direct=0
+
+#
+# Setting for fio process's CPU Node and Memory Node
+#
+numa_cpu_nodes=0
+numa_mem_policy=bind:0
+
+#
+# split means that each job will get a unique CPU from the CPU set
+#
+cpus_allowed_policy=split
+
+#
+# The libpmem engine does IO to files in a DAX-mounted filesystem.
+# The filesystem should be created on an NVDIMM (e.g. /dev/pmem0)
+# and then mounted with the '-o dax' option.  Note that the engine
+# accesses the underlying NVDIMM directly, bypassing the kernel block
+# layer, so the usual filesystem/disk performance monitoring tools such
+# as iostat will not provide useful data.
+#
+directory=/mnt/pmem0
+
+[libpmem-seqwrite]
+rw=write
+stonewall
+
+#[libpmem-seqread]
+#rw=read
+#stonewall
+
+#[libpmem-randwrite]
+#rw=randwrite
+#stonewall
+
+#[libpmem-randread]
+#rw=randread
+#stonewall
diff --git a/fio.1 b/fio.1
index 1f9fffcf9bf8bfcc97d3a828cf7ddab808523323..bd7670a95c493f991467ccac3c3a5b1f3f46bd09 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1597,6 +1597,11 @@ details of writing an external I/O engine.
 Simply create the files and do no I/O to them.  You still need to set
 \fBfilesize\fR so that all the accounting still occurs, but no actual I/O will be
 done other than creating the file.
+.TP
+.B libpmem
+Read and write using mmap I/O to a file on a filesystem
+mounted with DAX on a persistent memory device through the NVML
+libpmem library.
 .SS "I/O engine specific parameters"
 In addition, there are some parameters which are only valid when a specific
 \fBioengine\fR is in use. These are used identically to normal parameters,
diff --git a/options.c b/options.c
index e8d1a3abbbf931229b83845170e1f33dec2d55bf..a0fcd8f10b67fc72dda75d6e040470c4a9b3ff2e 100644 (file)
--- a/options.c
+++ b/options.c
@@ -1851,6 +1851,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .help = "Load external engine (append name)",
                            .cb = str_ioengine_external_cb,
                          },
+#ifdef CONFIG_LIBPMEM
+                         { .ival = "libpmem",
+                           .help = "NVML libpmem based IO engine",
+                         },
+#endif
                },
        },
        {