ioengine: Add libcufile I/O engine
Author:     Brian T. Smith <bsmith@systemfabricworks.com>
AuthorDate: Tue, 3 Nov 2020 23:54:55 +0000
Commit:     Brian T. Smith <bsmith@systemfabricworks.com>
CommitDate: Sat, 5 Dec 2020 20:46:46 +0000
The libcufile I/O engine uses the NVIDIA GPUDirect Storage (GDS) cuFile API to
perform synchronous I/O directly against GPU buffers via nvidia-fs and a
GDS-supported filesystem.

'configure --enable-libcufile' enables the libcufile engine.

CFLAGS must specify the location of CUDA and cuFile headers.
e.g. CFLAGS="-I/usr/local/cuda/include -I/usr/local/cuda/lib64"

LDFLAGS must specify the location of CUDA and cuFile libraries.
e.g. LDFLAGS="-L/usr/local/cuda/lib64"

The paths used in CFLAGS and LDFLAGS depend upon the build host's
CUDA installation.
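
For example, assuming a CUDA installation under /usr/local/cuda (adjust
the paths for the build host), a build might look like:

  CFLAGS="-I/usr/local/cuda/include -I/usr/local/cuda/lib64" \
  LDFLAGS="-L/usr/local/cuda/lib64" \
  ./configure --enable-libcufile
  make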

The libcufile engine adds the following options: gpu_dev_ids and cuda_io.
Usage is documented in HOWTO, fio.1, examples/libcufile-cufile.fio,
and examples/libcufile-posix.fio.

Note that enabling verify when cuda_io=cufile requires cudaMemcpy() to
populate the GPU buffer on a write and the CPU buffer on a read, which
runs counter to the primary goal of GDS: avoiding copies between CPU
and GPU buffers.
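
As a rough sketch, a minimal cuda_io=cufile job might look like the
following (/mnt/gds is a hypothetical GDS-capable mount point; see
examples/libcufile-cufile.fio for a complete job file):

  [global]
  ioengine=libcufile
  directory=/mnt/gds
  gpu_dev_ids=0
  cuda_io=cufile
  # direct=1 is required when cuda_io=cufile
  direct=1
  bs=1m
  size=1G
  # use threads; cudaMalloc can fail if too many processes attach to the GPU
  thread

  [gds-read]
  rw=read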

Signed-off-by: Brian T. Smith <bsmith@systemfabricworks.com>
HOWTO
Makefile
configure
engines/libcufile.c [new file with mode: 0644]
examples/libcufile-cufile.fio [new file with mode: 0644]
examples/libcufile-posix.fio [new file with mode: 0644]
fio.1
optgroup.c
optgroup.h

diff --git a/HOWTO b/HOWTO
index 386fd12aa7b14b80405a4df2851b3c0f7e967fd0..7e46cee0eceac9f8d85c5b6191b3335dd72c2e14 100644
--- a/HOWTO
+++ b/HOWTO
@@ -2048,6 +2048,14 @@ I/O engine
                **nbd**
                        Read and write a Network Block Device (NBD).
 
+               **libcufile**
+                       I/O engine supporting libcufile synchronous access to nvidia-fs and a
+                       GPUDirect Storage-supported filesystem. This engine performs
+                       I/O without transferring buffers between user-space and the kernel,
+                       unless :option:`verify` is set or :option:`cuda_io` is `posix`.
+                       :option:`iomem` must not be `cudamalloc`. This ioengine defines
+                       engine-specific options.
+
 I/O engine specific parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2398,6 +2406,28 @@ with the caveat that when used on the command line, they must come after the
        nbd+unix:///?socket=/tmp/socket
        nbds://tlshost/exportname
 
+.. option:: gpu_dev_ids=str : [libcufile]
+
+       Specify the GPU IDs to use with CUDA. This is a colon-separated list of
+       ints. GPUs are assigned to workers round-robin. Default is 0.
+
+.. option:: cuda_io=str : [libcufile]
+
+       Specify the type of I/O to use with CUDA. Default is **cufile**.
+
+       **cufile**
+               Use libcufile and nvidia-fs. This option performs I/O directly
+               between a GPUDirect Storage filesystem and GPU buffers,
+               avoiding use of a bounce buffer. If :option:`verify` is set,
+               cudaMemcpy is used to copy verification data between RAM and GPU.
+               Verification data is copied from RAM to GPU before a write
+               and from GPU to RAM after a read. :option:`direct` must be 1.
+       **posix**
+               Use POSIX to perform I/O with a RAM buffer, and use cudaMemcpy
+               to transfer data between RAM and the GPUs. Data is copied from
+               GPU to RAM before a write and copied from RAM to GPU after a
+               read. :option:`verify` does not affect use of cudaMemcpy.
+
 I/O depth
 ~~~~~~~~~
 
diff --git a/Makefile b/Makefile
index ecfaa3e047f243de2a2a2e5d725f2e416417e927..a838af9a773b6aee8568d058366a045fb0cc8e8a 100644
--- a/Makefile
+++ b/Makefile
@@ -103,6 +103,9 @@ endif
 ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
   SOURCE += engines/e4defrag.c
 endif
+ifdef CONFIG_LIBCUFILE
+  SOURCE += engines/libcufile.c
+endif
 ifdef CONFIG_LINUX_SPLICE
   SOURCE += engines/splice.c
 endif
diff --git a/configure b/configure
index d2ca8934f3bed5e3ba73f0551990cbfedcc10a4a..d247a041e463f2dd79c4af0f0301457475a46e5a 100755
--- a/configure
+++ b/configure
@@ -162,6 +162,7 @@ pmemblk="no"
 devdax="no"
 pmem="no"
 cuda="no"
+libcufile="no"
 disable_lex=""
 disable_pmem="no"
 disable_native="no"
@@ -224,6 +225,8 @@ for opt do
   ;;
   --enable-cuda) cuda="yes"
   ;;
+  --enable-libcufile) libcufile="yes"
+  ;;
   --disable-native) disable_native="yes"
   ;;
   --with-ime=*) ime_path="$optarg"
@@ -272,6 +275,7 @@ if test "$show_help" = "yes" ; then
   echo "--disable-shm           Disable SHM support"
   echo "--disable-optimizations Don't enable compiler optimizations"
   echo "--enable-cuda           Enable GPUDirect RDMA support"
+  echo "--enable-libcufile      Enable GPUDirect Storage cuFile support"
   echo "--disable-native        Don't build for native host"
   echo "--with-ime=             Install path for DDN's Infinite Memory Engine"
   echo "--enable-libiscsi       Enable iscsi support"
@@ -2495,6 +2499,29 @@ EOF
 fi
 print_config "cuda" "$cuda"
 
+##########################################
+# libcufile probe
+if test "$libcufile" != "no" ; then
+cat > $TMPC << EOF
+#include <cufile.h>
+
+int main(int argc, char* argv[]) {
+   cuFileDriverOpen();
+   return 0;
+}
+EOF
+  if compile_prog "" "-lcuda -lcudart -lcufile" "libcufile"; then
+    libcufile="yes"
+    LIBS="-lcuda -lcudart -lcufile $LIBS"
+  else
+    if test "$libcufile" = "yes" ; then
+      feature_not_found "libcufile" ""
+    fi
+    libcufile="no"
+  fi
+fi
+print_config "libcufile" "$libcufile"
+
 ##########################################
 # check for cc -march=native
 build_native="no"
@@ -2966,6 +2993,9 @@ fi
 if test "$cuda" = "yes" ; then
   output_sym "CONFIG_CUDA"
 fi
+if test "$libcufile" = "yes" ; then
+  output_sym "CONFIG_LIBCUFILE"
+fi
 if test "$march_set" = "no" && test "$build_native" = "yes" ; then
   output_sym "CONFIG_BUILD_NATIVE"
 fi
diff --git a/engines/libcufile.c b/engines/libcufile.c
new file mode 100644
index 0000000..e575b78
--- /dev/null
+++ b/engines/libcufile.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
+ * mailto:info@systemfabricworks.com
+ *
+ * License: GPLv2, see COPYING.
+ *
+ * libcufile engine
+ *
+ * fio I/O engine using the NVIDIA cuFile API.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <cufile.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <pthread.h>
+
+#include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
+#include "../lib/memalign.h"
+
+#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)
+
+#define LOGGED_BUFLEN_NOT_ALIGNED     0x01
+#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
+#define GPU_ID_SEP ":"
+
+enum {
+       IO_CUFILE    = 1,
+       IO_POSIX     = 2
+};
+
+struct libcufile_options {
+       struct thread_data *td;
+       char               *gpu_ids;       /* colon-separated list of GPU ids,
+                                             one per job */
+       void               *cu_mem_ptr;    /* GPU memory */
+       void               *junk_buf;      /* buffer to simulate cudaMemcpy with
+                                             posix I/O write */
+       int                 my_gpu_id;     /* GPU id to use for this job */
+       unsigned int        cuda_io;       /* Type of I/O to use with CUDA */
+       size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
+       int                 logged;        /* bitmask of log messages that have
+                                             been output, prevent flood */
+};
+
+struct fio_libcufile_data {
+       CUfileDescr_t  cf_descr;
+       CUfileHandle_t cf_handle;
+};
+
+static struct fio_option options[] = {
+       {
+               .name     = "gpu_dev_ids",
+               .lname    = "libcufile engine gpu dev ids",
+               .type     = FIO_OPT_STR_STORE,
+               .off1     = offsetof(struct libcufile_options, gpu_ids),
+               .help     = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
+               .category = FIO_OPT_C_ENGINE,
+               .group    = FIO_OPT_G_LIBCUFILE,
+       },
+       {
+               .name     = "cuda_io",
+               .lname    = "libcufile cuda io",
+               .type     = FIO_OPT_STR,
+               .off1     = offsetof(struct libcufile_options, cuda_io),
+               .help     = "Type of I/O to use with CUDA",
+               .def      = "cufile",
+               .posval   = {
+                           { .ival = "cufile",
+                             .oval = IO_CUFILE,
+                             .help = "libcufile nvidia-fs"
+                           },
+                           { .ival = "posix",
+                             .oval = IO_POSIX,
+                             .help = "POSIX I/O"
+                           }
+               },
+               .category = FIO_OPT_C_ENGINE,
+               .group    = FIO_OPT_G_LIBCUFILE,
+       },
+       {
+               .name    = NULL,
+       },
+};
+
+static int running = 0;
+static int cufile_initialized = 0;
+static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
+
+#define check_cudaruntimecall(fn, rc)                                               \
+       do {                                                                        \
+               cudaError_t res = fn;                                               \
+               if (res != cudaSuccess) {                                           \
+                       const char *str = cudaGetErrorName(res);                    \
+                       log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
+                               #fn, __LINE__, res, str);                           \
+                       rc = -1;                                                    \
+               } else                                                              \
+                       rc = 0;                                                     \
+       } while(0)
+
+static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
+{
+       if (IS_CUFILE_ERR(st.err))
+               return cufileop_status_error(st.err);
+       return "unknown";
+}
+
+/*
+ * Assign GPUs to subjobs round-robin, similar to how multiple
+ * entries in 'directory' are handled by fio.
+ */
+static int fio_libcufile_find_gpu_id(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+       int gpu_id = 0;
+
+       if (o->gpu_ids != NULL) {
+               char *gpu_ids, *pos, *cur;
+               int i, id_count, gpu_idx;
+
+               for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
+                       cur = strchr(cur, GPU_ID_SEP[0]);
+                       if (cur != NULL)
+                               cur++;
+               }
+
+               gpu_idx = td->subjob_number % id_count;
+
+               pos = gpu_ids = strdup(o->gpu_ids);
+               if (gpu_ids == NULL) {
+                       log_err("strdup(gpu_ids): err=%d\n", errno);
+                       return -1;
+               }
+
+               i = 0;
+               while (pos != NULL && i <= gpu_idx) {
+                       i++;
+                       cur = strsep(&pos, GPU_ID_SEP);
+               }
+
+               if (cur)
+                       gpu_id = atoi(cur);
+
+               free(gpu_ids);
+       }
+
+       return gpu_id;
+}
+
+static int fio_libcufile_init(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+       CUfileError_t status;
+       int initialized;
+       int rc;
+
+       pthread_mutex_lock(&running_lock);
+       if (running == 0) {
+               assert(cufile_initialized == 0);
+               if (o->cuda_io == IO_CUFILE) {
+                       /* only open the driver if this is the first worker thread */
+                       status = cuFileDriverOpen();
+                       if (status.err != CU_FILE_SUCCESS)
+                               log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
+                                       fio_libcufile_get_cuda_error(status));
+                       else
+                               cufile_initialized = 1;
+               }
+       }
+       running++;
+       initialized = cufile_initialized;
+       pthread_mutex_unlock(&running_lock);
+
+       if (o->cuda_io == IO_CUFILE && !initialized)
+               return 1;
+
+       o->my_gpu_id = fio_libcufile_find_gpu_id(td);
+       if (o->my_gpu_id < 0)
+               return 1;
+
+       dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
+       check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
+       if (rc != 0)
+               return 1;
+
+       return 0;
+}
+
+static inline int fio_libcufile_pre_write(struct thread_data *td,
+                                         struct libcufile_options *o,
+                                         struct io_u *io_u,
+                                         size_t gpu_offset)
+{
+       int rc = 0;
+
+       if (o->cuda_io == IO_CUFILE) {
+               if (td->o.verify) {
+                       /*
+                         Data is being verified, so copy the io_u buffer to GPU memory.
+                         This isn't done in the non-verify case because the data would
+                         already be in GPU memory in a normal cuFile application.
+                       */
+                       check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+                                                        io_u->xfer_buf,
+                                                        io_u->xfer_buflen,
+                                                        cudaMemcpyHostToDevice), rc);
+                       if (rc != 0) {
+                               log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
+                               io_u->error = EIO;
+                       }
+               }
+       } else if (o->cuda_io == IO_POSIX) {
+
+               /*
+                 POSIX I/O is being used, so the data has to be copied out of the
+                 GPU into a CPU buffer. GPU memory doesn't contain the actual
+                 data to write; copy the data to the junk buffer. The purpose
+                 of this is to add the overhead of cudaMemcpy() that would be
+                 present in a POSIX I/O CUDA application.
+               */
+               check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
+                                                ((char*) o->cu_mem_ptr) + gpu_offset,
+                                                io_u->xfer_buflen,
+                                                cudaMemcpyDeviceToHost), rc);
+               if (rc != 0) {
+                       log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
+                       io_u->error = EIO;
+               }
+       } else {
+               log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+               assert(0);
+               rc = EINVAL;
+       }
+
+       return rc;
+}
+
+static inline int fio_libcufile_post_read(struct thread_data *td,
+                                         struct libcufile_options *o,
+                                         struct io_u *io_u,
+                                         size_t gpu_offset)
+{
+       int rc = 0;
+
+       if (o->cuda_io == IO_CUFILE) {
+               if (td->o.verify) {
+                       /* Copy GPU memory to CPU buffer for verify */
+                       check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
+                                                        ((char*) o->cu_mem_ptr) + gpu_offset,
+                                                        io_u->xfer_buflen,
+                                                        cudaMemcpyDeviceToHost), rc);
+                       if (rc != 0) {
+                               log_err("DDIR_READ cudaMemcpy D2H failed\n");
+                               io_u->error = EIO;
+                       }
+               }
+       } else if (o->cuda_io == IO_POSIX) {
+               /* POSIX I/O read, copy the CPU buffer to GPU memory */
+               check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+                                                io_u->xfer_buf,
+                                                io_u->xfer_buflen,
+                                                cudaMemcpyHostToDevice), rc);
+               if (rc != 0) {
+                       log_err("DDIR_READ cudaMemcpy H2D failed\n");
+                       io_u->error = EIO;
+               }
+       } else {
+               log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+               assert(0);
+               rc = EINVAL;
+       }
+
+       return rc;
+}
+
+static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
+                                            struct io_u *io_u)
+{
+       struct libcufile_options *o = td->eo;
+       struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
+       unsigned long long io_offset;
+       ssize_t sz;
+       ssize_t remaining;
+       size_t xfered;
+       size_t gpu_offset;
+       int rc;
+
+       if (o->cuda_io == IO_CUFILE && fcd == NULL) {
+               io_u->error = EINVAL;
+               td_verror(td, EINVAL, "xfer");
+               return FIO_Q_COMPLETED;
+       }
+
+       fio_ro_check(td, io_u);
+
+       switch(io_u->ddir) {
+       case DDIR_SYNC:
+               rc = fsync(io_u->file->fd);
+               if (rc != 0) {
+                       io_u->error = errno;
+                       log_err("fsync: err=%d\n", errno);
+               }
+               break;
+
+       case DDIR_DATASYNC:
+               rc = fdatasync(io_u->file->fd);
+               if (rc != 0) {
+                       io_u->error = errno;
+                       log_err("fdatasync: err=%d\n", errno);
+               }
+               break;
+
+       case DDIR_READ:
+       case DDIR_WRITE:
+               /*
+                 There may be a better way to calculate gpu_offset. The intent is
+                 that gpu_offset equals the difference between io_u->xfer_buf and
+                 the page-aligned base address for io_u buffers.
+               */
+               gpu_offset = io_u->index * io_u->xfer_buflen;
+               io_offset = io_u->offset;
+               remaining = io_u->xfer_buflen;
+
+               xfered = 0;
+               sz = 0;
+
+               assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
+
+               if (o->cuda_io == IO_CUFILE) {
+                       if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
+                             (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
+                               log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
+                               o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
+                       }
+
+                       if (!(ALIGNED_4KB(gpu_offset) ||
+                             (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
+                               log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
+                               o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
+                       }
+               }
+
+               if (io_u->ddir == DDIR_WRITE)
+                       rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
+
+               if (io_u->error != 0)
+                       break;
+
+               while (remaining > 0) {
+                       assert(gpu_offset + xfered <= o->total_mem);
+                       if (io_u->ddir == DDIR_READ) {
+                               if (o->cuda_io == IO_CUFILE) {
+                                       sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
+                                                       io_offset + xfered, gpu_offset + xfered);
+                                       if (sz == -1) {
+                                               io_u->error = errno;
+                                               log_err("cuFileRead: err=%d\n", errno);
+                                       } else if (sz < 0) {
+                                               io_u->error = EIO;
+                                               log_err("cuFileRead: err=%ld:%s\n", sz,
+                                                       cufileop_status_error(-sz));
+                                       }
+                               } else if (o->cuda_io == IO_POSIX) {
+                                       sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
+                                                  remaining, io_offset + xfered);
+                                       if (sz < 0) {
+                                               io_u->error = errno;
+                                               log_err("pread: err=%d\n", errno);
+                                       }
+                               } else {
+                                       log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+                                       io_u->error = -1;
+                                       assert(0);
+                               }
+                       } else if (io_u->ddir == DDIR_WRITE) {
+                               if (o->cuda_io == IO_CUFILE) {
+                                       sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
+                                                        io_offset + xfered, gpu_offset + xfered);
+                                       if (sz == -1) {
+                                               io_u->error = errno;
+                                               log_err("cuFileWrite: err=%d\n", errno);
+                                       } else if (sz < 0) {
+                                               io_u->error = EIO;
+                                               log_err("cuFileWrite: err=%ld:%s\n", sz,
+                                                       cufileop_status_error(-sz));
+                                       }
+                               } else if (o->cuda_io == IO_POSIX) {
+                                       sz = pwrite(io_u->file->fd,
+                                                   ((char*) io_u->xfer_buf) + xfered,
+                                                   remaining, io_offset + xfered);
+                                       if (sz < 0) {
+                                               io_u->error = errno;
+                                               log_err("pwrite: err=%d\n", errno);
+                                       }
+                               } else {
+                                       log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+                                       io_u->error = -1;
+                                       assert(0);
+                               }
+                       } else {
+                               log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
+                               io_u->error = -1;
+                               assert(0);
+                               break;
+                       }
+
+                       if (io_u->error != 0)
+                               break;
+
+                       remaining -= sz;
+                       xfered += sz;
+
+                       if (remaining != 0)
+                               log_info("Incomplete %s: %ld bytes remaining\n",
+                                        io_u->ddir == DDIR_READ? "read" : "write", remaining);
+               }
+
+               if (io_u->error != 0)
+                       break;
+
+               if (io_u->ddir == DDIR_READ)
+                       rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
+               break;
+
+       default:
+               io_u->error = EINVAL;
+               break;
+       }
+
+       if (io_u->error != 0) {
+               log_err("IO failed\n");
+               td_verror(td, io_u->error, "xfer");
+       }
+
+       return FIO_Q_COMPLETED;
+}
+
+static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
+{
+       struct libcufile_options *o = td->eo;
+       struct fio_libcufile_data *fcd = NULL;
+       int rc;
+       CUfileError_t status;
+
+       rc = generic_open_file(td, f);
+       if (rc)
+               return rc;
+
+       if (o->cuda_io == IO_CUFILE) {
+               fcd = calloc(1, sizeof(*fcd));
+               if (fcd == NULL) {
+                       rc = ENOMEM;
+                       goto exit_err;
+               }
+
+               fcd->cf_descr.handle.fd = f->fd;
+               fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+               status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
+               if (status.err != CU_FILE_SUCCESS) {
+                       log_err("cufile register: err=%d:%s\n", status.err,
+                               fio_libcufile_get_cuda_error(status));
+                       rc = EINVAL;
+                       goto exit_err;
+               }
+       }
+
+       FILE_SET_ENG_DATA(f, fcd);
+       return 0;
+
+exit_err:
+       if (fcd) {
+               free(fcd);
+               fcd = NULL;
+       }
+       if (f) {
+               int rc2 = generic_close_file(td, f);
+               if (rc2)
+                       log_err("generic_close_file: err=%d\n", rc2);
+       }
+       return rc;
+}
+
+static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
+{
+       struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
+       int rc;
+
+       if (fcd != NULL) {
+               cuFileHandleDeregister(fcd->cf_handle);
+               FILE_SET_ENG_DATA(f, NULL);
+               free(fcd);
+       }
+
+       rc = generic_close_file(td, f);
+
+       return rc;
+}
+
+static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+       struct libcufile_options *o = td->eo;
+       int rc;
+       CUfileError_t status;
+
+       o->total_mem = total_mem;
+       o->logged = 0;
+       o->cu_mem_ptr = NULL;
+       o->junk_buf = NULL;
+       td->orig_buffer = calloc(1, total_mem);
+       if (!td->orig_buffer) {
+               log_err("orig_buffer calloc failed: err=%d\n", errno);
+               goto exit_error;
+       }
+
+       if (o->cuda_io == IO_POSIX) {
+               o->junk_buf = calloc(1, total_mem);
+               if (o->junk_buf == NULL) {
+                       log_err("junk_buf calloc failed: err=%d\n", errno);
+                       goto exit_error;
+               }
+       }
+
+       dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
+       check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
+       if (rc != 0)
+               goto exit_error;
+       check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
+       if (rc != 0)
+               goto exit_error;
+
+       if (o->cuda_io == IO_CUFILE) {
+               status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
+               if (status.err != CU_FILE_SUCCESS) {
+                       log_err("cuFileBufRegister: err=%d:%s\n", status.err,
+                               fio_libcufile_get_cuda_error(status));
+                       goto exit_error;
+               }
+       }
+
+       return 0;
+
+exit_error:
+       if (td->orig_buffer) {
+               free(td->orig_buffer);
+               td->orig_buffer = NULL;
+       }
+       if (o->junk_buf) {
+               free(o->junk_buf);
+               o->junk_buf = NULL;
+       }
+       if (o->cu_mem_ptr) {
+               cudaFree(o->cu_mem_ptr);
+               o->cu_mem_ptr = NULL;
+       }
+       return 1;
+}
+
+static void fio_libcufile_iomem_free(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+
+       if (o->junk_buf) {
+               free(o->junk_buf);
+               o->junk_buf = NULL;
+       }
+       if (o->cu_mem_ptr) {
+               if (o->cuda_io == IO_CUFILE)
+                       cuFileBufDeregister(o->cu_mem_ptr);
+               cudaFree(o->cu_mem_ptr);
+               o->cu_mem_ptr = NULL;
+       }
+       if (td->orig_buffer) {
+               free(td->orig_buffer);
+               td->orig_buffer = NULL;
+       }
+}
+
+static void fio_libcufile_cleanup(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+
+       pthread_mutex_lock(&running_lock);
+       running--;
+       assert(running >= 0);
+       if (running == 0) {
+               /* only close the driver if initialized and
+                  this is the last worker thread */
+               if (o->cuda_io == IO_CUFILE && cufile_initialized)
+                       cuFileDriverClose();
+               cufile_initialized = 0;
+       }
+       pthread_mutex_unlock(&running_lock);
+}
+
+FIO_STATIC struct ioengine_ops ioengine = {
+       .name                = "libcufile",
+       .version             = FIO_IOOPS_VERSION,
+       .init                = fio_libcufile_init,
+       .queue               = fio_libcufile_queue,
+       .open_file           = fio_libcufile_open_file,
+       .close_file          = fio_libcufile_close_file,
+       .iomem_alloc         = fio_libcufile_iomem_alloc,
+       .iomem_free          = fio_libcufile_iomem_free,
+       .cleanup             = fio_libcufile_cleanup,
+       .flags               = FIO_SYNCIO,
+       .options             = options,
+       .option_struct_size  = sizeof(struct libcufile_options)
+};
+
+void fio_init fio_libcufile_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+void fio_exit fio_libcufile_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
diff --git a/examples/libcufile-cufile.fio b/examples/libcufile-cufile.fio
new file mode 100644
index 0000000..94a64b5
--- /dev/null
+++ b/examples/libcufile-cufile.fio
@@ -0,0 +1,42 @@
+# Example libcufile job, using cufile I/O
+#
+# Required environment variables:
+#     GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+#     FIO_DIR     : 'directory'. This job uses cuda_io=cufile, so path(s) must
+#                   point to GPUDirect Storage filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=cufile
+# 'direct' must be 1 when using cuda_io=cufile
+direct=1
+# Performance is negatively affected if 'bs' is not a multiple of 4k.
+# Refer to GDS cuFile documentation.
+bs=1m
+size=1m
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads.
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
diff --git a/examples/libcufile-posix.fio b/examples/libcufile-posix.fio
new file mode 100644
index 0000000..2bce22e
--- /dev/null
+++ b/examples/libcufile-posix.fio
@@ -0,0 +1,41 @@
+# Example libcufile job, using POSIX I/O
+#
+# Required environment variables:
+#     GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+#     FIO_DIR     : 'directory'. cuda_io=posix, so the path(s) may point
+#                   to any POSIX filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=posix
+# 'direct' may be 1 or 0 when using cuda_io=posix
+direct=0
+# there are no unusual requirements for 'bs' when cuda_io=posix
+bs=1m
+size=1G
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
diff --git a/fio.1 b/fio.1
index 481193254832befdd285394d6424c209b8805989..45ec8d43dcbf8172318f91d12c33005037386bd9 100644
--- a/fio.1
+++ b/fio.1
@@ -1826,6 +1826,13 @@ Read and write iscsi lun with libiscsi.
 .TP
 .B nbd
 Synchronous read and write a Network Block Device (NBD).
+.TP
+.B libcufile
+I/O engine supporting libcufile synchronous access to nvidia-fs and a
+GPUDirect Storage-supported filesystem. This engine performs
+I/O without transferring buffers between user-space and the kernel,
+unless \fBverify\fR is set or \fBcuda_io\fR is \fBposix\fR. \fBiomem\fR must
+not be \fBcudamalloc\fR. This ioengine defines engine-specific options.
 .SS "I/O engine specific parameters"
 In addition, there are some parameters which are only valid when a specific
 \fBioengine\fR is in use. These are used identically to normal parameters,
@@ -2139,7 +2146,36 @@ Example URIs:
 \fInbd+unix:///?socket=/tmp/socket\fR
 .TP
 \fInbds://tlshost/exportname\fR
-
+.RE
+.RE
+.TP
+.BI (libcufile)gpu_dev_ids\fR=\fPstr
+Specify the GPU IDs to use with CUDA. This is a colon-separated list of ints.
+GPUs are assigned to workers round-robin. Default is 0.
+.TP
+.BI (libcufile)cuda_io\fR=\fPstr
+Specify the type of I/O to use with CUDA. This option
+takes the following values:
+.RS
+.RS
+.TP
+.B cufile (default)
+Use libcufile and nvidia-fs. This option performs I/O directly
+between a GPUDirect Storage filesystem and GPU buffers,
+avoiding use of a bounce buffer. If \fBverify\fR is set,
+cudaMemcpy is used to copy verification data between RAM and GPU(s).
+Verification data is copied from RAM to GPU before a write
+and from GPU to RAM after a read.
+\fBdirect\fR must be 1.
+.TP
+.BI posix
+Use POSIX to perform I/O with a RAM buffer, and use
+cudaMemcpy to transfer data between RAM and the GPU(s).
+Data is copied from GPU to RAM before a write and copied
+from RAM to GPU after a read. \fBverify\fR does not affect
+the use of cudaMemcpy.
+.RE
+.RE
 .SS "I/O depth"
 .TP
 .BI iodepth \fR=\fPint
diff --git a/optgroup.c b/optgroup.c
index c228ff292c542055459e34d89e06c58731af3e7b..647748963193db4db016b2befde7fa2ec4f3a758 100644
--- a/optgroup.c
+++ b/optgroup.c
@@ -173,6 +173,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
                .name   = "NBD I/O engine", /* NBD */
                .mask   = FIO_OPT_G_NBD,
        },
+       {
+               .name   = "libcufile I/O engine", /* libcufile */
+               .mask   = FIO_OPT_G_LIBCUFILE,
+       },
        {
                .name   = NULL,
        },
diff --git a/optgroup.h b/optgroup.h
index 5789afd32c49d2cbe5bd0c968ee6a7f3ca917a49..d2f1ceb391c34fd6101cf3e4b6a867b078b178e6 100644
--- a/optgroup.h
+++ b/optgroup.h
@@ -67,6 +67,7 @@ enum opt_category_group {
        __FIO_OPT_G_IOURING,
        __FIO_OPT_G_FILESTAT,
        __FIO_OPT_G_NR,
+       __FIO_OPT_G_LIBCUFILE,
 
        FIO_OPT_G_RATE          = (1ULL << __FIO_OPT_G_RATE),
        FIO_OPT_G_ZONE          = (1ULL << __FIO_OPT_G_ZONE),
@@ -108,6 +109,7 @@ enum opt_category_group {
        FIO_OPT_G_NBD           = (1ULL << __FIO_OPT_G_NBD),
        FIO_OPT_G_IOURING       = (1ULL << __FIO_OPT_G_IOURING),
        FIO_OPT_G_FILESTAT      = (1ULL << __FIO_OPT_G_FILESTAT),
+       FIO_OPT_G_LIBCUFILE     = (1ULL << __FIO_OPT_G_LIBCUFILE),
 };
 
 extern const struct opt_group *opt_group_from_mask(uint64_t *mask);