ioengine: Add libcufile I/O engine
[fio.git] / engines / libcufile.c
diff --git a/engines/libcufile.c b/engines/libcufile.c
new file mode 100644 (file)
index 0000000..e575b78
--- /dev/null
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
+ * mailto:info@systemfabricworks.com
+ *
+ * License: GPLv2, see COPYING.
+ *
+ * libcufile engine
+ *
+ * fio I/O engine using the NVIDIA cuFile API.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <cufile.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <pthread.h>
+
+#include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
+#include "../lib/memalign.h"
+
/* True when v sits on a 4 KiB boundary (cuFile's preferred alignment). */
#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

/* Bits in libcufile_options.logged: each alignment warning is printed once
   per job to avoid flooding the log. */
#define LOGGED_BUFLEN_NOT_ALIGNED     0x01
#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
/* Separator between entries in the gpu_dev_ids option string. */
#define GPU_ID_SEP ":"

/* Values of the cuda_io option: transfer through cuFile or plain POSIX I/O. */
enum {
	IO_CUFILE    = 1,
	IO_POSIX     = 2
};
+
/*
 * Per-job engine options (td->eo). gpu_ids/cuda_io come from the option
 * parser; the remaining fields are filled in by init/iomem_alloc.
 */
struct libcufile_options {
	struct thread_data *td;
	char               *gpu_ids;       /* colon-separated list of GPU ids,
					      one per job */
	void               *cu_mem_ptr;    /* GPU memory */
	void               *junk_buf;      /* buffer to simulate cudaMemcpy with
					      posix I/O write */
	int                 my_gpu_id;     /* GPU id to use for this job */
	unsigned int        cuda_io;       /* Type of I/O to use with CUDA
					      (IO_CUFILE or IO_POSIX) */
	size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
	int                 logged;        /* bitmask of log messages that have
					      been output, prevent flood */
};
+
/* Per-file engine data: cuFile descriptor/handle registered in open_file()
   (allocated only in cufile mode, stored via FILE_SET_ENG_DATA). */
struct fio_libcufile_data {
	CUfileDescr_t  cf_descr;
	CUfileHandle_t cf_handle;
};
+
/* Job-file options exposed by this engine: gpu_dev_ids and cuda_io. */
static struct fio_option options[] = {
	{
		.name     = "gpu_dev_ids",
		.lname    = "libcufile engine gpu dev ids",
		.type     = FIO_OPT_STR_STORE,
		.off1     = offsetof(struct libcufile_options, gpu_ids),
		.help     = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
		.category = FIO_OPT_C_ENGINE,
		.group    = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name     = "cuda_io",
		.lname    = "libcufile cuda io",
		.type     = FIO_OPT_STR,
		.off1     = offsetof(struct libcufile_options, cuda_io),
		.help     = "Type of I/O to use with CUDA",
		.def      = "cufile",
		.posval   = {
			    { .ival = "cufile",
			      .oval = IO_CUFILE,
			      .help = "libcufile nvidia-fs"
			    },
			    { .ival = "posix",
			      .oval = IO_POSIX,
			      .help = "POSIX I/O"
			    }
		},
		.category = FIO_OPT_C_ENGINE,
		.group    = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name    = NULL,
	},
};
+
/* Count of live worker threads and whether cuFileDriverOpen() succeeded;
   both are shared across jobs and protected by running_lock. */
static int running = 0;
static int cufile_initialized = 0;
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

/* Invoke a CUDA runtime call, log any failure with its error name, and
   set rc to 0 on success or -1 on failure. */
#define check_cudaruntimecall(fn, rc)                                               \
	do {                                                                        \
		cudaError_t res = fn;                                               \
		if (res != cudaSuccess) {                                           \
			const char *str = cudaGetErrorName(res);                    \
			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
				#fn, __LINE__, res, str);                           \
			rc = -1;                                                    \
		} else                                                              \
			rc = 0;                                                     \
	} while(0)
+
+static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
+{
+       if (IS_CUFILE_ERR(st.err))
+               return cufileop_status_error(st.err);
+       return "unknown";
+}
+
+/*
+ * Assign GPU to subjob roundrobin, similar to how multiple
+ * entries in 'directory' are handled by fio.
+ */
+static int fio_libcufile_find_gpu_id(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+       int gpu_id = 0;
+
+       if (o->gpu_ids != NULL) {
+               char *gpu_ids, *pos, *cur;
+               int i, id_count, gpu_idx;
+
+               for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
+                       cur = strchr(cur, GPU_ID_SEP[0]);
+                       if (cur != NULL)
+                               cur++;
+               }
+
+               gpu_idx = td->subjob_number % id_count;
+
+               pos = gpu_ids = strdup(o->gpu_ids);
+               if (gpu_ids == NULL) {
+                       log_err("strdup(gpu_ids): err=%d\n", errno);
+                       return -1;
+               }
+
+               i = 0;
+               while (pos != NULL && i <= gpu_idx) {
+                       i++;
+                       cur = strsep(&pos, GPU_ID_SEP);
+               }
+
+               if (cur)
+                       gpu_id = atoi(cur);
+
+               free(gpu_ids);
+       }
+
+       return gpu_id;
+}
+
/*
 * Per-thread engine init. The first worker to arrive opens the cuFile
 * driver (cufile mode only); 'running' and 'cufile_initialized' are
 * shared across all jobs and guarded by running_lock. Each job then
 * binds to its assigned GPU with cudaSetDevice().
 *
 * Returns 0 on success, 1 on failure.
 */
static int fio_libcufile_init(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
		assert(cufile_initialized == 0);
		if (o->cuda_io == IO_CUFILE) {
			/* only open the driver if this is the first worker thread */
			status = cuFileDriverOpen();
			if (status.err != CU_FILE_SUCCESS)
				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
					fio_libcufile_get_cuda_error(status));
			else
				cufile_initialized = 1;
		}
	}
	running++;
	/* snapshot the driver state while still holding the lock */
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	/* cufile mode cannot work without the driver */
	if (o->cuda_io == IO_CUFILE && !initialized)
		return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
		return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
		return 1;

	return 0;
}
+
+static inline int fio_libcufile_pre_write(struct thread_data *td,
+                                         struct libcufile_options *o,
+                                         struct io_u *io_u,
+                                         size_t gpu_offset)
+{
+       int rc = 0;
+
+       if (o->cuda_io == IO_CUFILE) {
+               if (td->o.verify) {
+                       /*
+                         Data is being verified, copy the io_u buffer to GPU memory.
+                         This isn't done in the non-verify case because the data would
+                         already be in GPU memory in a normal cuFile application.
+                       */
+                       check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+                                                        io_u->xfer_buf,
+                                                        io_u->xfer_buflen,
+                                                        cudaMemcpyHostToDevice), rc);
+                       if (rc != 0) {
+                               log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
+                               io_u->error = EIO;
+                       }
+               }
+       } else if (o->cuda_io == IO_POSIX) {
+
+               /*
+                 POSIX I/O is being used, the data has to be copied out of the
+                 GPU into a CPU buffer. GPU memory doesn't contain the actual
+                 data to write, copy the data to the junk buffer. The purpose
+                 of this is to add the overhead of cudaMemcpy() that would be
+                 present in a POSIX I/O CUDA application.
+               */
+               check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
+                                                ((char*) o->cu_mem_ptr) + gpu_offset,
+                                                io_u->xfer_buflen,
+                                                cudaMemcpyDeviceToHost), rc);
+               if (rc != 0) {
+                       log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
+                       io_u->error = EIO;
+               }
+       } else {
+               log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+               assert(0);
+               rc = EINVAL;
+       }
+
+       return rc;
+}
+
+static inline int fio_libcufile_post_read(struct thread_data *td,
+                                         struct libcufile_options *o,
+                                         struct io_u *io_u,
+                                         size_t gpu_offset)
+{
+       int rc = 0;
+
+       if (o->cuda_io == IO_CUFILE) {
+               if (td->o.verify) {
+                       /* Copy GPU memory to CPU buffer for verify */
+                       check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
+                                                        ((char*) o->cu_mem_ptr) + gpu_offset,
+                                                        io_u->xfer_buflen,
+                                                        cudaMemcpyDeviceToHost), rc);
+                       if (rc != 0) {
+                               log_err("DDIR_READ cudaMemcpy D2H failed\n");
+                               io_u->error = EIO;
+                       }
+               }
+       } else if (o->cuda_io == IO_POSIX) {
+               /* POSIX I/O read, copy the CPU buffer to GPU memory */
+               check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+                                                io_u->xfer_buf,
+                                                io_u->xfer_buflen,
+                                                cudaMemcpyHostToDevice), rc);
+               if (rc != 0) {
+                       log_err("DDIR_READ cudaMemcpy H2D failed\n");
+                       io_u->error = EIO;
+               }
+       } else {
+               log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+               assert(0);
+               rc = EINVAL;
+       }
+
+       return rc;
+}
+
+static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
+                                            struct io_u *io_u)
+{
+       struct libcufile_options *o = td->eo;
+       struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
+       unsigned long long io_offset;
+       ssize_t sz;
+       ssize_t remaining;
+       size_t xfered;
+       size_t gpu_offset;
+       int rc;
+
+       if (o->cuda_io == IO_CUFILE && fcd == NULL) {
+               io_u->error = EINVAL;
+               td_verror(td, EINVAL, "xfer");
+               return FIO_Q_COMPLETED;
+       }
+
+       fio_ro_check(td, io_u);
+
+       switch(io_u->ddir) {
+       case DDIR_SYNC:
+               rc = fsync(io_u->file->fd);
+               if (rc != 0) {
+                       io_u->error = errno;
+                       log_err("fsync: err=%d\n", errno);
+               }
+               break;
+
+       case DDIR_DATASYNC:
+               rc = fdatasync(io_u->file->fd);
+               if (rc != 0) {
+                       io_u->error = errno;
+                       log_err("fdatasync: err=%d\n", errno);
+               }
+               break;
+
+       case DDIR_READ:
+       case DDIR_WRITE:
+               /*
+                 There may be a better way to calculate gpu_offset. The intent is
+                 that gpu_offset equals the the difference between io_u->xfer_buf and
+                 the page-aligned base address for io_u buffers.
+               */
+               gpu_offset = io_u->index * io_u->xfer_buflen;
+               io_offset = io_u->offset;
+               remaining = io_u->xfer_buflen;
+
+               xfered = 0;
+               sz = 0;
+
+               assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
+
+               if (o->cuda_io == IO_CUFILE) {
+                       if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
+                             (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
+                               log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
+                               o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
+                       }
+
+                       if (!(ALIGNED_4KB(gpu_offset) ||
+                             (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
+                               log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
+                               o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
+                       }
+               }
+
+               if (io_u->ddir == DDIR_WRITE)
+                       rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
+
+               if (io_u->error != 0)
+                       break;
+
+               while (remaining > 0) {
+                       assert(gpu_offset + xfered <= o->total_mem);
+                       if (io_u->ddir == DDIR_READ) {
+                               if (o->cuda_io == IO_CUFILE) {
+                                       sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
+                                                       io_offset + xfered, gpu_offset + xfered);
+                                       if (sz == -1) {
+                                               io_u->error = errno;
+                                               log_err("cuFileRead: err=%d\n", errno);
+                                       } else if (sz < 0) {
+                                               io_u->error = EIO;
+                                               log_err("cuFileRead: err=%ld:%s\n", sz,
+                                                       cufileop_status_error(-sz));
+                                       }
+                               } else if (o->cuda_io == IO_POSIX) {
+                                       sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
+                                                  remaining, io_offset + xfered);
+                                       if (sz < 0) {
+                                               io_u->error = errno;
+                                               log_err("pread: err=%d\n", errno);
+                                       }
+                               } else {
+                                       log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+                                       io_u->error = -1;
+                                       assert(0);
+                               }
+                       } else if (io_u->ddir == DDIR_WRITE) {
+                               if (o->cuda_io == IO_CUFILE) {
+                                       sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
+                                                        io_offset + xfered, gpu_offset + xfered);
+                                       if (sz == -1) {
+                                               io_u->error = errno;
+                                               log_err("cuFileWrite: err=%d\n", errno);
+                                       } else if (sz < 0) {
+                                               io_u->error = EIO;
+                                               log_err("cuFileWrite: err=%ld:%s\n", sz,
+                                                       cufileop_status_error(-sz));
+                                       }
+                               } else if (o->cuda_io == IO_POSIX) {
+                                       sz = pwrite(io_u->file->fd,
+                                                   ((char*) io_u->xfer_buf) + xfered,
+                                                   remaining, io_offset + xfered);
+                                       if (sz < 0) {
+                                               io_u->error = errno;
+                                               log_err("pwrite: err=%d\n", errno);
+                                       }
+                               } else {
+                                       log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+                                       io_u->error = -1;
+                                       assert(0);
+                               }
+                       } else {
+                               log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
+                               io_u->error = -1;
+                               assert(0);
+                               break;
+                       }
+
+                       if (io_u->error != 0)
+                               break;
+
+                       remaining -= sz;
+                       xfered += sz;
+
+                       if (remaining != 0)
+                               log_info("Incomplete %s: %ld bytes remaining\n",
+                                        io_u->ddir == DDIR_READ? "read" : "write", remaining);
+               }
+
+               if (io_u->error != 0)
+                       break;
+
+               if (io_u->ddir == DDIR_READ)
+                       rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
+               break;
+
+       default:
+               io_u->error = EINVAL;
+               break;
+       }
+
+       if (io_u->error != 0) {
+               log_err("IO failed\n");
+               td_verror(td, io_u->error, "xfer");
+       }
+
+       return FIO_Q_COMPLETED;
+}
+
+static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
+{
+       struct libcufile_options *o = td->eo;
+       struct fio_libcufile_data *fcd = NULL;
+       int rc;
+       CUfileError_t status;
+
+       rc = generic_open_file(td, f);
+       if (rc)
+               return rc;
+
+       if (o->cuda_io == IO_CUFILE) {
+               fcd = calloc(1, sizeof(*fcd));
+               if (fcd == NULL) {
+                       rc = ENOMEM;
+                       goto exit_err;
+               }
+
+               fcd->cf_descr.handle.fd = f->fd;
+               fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+               status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
+               if (status.err != CU_FILE_SUCCESS) {
+                       log_err("cufile register: err=%d:%s\n", status.err,
+                               fio_libcufile_get_cuda_error(status));
+                       rc = EINVAL;
+                       goto exit_err;
+               }
+       }
+
+       FILE_SET_ENG_DATA(f, fcd);
+       return 0;
+
+exit_err:
+       if (fcd) {
+               free(fcd);
+               fcd = NULL;
+       }
+       if (f) {
+               int rc2 = generic_close_file(td, f);
+               if (rc2)
+                       log_err("generic_close_file: err=%d\n", rc2);
+       }
+       return rc;
+}
+
+static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
+{
+       struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
+       int rc;
+
+       if (fcd != NULL) {
+               cuFileHandleDeregister(fcd->cf_handle);
+               FILE_SET_ENG_DATA(f, NULL);
+               free(fcd);
+       }
+
+       rc = generic_close_file(td, f);
+
+       return rc;
+}
+
/*
 * Allocate the job's I/O memory: the host buffer fio uses
 * (td->orig_buffer), a GPU buffer of the same size, and in posix mode an
 * extra host "junk" buffer used to emulate device staging. In cufile
 * mode the GPU buffer is registered with cuFileBufRegister.
 *
 * Returns 0 on success, 1 on failure; all partial allocations are freed
 * on the error path.
 */
static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
		log_err("orig_buffer calloc failed: err=%d\n", errno);
		goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
		/* posix mode only: staging buffer for the simulated D2H copy */
		o->junk_buf = calloc(1, total_mem);
		if (o->junk_buf == NULL) {
			log_err("junk_buf calloc failed: err=%d\n", errno);
			goto exit_error;
		}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
		goto exit_error;
	/* fill device memory with a known pattern */
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
		goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			goto exit_error;
		}
	}

	return 0;

exit_error:
	/* undo whatever succeeded so far; pointers are NULLed so that
	   iomem_free() won't touch them again */
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	return 1;
}
+
+static void fio_libcufile_iomem_free(struct thread_data *td)
+{
+       struct libcufile_options *o = td->eo;
+
+       if (o->junk_buf) {
+               free(o->junk_buf);
+               o->junk_buf = NULL;
+       }
+       if (o->cu_mem_ptr) {
+               if (o->cuda_io == IO_CUFILE)
+                       cuFileBufDeregister(o->cu_mem_ptr);
+               cudaFree(o->cu_mem_ptr);
+               o->cu_mem_ptr = NULL;
+       }
+       if (td->orig_buffer) {
+               free(td->orig_buffer);
+               td->orig_buffer = NULL;
+       }
+}
+
/*
 * Per-thread teardown, mirror of fio_libcufile_init(): the last worker
 * to leave closes the cuFile driver if it was opened. The shared
 * 'running'/'cufile_initialized' state is guarded by running_lock.
 */
static void fio_libcufile_cleanup(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	pthread_mutex_lock(&running_lock);
	running--;
	assert(running >= 0);
	if (running == 0) {
		/* only close the driver if initialized and
		   this is the last worker thread */
		if (o->cuda_io == IO_CUFILE && cufile_initialized)
			cuFileDriverClose();
		cufile_initialized = 0;
	}
	pthread_mutex_unlock(&running_lock);
}
+
/* Engine descriptor. FIO_SYNCIO: queue() completes every request inline,
   so no event/getevents hooks are needed. */
FIO_STATIC struct ioengine_ops ioengine = {
	.name                = "libcufile",
	.version             = FIO_IOOPS_VERSION,
	.init                = fio_libcufile_init,
	.queue               = fio_libcufile_queue,
	.open_file           = fio_libcufile_open_file,
	.close_file          = fio_libcufile_close_file,
	.iomem_alloc         = fio_libcufile_iomem_alloc,
	.iomem_free          = fio_libcufile_iomem_free,
	.cleanup             = fio_libcufile_cleanup,
	.flags               = FIO_SYNCIO,
	.options             = options,
	.option_struct_size  = sizeof(struct libcufile_options)
};
+
/* Constructor: register the engine with fio when the module is loaded. */
void fio_init fio_libcufile_register(void)
{
	register_ioengine(&ioengine);
}
+
/* Destructor: remove the engine when the module is unloaded. */
void fio_exit fio_libcufile_unregister(void)
{
	unregister_ioengine(&ioengine);
}