--- /dev/null
+/*
+ * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
+ * mailto:info@systemfabricworks.com
+ *
+ * License: GPLv2, see COPYING.
+ *
+ * libcufile engine
+ *
+ * fio I/O engine using the NVIDIA cuFile API.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <cufile.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <pthread.h>
+
+#include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
+#include "../lib/memalign.h"
+
+#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)
+
+#define LOGGED_BUFLEN_NOT_ALIGNED 0x01
+#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
+#define GPU_ID_SEP ":"
+
+enum {
+ IO_CUFILE = 1,
+ IO_POSIX = 2
+};
+
+struct libcufile_options {
+ struct thread_data *td;
+ char *gpu_ids; /* colon-separated list of GPU ids,
+ one per job */
+ void *cu_mem_ptr; /* GPU memory */
+ void *junk_buf; /* buffer to simulate cudaMemcpy with
+ posix I/O write */
+ int my_gpu_id; /* GPU id to use for this job */
+ unsigned int cuda_io; /* Type of I/O to use with CUDA */
+ size_t total_mem; /* size for cu_mem_ptr and junk_buf */
+ int logged; /* bitmask of log messages that have
+ been output, prevent flood */
+};
+
+struct fio_libcufile_data {
+ CUfileDescr_t cf_descr;
+ CUfileHandle_t cf_handle;
+};
+
+static struct fio_option options[] = {
+ {
+ .name = "gpu_dev_ids",
+ .lname = "libcufile engine gpu dev ids",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct libcufile_options, gpu_ids),
+ .help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = "cuda_io",
+ .lname = "libcufile cuda io",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct libcufile_options, cuda_io),
+ .help = "Type of I/O to use with CUDA",
+ .def = "cufile",
+ .posval = {
+ { .ival = "cufile",
+ .oval = IO_CUFILE,
+ .help = "libcufile nvidia-fs"
+ },
+ { .ival = "posix",
+ .oval = IO_POSIX,
+ .help = "POSIX I/O"
+ }
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static int running = 0;
+static int cufile_initialized = 0;
+static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
+
+#define check_cudaruntimecall(fn, rc) \
+ do { \
+ cudaError_t res = fn; \
+ if (res != cudaSuccess) { \
+ const char *str = cudaGetErrorName(res); \
+ log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
+ #fn, __LINE__, res, str); \
+ rc = -1; \
+ } else \
+ rc = 0; \
+ } while(0)
+
+static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
+{
+ if (IS_CUFILE_ERR(st.err))
+ return cufileop_status_error(st.err);
+ return "unknown";
+}
+
+/*
+ * Assign GPU to subjob roundrobin, similar to how multiple
+ * entries in 'directory' are handled by fio.
+ */
+static int fio_libcufile_find_gpu_id(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+ int gpu_id = 0;
+
+ if (o->gpu_ids != NULL) {
+ char *gpu_ids, *pos, *cur;
+ int i, id_count, gpu_idx;
+
+ for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
+ cur = strchr(cur, GPU_ID_SEP[0]);
+ if (cur != NULL)
+ cur++;
+ }
+
+ gpu_idx = td->subjob_number % id_count;
+
+ pos = gpu_ids = strdup(o->gpu_ids);
+ if (gpu_ids == NULL) {
+ log_err("strdup(gpu_ids): err=%d\n", errno);
+ return -1;
+ }
+
+ i = 0;
+ while (pos != NULL && i <= gpu_idx) {
+ i++;
+ cur = strsep(&pos, GPU_ID_SEP);
+ }
+
+ if (cur)
+ gpu_id = atoi(cur);
+
+ free(gpu_ids);
+ }
+
+ return gpu_id;
+}
+
+static int fio_libcufile_init(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+ CUfileError_t status;
+ int initialized;
+ int rc;
+
+ pthread_mutex_lock(&running_lock);
+ if (running == 0) {
+ assert(cufile_initialized == 0);
+ if (o->cuda_io == IO_CUFILE) {
+ /* only open the driver if this is the first worker thread */
+ status = cuFileDriverOpen();
+ if (status.err != CU_FILE_SUCCESS)
+ log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ else
+ cufile_initialized = 1;
+ }
+ }
+ running++;
+ initialized = cufile_initialized;
+ pthread_mutex_unlock(&running_lock);
+
+ if (o->cuda_io == IO_CUFILE && !initialized)
+ return 1;
+
+ o->my_gpu_id = fio_libcufile_find_gpu_id(td);
+ if (o->my_gpu_id < 0)
+ return 1;
+
+ dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
+ check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
+ if (rc != 0)
+ return 1;
+
+ return 0;
+}
+
+static inline int fio_libcufile_pre_write(struct thread_data *td,
+ struct libcufile_options *o,
+ struct io_u *io_u,
+ size_t gpu_offset)
+{
+ int rc = 0;
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (td->o.verify) {
+ /*
+ Data is being verified, copy the io_u buffer to GPU memory.
+ This isn't done in the non-verify case because the data would
+ already be in GPU memory in a normal cuFile application.
+ */
+ check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buf,
+ io_u->xfer_buflen,
+ cudaMemcpyHostToDevice), rc);
+ if (rc != 0) {
+ log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
+ io_u->error = EIO;
+ }
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+
+ /*
+ POSIX I/O is being used, the data has to be copied out of the
+ GPU into a CPU buffer. GPU memory doesn't contain the actual
+ data to write, copy the data to the junk buffer. The purpose
+ of this is to add the overhead of cudaMemcpy() that would be
+ present in a POSIX I/O CUDA application.
+ */
+ check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
+ ((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buflen,
+ cudaMemcpyDeviceToHost), rc);
+ if (rc != 0) {
+ log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
+ io_u->error = EIO;
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ assert(0);
+ rc = EINVAL;
+ }
+
+ return rc;
+}
+
+static inline int fio_libcufile_post_read(struct thread_data *td,
+ struct libcufile_options *o,
+ struct io_u *io_u,
+ size_t gpu_offset)
+{
+ int rc = 0;
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (td->o.verify) {
+ /* Copy GPU memory to CPU buffer for verify */
+ check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
+ ((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buflen,
+ cudaMemcpyDeviceToHost), rc);
+ if (rc != 0) {
+ log_err("DDIR_READ cudaMemcpy D2H failed\n");
+ io_u->error = EIO;
+ }
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ /* POSIX I/O read, copy the CPU buffer to GPU memory */
+ check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+ io_u->xfer_buf,
+ io_u->xfer_buflen,
+ cudaMemcpyHostToDevice), rc);
+ if (rc != 0) {
+ log_err("DDIR_READ cudaMemcpy H2D failed\n");
+ io_u->error = EIO;
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ assert(0);
+ rc = EINVAL;
+ }
+
+ return rc;
+}
+
+static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct libcufile_options *o = td->eo;
+ struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
+ unsigned long long io_offset;
+ ssize_t sz;
+ ssize_t remaining;
+ size_t xfered;
+ size_t gpu_offset;
+ int rc;
+
+ if (o->cuda_io == IO_CUFILE && fcd == NULL) {
+ io_u->error = EINVAL;
+ td_verror(td, EINVAL, "xfer");
+ return FIO_Q_COMPLETED;
+ }
+
+ fio_ro_check(td, io_u);
+
+ switch(io_u->ddir) {
+ case DDIR_SYNC:
+ rc = fsync(io_u->file->fd);
+ if (rc != 0) {
+ io_u->error = errno;
+ log_err("fsync: err=%d\n", errno);
+ }
+ break;
+
+ case DDIR_DATASYNC:
+ rc = fdatasync(io_u->file->fd);
+ if (rc != 0) {
+ io_u->error = errno;
+ log_err("fdatasync: err=%d\n", errno);
+ }
+ break;
+
+ case DDIR_READ:
+ case DDIR_WRITE:
+ /*
+ There may be a better way to calculate gpu_offset. The intent is
+ that gpu_offset equals the the difference between io_u->xfer_buf and
+ the page-aligned base address for io_u buffers.
+ */
+ gpu_offset = io_u->index * io_u->xfer_buflen;
+ io_offset = io_u->offset;
+ remaining = io_u->xfer_buflen;
+
+ xfered = 0;
+ sz = 0;
+
+ assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
+
+ if (o->cuda_io == IO_CUFILE) {
+ if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
+ (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
+ log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
+ o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
+ }
+
+ if (!(ALIGNED_4KB(gpu_offset) ||
+ (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
+ log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
+ o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
+ }
+ }
+
+ if (io_u->ddir == DDIR_WRITE)
+ rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
+
+ if (io_u->error != 0)
+ break;
+
+ while (remaining > 0) {
+ assert(gpu_offset + xfered <= o->total_mem);
+ if (io_u->ddir == DDIR_READ) {
+ if (o->cuda_io == IO_CUFILE) {
+ sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
+ io_offset + xfered, gpu_offset + xfered);
+ if (sz == -1) {
+ io_u->error = errno;
+ log_err("cuFileRead: err=%d\n", errno);
+ } else if (sz < 0) {
+ io_u->error = EIO;
+ log_err("cuFileRead: err=%ld:%s\n", sz,
+ cufileop_status_error(-sz));
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
+ remaining, io_offset + xfered);
+ if (sz < 0) {
+ io_u->error = errno;
+ log_err("pread: err=%d\n", errno);
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ io_u->error = -1;
+ assert(0);
+ }
+ } else if (io_u->ddir == DDIR_WRITE) {
+ if (o->cuda_io == IO_CUFILE) {
+ sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
+ io_offset + xfered, gpu_offset + xfered);
+ if (sz == -1) {
+ io_u->error = errno;
+ log_err("cuFileWrite: err=%d\n", errno);
+ } else if (sz < 0) {
+ io_u->error = EIO;
+ log_err("cuFileWrite: err=%ld:%s\n", sz,
+ cufileop_status_error(-sz));
+ }
+ } else if (o->cuda_io == IO_POSIX) {
+ sz = pwrite(io_u->file->fd,
+ ((char*) io_u->xfer_buf) + xfered,
+ remaining, io_offset + xfered);
+ if (sz < 0) {
+ io_u->error = errno;
+ log_err("pwrite: err=%d\n", errno);
+ }
+ } else {
+ log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+ io_u->error = -1;
+ assert(0);
+ }
+ } else {
+ log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
+ io_u->error = -1;
+ assert(0);
+ break;
+ }
+
+ if (io_u->error != 0)
+ break;
+
+ remaining -= sz;
+ xfered += sz;
+
+ if (remaining != 0)
+ log_info("Incomplete %s: %ld bytes remaining\n",
+ io_u->ddir == DDIR_READ? "read" : "write", remaining);
+ }
+
+ if (io_u->error != 0)
+ break;
+
+ if (io_u->ddir == DDIR_READ)
+ rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
+ break;
+
+ default:
+ io_u->error = EINVAL;
+ break;
+ }
+
+ if (io_u->error != 0) {
+ log_err("IO failed\n");
+ td_verror(td, io_u->error, "xfer");
+ }
+
+ return FIO_Q_COMPLETED;
+}
+
+static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
+{
+ struct libcufile_options *o = td->eo;
+ struct fio_libcufile_data *fcd = NULL;
+ int rc;
+ CUfileError_t status;
+
+ rc = generic_open_file(td, f);
+ if (rc)
+ return rc;
+
+ if (o->cuda_io == IO_CUFILE) {
+ fcd = calloc(1, sizeof(*fcd));
+ if (fcd == NULL) {
+ rc = ENOMEM;
+ goto exit_err;
+ }
+
+ fcd->cf_descr.handle.fd = f->fd;
+ fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+ status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
+ if (status.err != CU_FILE_SUCCESS) {
+ log_err("cufile register: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ rc = EINVAL;
+ goto exit_err;
+ }
+ }
+
+ FILE_SET_ENG_DATA(f, fcd);
+ return 0;
+
+exit_err:
+ if (fcd) {
+ free(fcd);
+ fcd = NULL;
+ }
+ if (f) {
+ int rc2 = generic_close_file(td, f);
+ if (rc2)
+ log_err("generic_close_file: err=%d\n", rc2);
+ }
+ return rc;
+}
+
+static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
+{
+ struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
+ int rc;
+
+ if (fcd != NULL) {
+ cuFileHandleDeregister(fcd->cf_handle);
+ FILE_SET_ENG_DATA(f, NULL);
+ free(fcd);
+ }
+
+ rc = generic_close_file(td, f);
+
+ return rc;
+}
+
+static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+ struct libcufile_options *o = td->eo;
+ int rc;
+ CUfileError_t status;
+
+ o->total_mem = total_mem;
+ o->logged = 0;
+ o->cu_mem_ptr = NULL;
+ o->junk_buf = NULL;
+ td->orig_buffer = calloc(1, total_mem);
+ if (!td->orig_buffer) {
+ log_err("orig_buffer calloc failed: err=%d\n", errno);
+ goto exit_error;
+ }
+
+ if (o->cuda_io == IO_POSIX) {
+ o->junk_buf = calloc(1, total_mem);
+ if (o->junk_buf == NULL) {
+ log_err("junk_buf calloc failed: err=%d\n", errno);
+ goto exit_error;
+ }
+ }
+
+ dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
+ check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
+ if (rc != 0)
+ goto exit_error;
+ check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
+ if (rc != 0)
+ goto exit_error;
+
+ if (o->cuda_io == IO_CUFILE) {
+ status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
+ if (status.err != CU_FILE_SUCCESS) {
+ log_err("cuFileBufRegister: err=%d:%s\n", status.err,
+ fio_libcufile_get_cuda_error(status));
+ goto exit_error;
+ }
+ }
+
+ return 0;
+
+exit_error:
+ if (td->orig_buffer) {
+ free(td->orig_buffer);
+ td->orig_buffer = NULL;
+ }
+ if (o->junk_buf) {
+ free(o->junk_buf);
+ o->junk_buf = NULL;
+ }
+ if (o->cu_mem_ptr) {
+ cudaFree(o->cu_mem_ptr);
+ o->cu_mem_ptr = NULL;
+ }
+ return 1;
+}
+
+static void fio_libcufile_iomem_free(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+
+ if (o->junk_buf) {
+ free(o->junk_buf);
+ o->junk_buf = NULL;
+ }
+ if (o->cu_mem_ptr) {
+ if (o->cuda_io == IO_CUFILE)
+ cuFileBufDeregister(o->cu_mem_ptr);
+ cudaFree(o->cu_mem_ptr);
+ o->cu_mem_ptr = NULL;
+ }
+ if (td->orig_buffer) {
+ free(td->orig_buffer);
+ td->orig_buffer = NULL;
+ }
+}
+
+static void fio_libcufile_cleanup(struct thread_data *td)
+{
+ struct libcufile_options *o = td->eo;
+
+ pthread_mutex_lock(&running_lock);
+ running--;
+ assert(running >= 0);
+ if (running == 0) {
+ /* only close the driver if initialized and
+ this is the last worker thread */
+ if (o->cuda_io == IO_CUFILE && cufile_initialized)
+ cuFileDriverClose();
+ cufile_initialized = 0;
+ }
+ pthread_mutex_unlock(&running_lock);
+}
+
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "libcufile",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_libcufile_init,
+ .queue = fio_libcufile_queue,
+ .open_file = fio_libcufile_open_file,
+ .close_file = fio_libcufile_close_file,
+ .iomem_alloc = fio_libcufile_iomem_alloc,
+ .iomem_free = fio_libcufile_iomem_free,
+ .cleanup = fio_libcufile_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = options,
+ .option_struct_size = sizeof(struct libcufile_options)
+};
+
+void fio_init fio_libcufile_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+void fio_exit fio_libcufile_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}