[fio.git] / engines / libcufile.c

/*
 * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
 * mailto:info@systemfabricworks.com
 *
 * License: GPLv2, see COPYING.
 *
 * libcufile engine
 *
 * fio I/O engine using the NVIDIA cuFile API.
 *
 */

#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <cufile.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <pthread.h>

#include "../fio.h"
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"

#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

#define LOGGED_BUFLEN_NOT_ALIGNED     0x01
#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
#define GPU_ID_SEP ":"

enum {
	IO_CUFILE    = 1,
	IO_POSIX     = 2
};

struct libcufile_options {
	struct thread_data *td;
	char               *gpu_ids;       /* colon-separated list of GPU ids,
					      one per job */
	void               *cu_mem_ptr;    /* GPU memory */
	void               *junk_buf;      /* buffer to simulate cudaMemcpy with
					      posix I/O write */
	int                 my_gpu_id;     /* GPU id to use for this job */
	unsigned int        cuda_io;       /* Type of I/O to use with CUDA */
	size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
	int                 logged;        /* bitmask of log messages that have
					      been output, prevent flood */
};

struct fio_libcufile_data {
	CUfileDescr_t  cf_descr;
	CUfileHandle_t cf_handle;
};

static struct fio_option options[] = {
	{
		.name	  = "gpu_dev_ids",
		.lname	  = "libcufile engine gpu dev ids",
		.type	  = FIO_OPT_STR_STORE,
		.off1	  = offsetof(struct libcufile_options, gpu_ids),
		.help	  = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
		.category = FIO_OPT_C_ENGINE,
		.group	  = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	  = "cuda_io",
		.lname	  = "libcufile cuda io",
		.type	  = FIO_OPT_STR,
		.off1	  = offsetof(struct libcufile_options, cuda_io),
		.help	  = "Type of I/O to use with CUDA",
		.def      = "cufile",
		.posval   = {
			    { .ival = "cufile",
			      .oval = IO_CUFILE,
			      .help = "libcufile nvidia-fs"
			    },
			    { .ival = "posix",
			      .oval = IO_POSIX,
			      .help = "POSIX I/O"
			    }
		},
		.category = FIO_OPT_C_ENGINE,
		.group	  = FIO_OPT_G_LIBCUFILE,
	},
	{
		.name	 = NULL,
	},
};

static int running = 0;
static int cufile_initialized = 0;
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;

#define check_cudaruntimecall(fn, rc)                                               \
	do {                                                                        \
		cudaError_t res = fn;                                               \
		if (res != cudaSuccess) {                                           \
			const char *str = cudaGetErrorName(res);                    \
			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
				#fn, __LINE__, res, str);                           \
			rc = -1;                                                    \
		} else                                                              \
			rc = 0;                                                     \
	} while(0)

static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
{
	if (IS_CUFILE_ERR(st.err))
		return cufileop_status_error(st.err);
	return "unknown";
}

/*
 * Assign GPU to subjob roundrobin, similar to how multiple
 * entries in 'directory' are handled by fio.
 */
static int fio_libcufile_find_gpu_id(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	int gpu_id = 0;

	if (o->gpu_ids != NULL) {
		char *gpu_ids, *pos, *cur;
		int i, id_count, gpu_idx;

		for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
			cur = strchr(cur, GPU_ID_SEP[0]);
			if (cur != NULL)
				cur++;
		}

		gpu_idx = td->subjob_number % id_count;

		pos = gpu_ids = strdup(o->gpu_ids);
		if (gpu_ids == NULL) {
			log_err("strdup(gpu_ids): err=%d\n", errno);
			return -1;
		}

		i = 0;
		while (pos != NULL && i <= gpu_idx) {
			i++;
			cur = strsep(&pos, GPU_ID_SEP);
		}

		if (cur)
			gpu_id = atoi(cur);

		free(gpu_ids);
	}

	return gpu_id;
}

static int fio_libcufile_init(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
		assert(cufile_initialized == 0);
		if (o->cuda_io == IO_CUFILE) {
			/* only open the driver if this is the first worker thread */
			status = cuFileDriverOpen();
			if (status.err != CU_FILE_SUCCESS)
				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
					fio_libcufile_get_cuda_error(status));
			else
				cufile_initialized = 1;
		}
	}
	running++;
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	if (o->cuda_io == IO_CUFILE && !initialized)
		return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
		return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
		return 1;

	return 0;
}

static inline int fio_libcufile_pre_write(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
		if (td->o.verify) {
			/*
			  Data is being verified, copy the io_u buffer to GPU memory.
			  This isn't done in the non-verify case because the data would
			  already be in GPU memory in a normal cuFile application.
			*/
			check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
							 io_u->xfer_buf,
							 io_u->xfer_buflen,
							 cudaMemcpyHostToDevice), rc);
			if (rc != 0) {
				log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
				io_u->error = EIO;
			}
		}
	} else if (o->cuda_io == IO_POSIX) {

		/*
		  POSIX I/O is being used, the data has to be copied out of the
		  GPU into a CPU buffer. GPU memory doesn't contain the actual
		  data to write, copy the data to the junk buffer. The purpose
		  of this is to add the overhead of cudaMemcpy() that would be
		  present in a POSIX I/O CUDA application.
		*/
		check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
						 ((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buflen,
						 cudaMemcpyDeviceToHost), rc);
		if (rc != 0) {
			log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
			io_u->error = EIO;
		}
	} else {
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
	}

	return rc;
}

static inline int fio_libcufile_post_read(struct thread_data *td,
					  struct libcufile_options *o,
					  struct io_u *io_u,
					  size_t gpu_offset)
{
	int rc = 0;

	if (o->cuda_io == IO_CUFILE) {
		if (td->o.verify) {
			/* Copy GPU memory to CPU buffer for verify */
			check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
							 ((char*) o->cu_mem_ptr) + gpu_offset,
							 io_u->xfer_buflen,
							 cudaMemcpyDeviceToHost), rc);
			if (rc != 0) {
				log_err("DDIR_READ cudaMemcpy D2H failed\n");
				io_u->error = EIO;
			}
		}
	} else if (o->cuda_io == IO_POSIX) {
		/* POSIX I/O read, copy the CPU buffer to GPU memory */
		check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
						 io_u->xfer_buf,
						 io_u->xfer_buflen,
						 cudaMemcpyHostToDevice), rc);
		if (rc != 0) {
			log_err("DDIR_READ cudaMemcpy H2D failed\n");
			io_u->error = EIO;
		}
	} else {
		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
		assert(0);
		rc = EINVAL;
	}

	return rc;
}

static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
					     struct io_u *io_u)
{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
	unsigned long long io_offset;
	ssize_t sz;
	ssize_t remaining;
	size_t xfered;
	size_t gpu_offset;
	int rc;

	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
		io_u->error = EINVAL;
		td_verror(td, EINVAL, "xfer");
		return FIO_Q_COMPLETED;
	}

	fio_ro_check(td, io_u);

	switch(io_u->ddir) {
	case DDIR_SYNC:
		rc = fsync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fsync: err=%d\n", errno);
		}
		break;

	case DDIR_DATASYNC:
		rc = fdatasync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fdatasync: err=%d\n", errno);
		}
		break;

	case DDIR_READ:
	case DDIR_WRITE:
		/*
		  There may be a better way to calculate gpu_offset. The intent is
		  that gpu_offset equals the the difference between io_u->xfer_buf and
		  the page-aligned base address for io_u buffers.
		*/
		gpu_offset = io_u->index * io_u->xfer_buflen;
		io_offset = io_u->offset;
		remaining = io_u->xfer_buflen;

		xfered = 0;
		sz = 0;

		assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);

		if (o->cuda_io == IO_CUFILE) {
			if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
			      (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
				log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
				o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
			}

			if (!(ALIGNED_4KB(gpu_offset) ||
			      (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
				log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
				o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
			}
		}

		if (io_u->ddir == DDIR_WRITE)
			rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);

		if (io_u->error != 0)
			break;

		while (remaining > 0) {
			assert(gpu_offset + xfered <= o->total_mem);
			if (io_u->ddir == DDIR_READ) {
				if (o->cuda_io == IO_CUFILE) {
					sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
							io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileRead: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileRead: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
						   remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pread: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else if (io_u->ddir == DDIR_WRITE) {
				if (o->cuda_io == IO_CUFILE) {
					sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
							 io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileWrite: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileWrite: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pwrite(io_u->file->fd,
						    ((char*) io_u->xfer_buf) + xfered,
						    remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pwrite: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else {
				log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
				io_u->error = -1;
				assert(0);
				break;
			}

			if (io_u->error != 0)
				break;

			remaining -= sz;
			xfered += sz;

			if (remaining != 0)
				log_info("Incomplete %s: %ld bytes remaining\n",
					 io_u->ddir == DDIR_READ? "read" : "write", remaining);
		}

		if (io_u->error != 0)
			break;

		if (io_u->ddir == DDIR_READ)
			rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
		break;

	default:
		io_u->error = EINVAL;
		break;
	}

	if (io_u->error != 0) {
		log_err("IO failed\n");
		td_verror(td, io_u->error, "xfer");
	}

	return FIO_Q_COMPLETED;
}

static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = NULL;
	int rc;
	CUfileError_t status;

	rc = generic_open_file(td, f);
	if (rc)
		return rc;

	if (o->cuda_io == IO_CUFILE) {
		fcd = calloc(1, sizeof(*fcd));
		if (fcd == NULL) {
			rc = ENOMEM;
			goto exit_err;
		}

		fcd->cf_descr.handle.fd = f->fd;
		fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
		status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cufile register: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			rc = EINVAL;
			goto exit_err;
		}
	}

	FILE_SET_ENG_DATA(f, fcd);
	return 0;

exit_err:
	if (fcd) {
		free(fcd);
		fcd = NULL;
	}
	if (f) {
		int rc2 = generic_close_file(td, f);
		if (rc2)
			log_err("generic_close_file: err=%d\n", rc2);
	}
	return rc;
}

static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
	int rc;

	if (fcd != NULL) {
		cuFileHandleDeregister(fcd->cf_handle);
		FILE_SET_ENG_DATA(f, NULL);
		free(fcd);
	}

	rc = generic_close_file(td, f);

	return rc;
}

static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
		log_err("orig_buffer calloc failed: err=%d\n", errno);
		goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
		o->junk_buf = calloc(1, total_mem);
		if (o->junk_buf == NULL) {
			log_err("junk_buf calloc failed: err=%d\n", errno);
			goto exit_error;
		}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
		goto exit_error;
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
		goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			goto exit_error;
		}
	}

	return 0;

exit_error:
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	return 1;
}

static void fio_libcufile_iomem_free(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		if (o->cuda_io == IO_CUFILE)
			cuFileBufDeregister(o->cu_mem_ptr);
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
}

static void fio_libcufile_cleanup(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	pthread_mutex_lock(&running_lock);
	running--;
	assert(running >= 0);
	if (running == 0) {
		/* only close the driver if initialized and
		   this is the last worker thread */
		if (o->cuda_io == IO_CUFILE && cufile_initialized)
			cuFileDriverClose();
		cufile_initialized = 0;
	}
	pthread_mutex_unlock(&running_lock);
}

FIO_STATIC struct ioengine_ops ioengine = {
	.name                = "libcufile",
	.version             = FIO_IOOPS_VERSION,
	.init                = fio_libcufile_init,
	.queue               = fio_libcufile_queue,
	.get_file_size       = generic_get_file_size,
	.open_file           = fio_libcufile_open_file,
	.close_file          = fio_libcufile_close_file,
	.iomem_alloc         = fio_libcufile_iomem_alloc,
	.iomem_free          = fio_libcufile_iomem_free,
	.cleanup             = fio_libcufile_cleanup,
	.flags               = FIO_SYNCIO,
	.options             = options,
	.option_struct_size  = sizeof(struct libcufile_options)
};

void fio_init fio_libcufile_register(void)
{
	register_ioengine(&ioengine);
}

void fio_exit fio_libcufile_unregister(void)
{
	unregister_ioengine(&ioengine);
}
Commit	Line	Data
	1	/*
	2	* Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
	3	* mailto:info@systemfabricworks.com
	4	*
	5	* License: GPLv2, see COPYING.
	6	*
	7	* libcufile engine
	8	*
	9	* fio I/O engine using the NVIDIA cuFile API.
	10	*
	11	*/
	12
	13	#include <stdlib.h>
	14	#include <unistd.h>
	15	#include <errno.h>
	16	#include <string.h>
	17	#include <sys/time.h>
	18	#include <sys/resource.h>
	19	#include <cufile.h>
	20	#include <cuda.h>
	21	#include <cuda_runtime.h>
	22	#include <pthread.h>
	23
	24	#include "../fio.h"
	25	#include "../lib/pow2.h"
	26	#include "../optgroup.h"
	27	#include "../lib/memalign.h"
	28
	29	#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)
	30
	31	#define LOGGED_BUFLEN_NOT_ALIGNED 0x01
	32	#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
	33	#define GPU_ID_SEP ":"
	34
	35	enum {
	36	IO_CUFILE = 1,
	37	IO_POSIX = 2
	38	};
	39
	40	struct libcufile_options {
	41	struct thread_data *td;
	42	char gpu_ids; / colon-separated list of GPU ids,
	43	one per job */
	44	void cu_mem_ptr; / GPU memory */
	45	void junk_buf; / buffer to simulate cudaMemcpy with
	46	posix I/O write */
	47	int my_gpu_id; /* GPU id to use for this job */
	48	unsigned int cuda_io; /* Type of I/O to use with CUDA */
	49	size_t total_mem; /* size for cu_mem_ptr and junk_buf */
	50	int logged; /* bitmask of log messages that have
	51	been output, prevent flood */
	52	};
	53
	54	struct fio_libcufile_data {
	55	CUfileDescr_t cf_descr;
	56	CUfileHandle_t cf_handle;
	57	};
	58
	59	static struct fio_option options[] = {
	60	{
	61	.name = "gpu_dev_ids",
	62	.lname = "libcufile engine gpu dev ids",
	63	.type = FIO_OPT_STR_STORE,
	64	.off1 = offsetof(struct libcufile_options, gpu_ids),
	65	.help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
	66	.category = FIO_OPT_C_ENGINE,
	67	.group = FIO_OPT_G_LIBCUFILE,
	68	},
	69	{
	70	.name = "cuda_io",
	71	.lname = "libcufile cuda io",
	72	.type = FIO_OPT_STR,
	73	.off1 = offsetof(struct libcufile_options, cuda_io),
	74	.help = "Type of I/O to use with CUDA",
	75	.def = "cufile",
	76	.posval = {
	77	{ .ival = "cufile",
	78	.oval = IO_CUFILE,
	79	.help = "libcufile nvidia-fs"
	80	},
	81	{ .ival = "posix",
	82	.oval = IO_POSIX,
	83	.help = "POSIX I/O"
	84	}
	85	},
	86	.category = FIO_OPT_C_ENGINE,
	87	.group = FIO_OPT_G_LIBCUFILE,
	88	},
	89	{
	90	.name = NULL,
	91	},
	92	};
	93
	94	static int running = 0;
	95	static int cufile_initialized = 0;
	96	static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
	97
	98	#define check_cudaruntimecall(fn, rc) \
	99	do { \
	100	cudaError_t res = fn; \
	101	if (res != cudaSuccess) { \
	102	const char *str = cudaGetErrorName(res); \
	103	log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
	104	#fn, __LINE__, res, str); \
	105	rc = -1; \
	106	} else \
	107	rc = 0; \
	108	} while(0)
	109
	110	static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
	111	{
	112	if (IS_CUFILE_ERR(st.err))
	113	return cufileop_status_error(st.err);
	114	return "unknown";
	115	}
	116
	117	/*
	118	* Assign GPU to subjob roundrobin, similar to how multiple
	119	* entries in 'directory' are handled by fio.
	120	*/
	121	static int fio_libcufile_find_gpu_id(struct thread_data *td)
	122	{
	123	struct libcufile_options *o = td->eo;
	124	int gpu_id = 0;
	125
	126	if (o->gpu_ids != NULL) {
	127	char gpu_ids, pos, *cur;
	128	int i, id_count, gpu_idx;
	129
	130	for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
	131	cur = strchr(cur, GPU_ID_SEP[0]);
	132	if (cur != NULL)
	133	cur++;
	134	}
	135
	136	gpu_idx = td->subjob_number % id_count;
	137
	138	pos = gpu_ids = strdup(o->gpu_ids);
	139	if (gpu_ids == NULL) {
	140	log_err("strdup(gpu_ids): err=%d\n", errno);
	141	return -1;
	142	}
	143
	144	i = 0;
	145	while (pos != NULL && i <= gpu_idx) {
	146	i++;
	147	cur = strsep(&pos, GPU_ID_SEP);
	148	}
	149
	150	if (cur)
	151	gpu_id = atoi(cur);
	152
	153	free(gpu_ids);
	154	}
	155
	156	return gpu_id;
	157	}
	158
	159	static int fio_libcufile_init(struct thread_data *td)
	160	{
	161	struct libcufile_options *o = td->eo;
	162	CUfileError_t status;
	163	int initialized;
	164	int rc;
	165
	166	pthread_mutex_lock(&running_lock);
	167	if (running == 0) {
	168	assert(cufile_initialized == 0);
	169	if (o->cuda_io == IO_CUFILE) {
	170	/* only open the driver if this is the first worker thread */
	171	status = cuFileDriverOpen();
	172	if (status.err != CU_FILE_SUCCESS)
	173	log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
	174	fio_libcufile_get_cuda_error(status));
	175	else
	176	cufile_initialized = 1;
	177	}
	178	}
	179	running++;
	180	initialized = cufile_initialized;
	181	pthread_mutex_unlock(&running_lock);
	182
	183	if (o->cuda_io == IO_CUFILE && !initialized)
	184	return 1;
	185
	186	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	187	if (o->my_gpu_id < 0)
	188	return 1;
	189
	190	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	191	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	192	if (rc != 0)
	193	return 1;
	194
	195	return 0;
	196	}
	197
	198	static inline int fio_libcufile_pre_write(struct thread_data *td,
	199	struct libcufile_options *o,
	200	struct io_u *io_u,
	201	size_t gpu_offset)
	202	{
	203	int rc = 0;
	204
	205	if (o->cuda_io == IO_CUFILE) {
	206	if (td->o.verify) {
	207	/*
	208	Data is being verified, copy the io_u buffer to GPU memory.
	209	This isn't done in the non-verify case because the data would
	210	already be in GPU memory in a normal cuFile application.
	211	*/
	212	check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
	213	io_u->xfer_buf,
	214	io_u->xfer_buflen,
	215	cudaMemcpyHostToDevice), rc);
	216	if (rc != 0) {
	217	log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
	218	io_u->error = EIO;
	219	}
	220	}
	221	} else if (o->cuda_io == IO_POSIX) {
	222
	223	/*
	224	POSIX I/O is being used, the data has to be copied out of the
	225	GPU into a CPU buffer. GPU memory doesn't contain the actual
	226	data to write, copy the data to the junk buffer. The purpose
	227	of this is to add the overhead of cudaMemcpy() that would be
	228	present in a POSIX I/O CUDA application.
	229	*/
	230	check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
	231	((char*) o->cu_mem_ptr) + gpu_offset,
	232	io_u->xfer_buflen,
	233	cudaMemcpyDeviceToHost), rc);
	234	if (rc != 0) {
	235	log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
	236	io_u->error = EIO;
	237	}
	238	} else {
	239	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	240	assert(0);
	241	rc = EINVAL;
	242	}
	243
	244	return rc;
	245	}
	246
	247	static inline int fio_libcufile_post_read(struct thread_data *td,
	248	struct libcufile_options *o,
	249	struct io_u *io_u,
	250	size_t gpu_offset)
	251	{
	252	int rc = 0;
	253
	254	if (o->cuda_io == IO_CUFILE) {
	255	if (td->o.verify) {
	256	/* Copy GPU memory to CPU buffer for verify */
	257	check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
	258	((char*) o->cu_mem_ptr) + gpu_offset,
	259	io_u->xfer_buflen,
	260	cudaMemcpyDeviceToHost), rc);
	261	if (rc != 0) {
	262	log_err("DDIR_READ cudaMemcpy D2H failed\n");
	263	io_u->error = EIO;
	264	}
	265	}
	266	} else if (o->cuda_io == IO_POSIX) {
	267	/* POSIX I/O read, copy the CPU buffer to GPU memory */
	268	check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
	269	io_u->xfer_buf,
	270	io_u->xfer_buflen,
	271	cudaMemcpyHostToDevice), rc);
	272	if (rc != 0) {
	273	log_err("DDIR_READ cudaMemcpy H2D failed\n");
	274	io_u->error = EIO;
	275	}
	276	} else {
	277	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	278	assert(0);
	279	rc = EINVAL;
	280	}
	281
	282	return rc;
	283	}
	284
	285	static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
	286	struct io_u *io_u)
	287	{
	288	struct libcufile_options *o = td->eo;
	289	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
	290	unsigned long long io_offset;
	291	ssize_t sz;
	292	ssize_t remaining;
	293	size_t xfered;
	294	size_t gpu_offset;
	295	int rc;
	296
	297	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
	298	io_u->error = EINVAL;
	299	td_verror(td, EINVAL, "xfer");
	300	return FIO_Q_COMPLETED;
	301	}
	302
	303	fio_ro_check(td, io_u);
	304
	305	switch(io_u->ddir) {
	306	case DDIR_SYNC:
	307	rc = fsync(io_u->file->fd);
	308	if (rc != 0) {
	309	io_u->error = errno;
	310	log_err("fsync: err=%d\n", errno);
	311	}
	312	break;
	313
	314	case DDIR_DATASYNC:
	315	rc = fdatasync(io_u->file->fd);
	316	if (rc != 0) {
	317	io_u->error = errno;
	318	log_err("fdatasync: err=%d\n", errno);
	319	}
	320	break;
	321
	322	case DDIR_READ:
	323	case DDIR_WRITE:
	324	/*
	325	There may be a better way to calculate gpu_offset. The intent is
	326	that gpu_offset equals the the difference between io_u->xfer_buf and
	327	the page-aligned base address for io_u buffers.
	328	*/
	329	gpu_offset = io_u->index * io_u->xfer_buflen;
	330	io_offset = io_u->offset;
	331	remaining = io_u->xfer_buflen;
	332
	333	xfered = 0;
	334	sz = 0;
	335
	336	assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
	337
	338	if (o->cuda_io == IO_CUFILE) {
	339	if (!(ALIGNED_4KB(io_u->xfer_buflen) \|\|
	340	(o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
	341	log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
	342	o->logged \|= LOGGED_BUFLEN_NOT_ALIGNED;
	343	}
	344
	345	if (!(ALIGNED_4KB(gpu_offset) \|\|
	346	(o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
	347	log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
	348	o->logged \|= LOGGED_GPU_OFFSET_NOT_ALIGNED;
	349	}
	350	}
	351
	352	if (io_u->ddir == DDIR_WRITE)
	353	rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
	354
	355	if (io_u->error != 0)
	356	break;
	357
	358	while (remaining > 0) {
	359	assert(gpu_offset + xfered <= o->total_mem);
	360	if (io_u->ddir == DDIR_READ) {
	361	if (o->cuda_io == IO_CUFILE) {
	362	sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
	363	io_offset + xfered, gpu_offset + xfered);
	364	if (sz == -1) {
	365	io_u->error = errno;
	366	log_err("cuFileRead: err=%d\n", errno);
	367	} else if (sz < 0) {
	368	io_u->error = EIO;
	369	log_err("cuFileRead: err=%ld:%s\n", sz,
	370	cufileop_status_error(-sz));
	371	}
	372	} else if (o->cuda_io == IO_POSIX) {
	373	sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
	374	remaining, io_offset + xfered);
	375	if (sz < 0) {
	376	io_u->error = errno;
	377	log_err("pread: err=%d\n", errno);
	378	}
	379	} else {
	380	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	381	io_u->error = -1;
	382	assert(0);
	383	}
	384	} else if (io_u->ddir == DDIR_WRITE) {
	385	if (o->cuda_io == IO_CUFILE) {
	386	sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
	387	io_offset + xfered, gpu_offset + xfered);
	388	if (sz == -1) {
	389	io_u->error = errno;
	390	log_err("cuFileWrite: err=%d\n", errno);
	391	} else if (sz < 0) {
	392	io_u->error = EIO;
	393	log_err("cuFileWrite: err=%ld:%s\n", sz,
	394	cufileop_status_error(-sz));
	395	}
	396	} else if (o->cuda_io == IO_POSIX) {
	397	sz = pwrite(io_u->file->fd,
	398	((char*) io_u->xfer_buf) + xfered,
	399	remaining, io_offset + xfered);
	400	if (sz < 0) {
	401	io_u->error = errno;
	402	log_err("pwrite: err=%d\n", errno);
	403	}
	404	} else {
	405	log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
	406	io_u->error = -1;
	407	assert(0);
	408	}
	409	} else {
	410	log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
	411	io_u->error = -1;
	412	assert(0);
	413	break;
	414	}
	415
	416	if (io_u->error != 0)
	417	break;
	418
	419	remaining -= sz;
	420	xfered += sz;
	421
	422	if (remaining != 0)
	423	log_info("Incomplete %s: %ld bytes remaining\n",
	424	io_u->ddir == DDIR_READ? "read" : "write", remaining);
	425	}
	426
	427	if (io_u->error != 0)
	428	break;
	429
	430	if (io_u->ddir == DDIR_READ)
	431	rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
	432	break;
	433
	434	default:
	435	io_u->error = EINVAL;
	436	break;
	437	}
	438
	439	if (io_u->error != 0) {
	440	log_err("IO failed\n");
	441	td_verror(td, io_u->error, "xfer");
	442	}
	443
	444	return FIO_Q_COMPLETED;
	445	}
	446
	447	static int fio_libcufile_open_file(struct thread_data td, struct fio_file f)
	448	{
	449	struct libcufile_options *o = td->eo;
	450	struct fio_libcufile_data *fcd = NULL;
	451	int rc;
	452	CUfileError_t status;
	453
	454	rc = generic_open_file(td, f);
	455	if (rc)
	456	return rc;
	457
	458	if (o->cuda_io == IO_CUFILE) {
	459	fcd = calloc(1, sizeof(*fcd));
	460	if (fcd == NULL) {
	461	rc = ENOMEM;
	462	goto exit_err;
	463	}
	464
	465	fcd->cf_descr.handle.fd = f->fd;
	466	fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
	467	status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
	468	if (status.err != CU_FILE_SUCCESS) {
	469	log_err("cufile register: err=%d:%s\n", status.err,
	470	fio_libcufile_get_cuda_error(status));
	471	rc = EINVAL;
	472	goto exit_err;
	473	}
	474	}
	475
	476	FILE_SET_ENG_DATA(f, fcd);
	477	return 0;
	478
	479	exit_err:
	480	if (fcd) {
	481	free(fcd);
	482	fcd = NULL;
	483	}
	484	if (f) {
	485	int rc2 = generic_close_file(td, f);
	486	if (rc2)
	487	log_err("generic_close_file: err=%d\n", rc2);
	488	}
	489	return rc;
	490	}
	491
	492	static int fio_libcufile_close_file(struct thread_data td, struct fio_file f)
	493	{
	494	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
	495	int rc;
	496
	497	if (fcd != NULL) {
	498	cuFileHandleDeregister(fcd->cf_handle);
	499	FILE_SET_ENG_DATA(f, NULL);
	500	free(fcd);
	501	}
	502
	503	rc = generic_close_file(td, f);
	504
	505	return rc;
	506	}
	507
	508	static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
	509	{
	510	struct libcufile_options *o = td->eo;
	511	int rc;
	512	CUfileError_t status;
	513
	514	o->total_mem = total_mem;
	515	o->logged = 0;
	516	o->cu_mem_ptr = NULL;
	517	o->junk_buf = NULL;
	518	td->orig_buffer = calloc(1, total_mem);
	519	if (!td->orig_buffer) {
	520	log_err("orig_buffer calloc failed: err=%d\n", errno);
	521	goto exit_error;
	522	}
	523
	524	if (o->cuda_io == IO_POSIX) {
	525	o->junk_buf = calloc(1, total_mem);
	526	if (o->junk_buf == NULL) {
	527	log_err("junk_buf calloc failed: err=%d\n", errno);
	528	goto exit_error;
	529	}
	530	}
	531
	532	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	533	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	534	if (rc != 0)
	535	goto exit_error;
	536	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	537	if (rc != 0)
	538	goto exit_error;
	539
	540	if (o->cuda_io == IO_CUFILE) {
	541	status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
	542	if (status.err != CU_FILE_SUCCESS) {
	543	log_err("cuFileBufRegister: err=%d:%s\n", status.err,
	544	fio_libcufile_get_cuda_error(status));
	545	goto exit_error;
	546	}
	547	}
	548
	549	return 0;
	550
	551	exit_error:
	552	if (td->orig_buffer) {
	553	free(td->orig_buffer);
	554	td->orig_buffer = NULL;
	555	}
	556	if (o->junk_buf) {
	557	free(o->junk_buf);
	558	o->junk_buf = NULL;
	559	}
	560	if (o->cu_mem_ptr) {
	561	cudaFree(o->cu_mem_ptr);
	562	o->cu_mem_ptr = NULL;
	563	}
	564	return 1;
	565	}
	566
	567	static void fio_libcufile_iomem_free(struct thread_data *td)
	568	{
	569	struct libcufile_options *o = td->eo;
	570
	571	if (o->junk_buf) {
	572	free(o->junk_buf);
	573	o->junk_buf = NULL;
	574	}
	575	if (o->cu_mem_ptr) {
	576	if (o->cuda_io == IO_CUFILE)
	577	cuFileBufDeregister(o->cu_mem_ptr);
	578	cudaFree(o->cu_mem_ptr);
	579	o->cu_mem_ptr = NULL;
	580	}
	581	if (td->orig_buffer) {
	582	free(td->orig_buffer);
	583	td->orig_buffer = NULL;
	584	}
	585	}
	586
	587	static void fio_libcufile_cleanup(struct thread_data *td)
	588	{
	589	struct libcufile_options *o = td->eo;
	590
	591	pthread_mutex_lock(&running_lock);
	592	running--;
	593	assert(running >= 0);
	594	if (running == 0) {
	595	/* only close the driver if initialized and
	596	this is the last worker thread */
	597	if (o->cuda_io == IO_CUFILE && cufile_initialized)
	598	cuFileDriverClose();
	599	cufile_initialized = 0;
	600	}
	601	pthread_mutex_unlock(&running_lock);
	602	}
	603
	604	FIO_STATIC struct ioengine_ops ioengine = {
	605	.name = "libcufile",
	606	.version = FIO_IOOPS_VERSION,
	607	.init = fio_libcufile_init,
	608	.queue = fio_libcufile_queue,
	609	.get_file_size = generic_get_file_size,
	610	.open_file = fio_libcufile_open_file,
	611	.close_file = fio_libcufile_close_file,
	612	.iomem_alloc = fio_libcufile_iomem_alloc,
	613	.iomem_free = fio_libcufile_iomem_free,
	614	.cleanup = fio_libcufile_cleanup,
	615	.flags = FIO_SYNCIO,
	616	.options = options,
	617	.option_struct_size = sizeof(struct libcufile_options)
	618	};
	619
	620	void fio_init fio_libcufile_register(void)
	621	{
	622	register_ioengine(&ioengine);
	623	}
	624
	625	void fio_exit fio_libcufile_unregister(void)
	626	{
	627	unregister_ioengine(&ioengine);
	628	}