From: Yufei Ren Date: Wed, 19 Apr 2017 19:42:13 +0000 (-0400) Subject: GPUDirect RDMA support X-Git-Tag: fio-2.20~33^2 X-Git-Url: https://git.kernel.dk/?p=fio.git;a=commitdiff_plain;h=03553853;hp=10c37df111032901e790cd57e7df43a42edd8ba8 GPUDirect RDMA support --- diff --git a/HOWTO b/HOWTO index ffdcb755..d9e881ab 100644 --- a/HOWTO +++ b/HOWTO @@ -1468,6 +1468,9 @@ Buffers and memory **mmapshared** Same as mmap, but use a MMAP_SHARED mapping. + **cudamalloc** + Use GPU memory as the buffers for GPUDirect RDMA benchmark. + The area allocated is a function of the maximum allowed bs size for the job, multiplied by the I/O depth given. Note that for **shmhuge** and **mmaphuge** to work, the system must have free huge pages allocated. This diff --git a/configure b/configure index f42489bb..75d0978d 100755 --- a/configure +++ b/configure @@ -186,6 +186,8 @@ for opt do ;; --disable-pmem) disable_pmem="yes" ;; + --enable-cuda) enable_cuda="yes" + ;; --help) show_help="yes" ;; @@ -206,7 +208,7 @@ if test "$show_help" = "yes" ; then echo "--esx Configure build options for esx" echo "--enable-gfio Enable building of gtk gfio" echo "--disable-numa Disable libnuma even if found" - echo "--disable-rdma Disable RDMA support even if found" + echo "--disable-rdma Disable RDMA support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" echo "--disable-lex Disable use of lex/yacc for math" @@ -214,6 +216,7 @@ if test "$show_help" = "yes" ; then echo "--enable-lex Enable use of lex/yacc for math" echo "--disable-shm Disable SHM support" echo "--disable-optimizations Don't enable compiler optimizations" + echo "--enable-cuda Enable GPUDirect RDMA support" exit $exit_val fi @@ -1990,6 +1993,21 @@ EOF fi echo "march_armv8_a_crc_crypto $march_armv8_a_crc_crypto" +########################################## +# cuda probe +cuda="no" +cat > $TMPC << EOF +#include <cuda.h> +int main(int argc, char **argv) +{ + return cuInit(0); +} +EOF +if test 
"$enable_cuda" == "yes" && compile_prog "" "-lcuda" "cuda"; then + cuda="yes" + LIBS="-lcuda $LIBS" +fi +echo "cuda $cuda" ############################################################################# @@ -2210,10 +2228,12 @@ fi if test "$disable_opt" = "yes" ; then output_sym "CONFIG_DISABLE_OPTIMIZATIONS" fi - if test "$zlib" = "no" ; then echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it." fi +if test "$cuda" = "yes" ; then + output_sym "CONFIG_CUDA" +fi echo "LIBS+=$LIBS" >> $config_host_mak echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio new file mode 100644 index 00000000..1e24624d --- /dev/null +++ b/examples/gpudirect-rdmaio-client.fio @@ -0,0 +1,15 @@ +# Example gpudirect rdma client job +[global] +ioengine=rdma +hostname=[hostname] +port=[port] +verb=[read/write/send/recv] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[sender] +rw=write +iodepth=1 +iodepth_batch_complete=1 diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio new file mode 100644 index 00000000..5fc4950d --- /dev/null +++ b/examples/gpudirect-rdmaio-server.fio @@ -0,0 +1,12 @@ +# Example rdma server job +[global] +ioengine=rdma +port=[port] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[receiver] +rw=read +iodepth=16 diff --git a/fio.1 b/fio.1 index b59025dc..138bcbb9 100644 --- a/fio.1 +++ b/fio.1 @@ -1309,6 +1309,9 @@ Same as \fBmmap\fR, but use huge files as backing. .TP .B mmapshared Same as \fBmmap\fR, but use a MMAP_SHARED mapping. +.TP +.B cudamalloc +Use GPU memory as the buffers for GPUDirect RDMA benchmark. The ioengine must be \fBrdma\fR. 
.RE .P The amount of memory allocated is the maximum allowed \fBblocksize\fR for the diff --git a/fio.h b/fio.h index b67613e6..6b2b669d 100644 --- a/fio.h +++ b/fio.h @@ -59,6 +59,10 @@ #define MPOL_LOCAL MPOL_MAX #endif +#ifdef CONFIG_CUDA +#include <cuda.h> +#endif + /* * offset generator types */ @@ -408,6 +412,18 @@ struct thread_data { struct steadystate_data ss; char verror[FIO_VERROR_SIZE]; + +#ifdef CONFIG_CUDA + /* * for GPU memory management */ + int gpu_dev_cnt; + int gpu_dev_id; + CUdevice cu_dev; + CUcontext cu_ctx; + CUdeviceptr dev_mem_ptr; +#endif + }; /* diff --git a/io_u.c b/io_u.c index 88f35c91..39d68d1f 100644 --- a/io_u.c +++ b/io_u.c @@ -1654,6 +1654,10 @@ struct io_u *get_io_u(struct thread_data *td) populate_verify_io_u(td, io_u); do_scramble = 0; } +#ifdef CONFIG_CUDA + if (td->o.mem_type == MEM_CUDA_MALLOC) + do_scramble = 0; +#endif } else if (io_u->ddir == DDIR_READ) { /* * Reset the buf_filled parameters so next time if the @@ -1674,8 +1678,10 @@ out: if (!td_io_prep(td, io_u)) { if (!td->o.disable_lat) fio_gettime(&io_u->start_time, NULL); + if (do_scramble) small_content_scramble(io_u); + return io_u; } err_put: @@ -2043,6 +2049,10 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write, { struct thread_options *o = &td->o; +#ifdef CONFIG_CUDA + if (o->mem_type == MEM_CUDA_MALLOC) return; +#endif + if (o->compress_percentage || o->dedupe_percentage) { unsigned int perc = td->o.compress_percentage; struct frand_state *rs; diff --git a/memory.c b/memory.c index 9e73f100..fe657225 100644 --- a/memory.c +++ b/memory.c @@ -207,6 +207,75 @@ static void free_mem_malloc(struct thread_data *td) free(td->orig_buffer); } +#ifdef CONFIG_CUDA + +static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) +{ + CUresult ret; + char name[128]; + + ret = cuInit(0); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed initialize cuda driver api\n"); + return 1; + } + + ret = cuDeviceGetCount(&td->gpu_dev_cnt); + if 
(ret != CUDA_SUCCESS) { + log_err("fio: failed get device count\n"); + return 1; + } + dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt); + + if (td->gpu_dev_cnt == 0) { + log_err("fio: no GPU device found. " + "Can not perform GPUDirect RDMA.\n"); + return 1; + } + + td->gpu_dev_id = td->o.gpu_dev_id; + ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get GPU device\n"); + return 1; + } + + ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device name\n"); + return 1; + } + dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ + td->gpu_dev_id, name); + + ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed to create cuda context: %d\n", ret); + return 1; + } + + ret = cuMemAlloc(&td->dev_mem_ptr, total_mem); + if (ret != CUDA_SUCCESS) { + log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem); + return 1; + } + td->orig_buffer = (void *) td->dev_mem_ptr; + + dprint(FD_MEM, "cudaMalloc %llu %p\n", \ + (unsigned long long) total_mem, td->orig_buffer); + return 0; +} + +static void free_mem_cudamalloc(struct thread_data *td) +{ + if ((void *) td->dev_mem_ptr != NULL) + cuMemFree(td->dev_mem_ptr); + + if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS) + log_err("fio: failed to destroy cuda context\n"); +} +#endif + /* * Set up the buffer area we need for io. 
*/ @@ -246,6 +315,10 @@ int allocate_io_mem(struct thread_data *td) else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) ret = alloc_mem_mmap(td, total_mem); +#ifdef CONFIG_CUDA + else if (td->o.mem_type == MEM_CUDA_MALLOC) + ret = alloc_mem_cudamalloc(td, total_mem); +#endif else { log_err("fio: bad mem type: %d\n", td->o.mem_type); ret = 1; @@ -275,6 +348,10 @@ void free_io_mem(struct thread_data *td) else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) free_mem_mmap(td, total_mem); +#ifdef CONFIG_CUDA + else if (td->o.mem_type == MEM_CUDA_MALLOC) + free_mem_cudamalloc(td); +#endif else log_err("Bad memory type %u\n", td->o.mem_type); diff --git a/options.c b/options.c index e0deab0a..85574d7f 100644 --- a/options.c +++ b/options.c @@ -2603,6 +2603,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = MEM_MMAPHUGE, .help = "Like mmap, but use huge pages", }, +#endif +#ifdef CONFIG_CUDA + { .ival = "cudamalloc", + .oval = MEM_CUDA_MALLOC, + .help = "Allocate GPU device memory for GPUDirect RDMA", + }, #endif }, }, @@ -3562,6 +3568,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .type = FIO_OPT_UNSUPPORTED, .help = "Build fio with libnuma-dev(el) to enable this option", }, +#endif +#ifdef CONFIG_CUDA + { + .name = "gpu_dev_id", + .lname = "GPU device ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gpu_dev_id), + .help = "Set GPU device ID for GPUDirect RDMA", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, #endif { .name = "end_fsync", diff --git a/thread_options.h b/thread_options.h index 2b2df338..393e51ef 100644 --- a/thread_options.h +++ b/thread_options.h @@ -20,6 +20,9 @@ enum fio_memtype { MEM_MMAP, /* use anonynomous mmap */ MEM_MMAPHUGE, /* memory mapped huge file */ MEM_MMAPSHARED, /* use mmap with shared flag */ +#ifdef CONFIG_CUDA + MEM_CUDA_MALLOC,/* use GPU memory */ +#endif 
}; #define ERROR_STR_MAX 128 @@ -198,6 +201,8 @@ struct thread_options { unsigned short numa_mem_mode; unsigned int numa_mem_prefer_node; char *numa_memnodes; + unsigned int gpu_dev_id; + unsigned int iolog; unsigned int rwmixcycle; unsigned int rwmix[DDIR_RWDIR_CNT];