GPUDirect RDMA support
author     Yufei Ren <yren@us.ibm.com>
           Wed, 19 Apr 2017 19:42:13 +0000 (15:42 -0400)
committer  Yufei Ren <yren@us.ibm.com>
           Wed, 26 Apr 2017 20:44:29 +0000 (16:44 -0400)
HOWTO
configure
examples/gpudirect-rdmaio-client.fio [new file with mode: 0644]
examples/gpudirect-rdmaio-server.fio [new file with mode: 0644]
fio.1
fio.h
io_u.c
memory.c
options.c
thread_options.h

diff --git a/HOWTO b/HOWTO
index ffdcb755f1a25a0f41158ae2767826a3ec910ac8..d9e881abdcc3aa2495cc18957d3b4c681d943f8d 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1468,6 +1468,9 @@ Buffers and memory
                **mmapshared**
                        Same as mmap, but use a MMAP_SHARED mapping.
 
+               **cudamalloc**
+                       Use GPU memory as the buffers for GPUDirect RDMA benchmarking. The :option:`ioengine` must be **rdma**.
+
        The area allocated is a function of the maximum allowed bs size for the job,
        multiplied by the I/O depth given. Note that for **shmhuge** and
        **mmaphuge** to work, the system must have free huge pages allocated. This
diff --git a/configure b/configure
index f42489bbf87bd14058fb1c9c77118e3095b2faab..75d0978d305442b6a8e2306d9c4a1fd776f5a5ce 100755 (executable)
--- a/configure
+++ b/configure
@@ -186,6 +186,8 @@ for opt do
   ;;
   --disable-pmem) disable_pmem="yes"
   ;;
+  --enable-cuda) enable_cuda="yes"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -206,7 +208,7 @@ if test "$show_help" = "yes" ; then
   echo "--esx                   Configure build options for esx"
   echo "--enable-gfio           Enable building of gtk gfio"
   echo "--disable-numa          Disable libnuma even if found"
-  echo "--disable-rdma         Disable RDMA support even if found"
+  echo "--disable-rdma          Disable RDMA support even if found"
   echo "--disable-gfapi         Disable gfapi"
   echo "--enable-libhdfs        Enable hdfs support"
   echo "--disable-lex           Disable use of lex/yacc for math"
@@ -214,6 +216,7 @@ if test "$show_help" = "yes" ; then
   echo "--enable-lex            Enable use of lex/yacc for math"
   echo "--disable-shm           Disable SHM support"
   echo "--disable-optimizations Don't enable compiler optimizations"
+  echo "--enable-cuda           Enable GPUDirect RDMA support"
   exit $exit_val
 fi
 
@@ -1990,6 +1993,21 @@ EOF
 fi
 echo "march_armv8_a_crc_crypto      $march_armv8_a_crc_crypto"
 
+##########################################
+# cuda probe
+cuda="no"
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+  return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+echo "cuda                          $cuda"
 
 #############################################################################
 
@@ -2210,10 +2228,12 @@ fi
 if test "$disable_opt" = "yes" ; then
   output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
 fi
-
 if test "$zlib" = "no" ; then
   echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
 fi
+if test "$cuda" = "yes" ; then
+  output_sym "CONFIG_CUDA"
+fi
 
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio
new file mode 100644 (file)
index 0000000..1e24624
--- /dev/null
+++ b/examples/gpudirect-rdmaio-client.fio
@@ -0,0 +1,15 @@
+# Example gpudirect rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio
new file mode 100644 (file)
index 0000000..5fc4950
--- /dev/null
+++ b/examples/gpudirect-rdmaio-server.fio
@@ -0,0 +1,12 @@
+# Example gpudirect rdma server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff --git a/fio.1 b/fio.1
index b59025dc983375408350d6a09ba7afbb59428829..138bcbb988178b4edf1eebcd26bf5915113646ce 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1309,6 +1309,9 @@ Same as \fBmmap\fR, but use huge files as backing.
 .TP
 .B mmapshared
 Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmarking. The ioengine must be \fBrdma\fR.
 .RE
 .P
 The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
diff --git a/fio.h b/fio.h
index b67613e6d10bdcb4309e9df6c14f7dbae9cc2f1e..6b2b669d3ccc93ac4aafadb8b59da91c537d6102 100644 (file)
--- a/fio.h
+++ b/fio.h
 #define MPOL_LOCAL MPOL_MAX
 #endif
 
+#ifdef CONFIG_CUDA
+#include <cuda.h>
+#endif
+
 /*
  * offset generator types
  */
@@ -408,6 +412,18 @@ struct thread_data {
        struct steadystate_data ss;
 
        char verror[FIO_VERROR_SIZE];
+
+#ifdef CONFIG_CUDA
+       /*
+        * for GPU memory management
+        */
+       int gpu_dev_cnt;
+       int gpu_dev_id;
+       CUdevice  cu_dev;
+       CUcontext cu_ctx;
+       CUdeviceptr dev_mem_ptr;
+#endif
+
 };
 
 /*
diff --git a/io_u.c b/io_u.c
index 88f35c916078f8d92f2e50ff70eef3a88b0f46a1..39d68d1f9ae082fc391a66323027ba30157081f4 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1654,6 +1654,10 @@ struct io_u *get_io_u(struct thread_data *td)
                                populate_verify_io_u(td, io_u);
                                do_scramble = 0;
                        }
+#ifdef CONFIG_CUDA
+                       if (td->o.mem_type == MEM_CUDA_MALLOC)
+                               do_scramble = 0;
+#endif
                } else if (io_u->ddir == DDIR_READ) {
                        /*
                         * Reset the buf_filled parameters so next time if the
@@ -1674,8 +1678,10 @@ out:
        if (!td_io_prep(td, io_u)) {
                if (!td->o.disable_lat)
                        fio_gettime(&io_u->start_time, NULL);
+
                if (do_scramble)
                        small_content_scramble(io_u);
+
                return io_u;
        }
 err_put:
@@ -2043,6 +2049,10 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write,
 {
        struct thread_options *o = &td->o;
 
+#ifdef CONFIG_CUDA
+       if (o->mem_type == MEM_CUDA_MALLOC) return;
+#endif
+
        if (o->compress_percentage || o->dedupe_percentage) {
                unsigned int perc = td->o.compress_percentage;
                struct frand_state *rs;
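
The io_u.c changes above disable small_content_scramble() and fill_io_buffer() when mem_type is MEM_CUDA_MALLOC, since those helpers write through td->orig_buffer on the host and a cuMemAlloc()ed device pointer cannot be dereferenced there. If a non-zero payload were ever wanted in the device buffer, it would have to be staged through the driver API instead; the helper below is only an illustration of that idea and is not something this patch adds.

    /*
     * Illustrative only (not part of this patch): stage a host-side
     * pattern into a cuMemAlloc()ed buffer.  Assumes cuInit() has run
     * and a CUDA context is current; dev_buf/host_pattern/len are
     * hypothetical names.
     */
    #include <stddef.h>
    #include <cuda.h>

    static int fill_cuda_buffer(CUdeviceptr dev_buf, const void *host_pattern,
                                size_t len)
    {
            /* a plain zero fill could instead use cuMemsetD8(dev_buf, 0, len) */
            if (cuMemcpyHtoD(dev_buf, host_pattern, len) != CUDA_SUCCESS)
                    return 1;
            return 0;
    }
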
diff --git a/memory.c b/memory.c
index 9e73f100740a20c0f7d5c17af23366d92e436186..fe657225a2739bf878b2ddfb0909771ccb6e21ad 100644 (file)
--- a/memory.c
+++ b/memory.c
@@ -207,6 +207,75 @@ static void free_mem_malloc(struct thread_data *td)
        free(td->orig_buffer);
 }
 
+#ifdef CONFIG_CUDA
+
+static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem)
+{
+       CUresult ret;
+       char name[128];
+
+       ret = cuInit(0);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to initialize the CUDA driver API\n");
+               return 1;
+       }
+
+       ret = cuDeviceGetCount(&td->gpu_dev_cnt);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get device count\n");
+               return 1;
+       }
+       dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt);
+
+       if (td->gpu_dev_cnt == 0) {
+               log_err("fio: no GPU device found. "
+                       "Cannot perform GPUDirect RDMA.\n");
+               return 1;
+       }
+
+       td->gpu_dev_id = td->o.gpu_dev_id;
+       ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get GPU device\n");
+               return 1;
+       }
+
+       ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get device name\n");
+               return 1;
+       }
+       dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \
+              td->gpu_dev_id, name);
+
+       ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to create cuda context: %d\n", ret);
+               return 1;
+       }
+
+       ret = cuMemAlloc(&td->dev_mem_ptr, total_mem);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem);
+               return 1;
+       }
+       td->orig_buffer = (void *) td->dev_mem_ptr;
+
+       dprint(FD_MEM, "cuMemAlloc %llu %p\n",                         \
+              (unsigned long long) total_mem, td->orig_buffer);
+       return 0;
+}
+
+static void free_mem_cudamalloc(struct thread_data *td)
+{
+       if ((void *) td->dev_mem_ptr != NULL)
+               cuMemFree(td->dev_mem_ptr);
+
+       if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS)
+               log_err("fio: failed to destroy cuda context\n");
+}
+#endif
+
 /*
  * Set up the buffer area we need for io.
  */
@@ -246,6 +315,10 @@ int allocate_io_mem(struct thread_data *td)
        else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
                 td->o.mem_type == MEM_MMAPSHARED)
                ret = alloc_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+       else if (td->o.mem_type == MEM_CUDA_MALLOC)
+               ret = alloc_mem_cudamalloc(td, total_mem);
+#endif
        else {
                log_err("fio: bad mem type: %d\n", td->o.mem_type);
                ret = 1;
@@ -275,6 +348,10 @@ void free_io_mem(struct thread_data *td)
        else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
                 td->o.mem_type == MEM_MMAPSHARED)
                free_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+       else if (td->o.mem_type == MEM_CUDA_MALLOC)
+               free_mem_cudamalloc(td);
+#endif
        else
                log_err("Bad memory type %u\n", td->o.mem_type);
 
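
alloc_mem_cudamalloc() above follows the usual driver-API lifecycle: cuInit(), select the device given by gpu_dev_id, create a context, then cuMemAlloc() the whole buffer area, with free_mem_cudamalloc() releasing the allocation and the context on teardown. The device pointer is stored in td->orig_buffer, which the rdma ioengine later registers with the HCA (via ibv_reg_mr(); with the NVIDIA peer-memory kernel module loaded, that registration is what enables GPUDirect RDMA). The standalone sketch below mirrors the same sequence and can be used outside fio to confirm that an allocation of the intended size succeeds on the chosen device; device 0 and the 16 MiB size are assumptions, not values taken from the patch.

    /*
     * Minimal sketch of the same allocation lifecycle, outside fio
     * (not part of the patch).  Device 0 and 16 MiB are assumptions.
     */
    #include <stdio.h>
    #include <cuda.h>

    int main(void)
    {
            CUdevice dev;
            CUcontext ctx;
            CUdeviceptr buf;
            size_t len = 16 * 1024 * 1024;
            char name[128];

            if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&dev, 0) != CUDA_SUCCESS)
                    return 1;
            cuDeviceGetName(name, sizeof(name), dev);
            if (cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev) != CUDA_SUCCESS)
                    return 1;
            if (cuMemAlloc(&buf, len) != CUDA_SUCCESS) {
                    cuCtxDestroy(ctx);
                    return 1;
            }
            /* fio stores the equivalent pointer in td->orig_buffer */
            printf("allocated %zu bytes on %s\n", len, name);
            cuMemFree(buf);
            cuCtxDestroy(ctx);
            return 0;
    }
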
diff --git a/options.c b/options.c
index e0deab0a4d1020eed452078b521341a4c514bc15..85574d7f818a2c33a9fc534d74793431914a4f14 100644 (file)
--- a/options.c
+++ b/options.c
@@ -2603,6 +2603,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .oval = MEM_MMAPHUGE,
                            .help = "Like mmap, but use huge pages",
                          },
+#endif
+#ifdef CONFIG_CUDA
+                         { .ival = "cudamalloc",
+                           .oval = MEM_CUDA_MALLOC,
+                           .help = "Allocate GPU device memory for GPUDirect RDMA",
+                         },
 #endif
                  },
        },
@@ -3562,6 +3568,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .type   = FIO_OPT_UNSUPPORTED,
                .help   = "Build fio with libnuma-dev(el) to enable this option",
        },
+#endif
+#ifdef CONFIG_CUDA
+       {
+               .name   = "gpu_dev_id",
+               .lname  = "GPU device ID",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, gpu_dev_id),
+               .help   = "Set GPU device ID for GPUDirect RDMA",
+               .def    = "0",
+               .category = FIO_OPT_C_GENERAL,
+               .group  = FIO_OPT_G_INVALID,
+       },
 #endif
        {
                .name   = "end_fsync",
diff --git a/thread_options.h b/thread_options.h
index 2b2df3384fb12f5c24c311f32bc259515919d439..393e51ef976f5ead6d45afb3375678d9dc718556 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -20,6 +20,9 @@ enum fio_memtype {
        MEM_MMAP,       /* use anonynomous mmap */
        MEM_MMAPHUGE,   /* memory mapped huge file */
        MEM_MMAPSHARED, /* use mmap with shared flag */
+#ifdef CONFIG_CUDA
+       MEM_CUDA_MALLOC,/* use GPU memory */
+#endif
 };
 
 #define ERROR_STR_MAX  128
@@ -198,6 +201,8 @@ struct thread_options {
        unsigned short numa_mem_mode;
        unsigned int numa_mem_prefer_node;
        char *numa_memnodes;
+       unsigned int gpu_dev_id;
+
        unsigned int iolog;
        unsigned int rwmixcycle;
        unsigned int rwmix[DDIR_RWDIR_CNT];