GPUDirect RDMA support
author     Yufei Ren <yren@us.ibm.com>
           Wed, 19 Apr 2017 19:42:13 +0000 (15:42 -0400)
committer  Yufei Ren <yren@us.ibm.com>
           Wed, 26 Apr 2017 20:44:29 +0000 (16:44 -0400)
HOWTO
configure
examples/gpudirect-rdmaio-client.fio [new file with mode: 0644]
examples/gpudirect-rdmaio-server.fio [new file with mode: 0644]
fio.1
fio.h
io_u.c
memory.c
options.c
thread_options.h

diff --git a/HOWTO b/HOWTO
index ffdcb755f1a25a0f41158ae2767826a3ec910ac8..d9e881abdcc3aa2495cc18957d3b4c681d943f8d 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1468,6 +1468,9 @@ Buffers and memory
                **mmapshared**
                        Same as mmap, but use a MMAP_SHARED mapping.
 
+               **cudamalloc**
+                       Use GPU memory as the buffers for GPUDirect RDMA benchmarking. The :option:`ioengine` must be **rdma**.
+
        The area allocated is a function of the maximum allowed bs size for the job,
        multiplied by the I/O depth given. Note that for **shmhuge** and
        **mmaphuge** to work, the system must have free huge pages allocated. This
diff --git a/configure b/configure
index f42489bbf87bd14058fb1c9c77118e3095b2faab..75d0978d305442b6a8e2306d9c4a1fd776f5a5ce 100755 (executable)
--- a/configure
+++ b/configure
@@ -186,6 +186,8 @@ for opt do
   ;;
   --disable-pmem) disable_pmem="yes"
   ;;
+  --enable-cuda) enable_cuda="yes"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -206,7 +208,7 @@ if test "$show_help" = "yes" ; then
   echo "--esx                   Configure build options for esx"
   echo "--enable-gfio           Enable building of gtk gfio"
   echo "--disable-numa          Disable libnuma even if found"
-  echo "--disable-rdma         Disable RDMA support even if found"
+  echo "--disable-rdma          Disable RDMA support even if found"
   echo "--disable-gfapi         Disable gfapi"
   echo "--enable-libhdfs        Enable hdfs support"
   echo "--disable-lex           Disable use of lex/yacc for math"
@@ -214,6 +216,7 @@ if test "$show_help" = "yes" ; then
   echo "--enable-lex            Enable use of lex/yacc for math"
   echo "--disable-shm           Disable SHM support"
   echo "--disable-optimizations Don't enable compiler optimizations"
+  echo "--enable-cuda           Enable GPUDirect RDMA support"
   exit $exit_val
 fi
 
@@ -1990,6 +1993,21 @@ EOF
 fi
 echo "march_armv8_a_crc_crypto      $march_armv8_a_crc_crypto"
 
+##########################################
+# cuda probe
+cuda="no"
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+  return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+echo "cuda                          $cuda"
 
 #############################################################################
 
@@ -2210,10 +2228,12 @@ fi
 if test "$disable_opt" = "yes" ; then
   output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
 fi
-
 if test "$zlib" = "no" ; then
   echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
 fi
+if test "$cuda" = "yes" ; then
+  output_sym "CONFIG_CUDA"
+fi
 
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio
new file mode 100644 (file)
index 0000000..1e24624
--- /dev/null
+++ b/examples/gpudirect-rdmaio-client.fio
@@ -0,0 +1,15 @@
+# Example gpudirect rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio
new file mode 100644 (file)
index 0000000..5fc4950
--- /dev/null
+++ b/examples/gpudirect-rdmaio-server.fio
@@ -0,0 +1,12 @@
+# Example gpudirect rdma server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff --git a/fio.1 b/fio.1
index b59025dc983375408350d6a09ba7afbb59428829..138bcbb988178b4edf1eebcd26bf5915113646ce 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1309,6 +1309,9 @@ Same as \fBmmap\fR, but use huge files as backing.
 .TP
 .B mmapshared
 Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmarking. The ioengine must be \fBrdma\fR.
 .RE
 .P
 The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
diff --git a/fio.h b/fio.h
index b67613e6d10bdcb4309e9df6c14f7dbae9cc2f1e..6b2b669d3ccc93ac4aafadb8b59da91c537d6102 100644 (file)
--- a/fio.h
+++ b/fio.h
 #define MPOL_LOCAL MPOL_MAX
 #endif
 
+#ifdef CONFIG_CUDA
+#include <cuda.h>
+#endif
+
 /*
  * offset generator types
  */
@@ -408,6 +412,18 @@ struct thread_data {
        struct steadystate_data ss;
 
        char verror[FIO_VERROR_SIZE];
+
+#ifdef CONFIG_CUDA
+       /*
+        * for GPU memory management
+        */
+       int gpu_dev_cnt;
+       int gpu_dev_id;
+       CUdevice  cu_dev;
+       CUcontext cu_ctx;
+       CUdeviceptr dev_mem_ptr;
+#endif
+
 };
 
 /*
diff --git a/io_u.c b/io_u.c
index 88f35c916078f8d92f2e50ff70eef3a88b0f46a1..39d68d1f9ae082fc391a66323027ba30157081f4 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1654,6 +1654,10 @@ struct io_u *get_io_u(struct thread_data *td)
                                populate_verify_io_u(td, io_u);
                                do_scramble = 0;
                        }
+#ifdef CONFIG_CUDA
+                       if (td->o.mem_type == MEM_CUDA_MALLOC)
+                               do_scramble = 0;
+#endif
                } else if (io_u->ddir == DDIR_READ) {
                        /*
                         * Reset the buf_filled parameters so next time if the
@@ -1674,8 +1678,10 @@ out:
        if (!td_io_prep(td, io_u)) {
                if (!td->o.disable_lat)
                        fio_gettime(&io_u->start_time, NULL);
+
                if (do_scramble)
                        small_content_scramble(io_u);
+
                return io_u;
        }
 err_put:
@@ -2043,6 +2049,10 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write,
 {
        struct thread_options *o = &td->o;
 
+#ifdef CONFIG_CUDA
+       if (o->mem_type == MEM_CUDA_MALLOC) return;
+#endif
+
        if (o->compress_percentage || o->dedupe_percentage) {
                unsigned int perc = td->o.compress_percentage;
                struct frand_state *rs;
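
The io_u.c changes above disable small_content_scramble() and fill_io_buffer() when mem_type is MEM_CUDA_MALLOC, since those helpers write through td->orig_buffer on the host and a cuMemAlloc()ed device pointer cannot be dereferenced there. If a non-zero payload were ever wanted in the device buffer, it would have to be staged through the driver API instead; the helper below is only an illustration of that idea and is not something this patch adds.

    /*
     * Illustrative only (not part of this patch): stage a host-side
     * pattern into a cuMemAlloc()ed buffer.  Assumes cuInit() has run
     * and a CUDA context is current; dev_buf/host_pattern/len are
     * hypothetical names.
     */
    #include <stddef.h>
    #include <cuda.h>

    static int fill_cuda_buffer(CUdeviceptr dev_buf, const void *host_pattern,
                                size_t len)
    {
            /* a plain zero fill could instead use cuMemsetD8(dev_buf, 0, len) */
            if (cuMemcpyHtoD(dev_buf, host_pattern, len) != CUDA_SUCCESS)
                    return 1;
            return 0;
    }
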
diff --git a/memory.c b/memory.c
index 9e73f100740a20c0f7d5c17af23366d92e436186..fe657225a2739bf878b2ddfb0909771ccb6e21ad 100644 (file)
--- a/memory.c
+++ b/memory.c
@@ -207,6 +207,75 @@ static void free_mem_malloc(struct thread_data *td)
        free(td->orig_buffer);
 }
 
+#ifdef CONFIG_CUDA
+
+static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem)
+{
+       CUresult ret;
+       char name[128];
+
+       ret = cuInit(0);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to initialize the CUDA driver API\n");
+               return 1;
+       }
+
+       ret = cuDeviceGetCount(&td->gpu_dev_cnt);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get device count\n");
+               return 1;
+       }
+       dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt);
+
+       if (td->gpu_dev_cnt == 0) {
+               log_err("fio: no GPU device found. "
+                       "Cannot perform GPUDirect RDMA.\n");
+               return 1;
+       }
+
+       td->gpu_dev_id = td->o.gpu_dev_id;
+       ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get GPU device\n");
+               return 1;
+       }
+
+       ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to get device name\n");
+               return 1;
+       }
+       dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \
+              td->gpu_dev_id, name);
+
+       ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: failed to create cuda context: %d\n", ret);
+               return 1;
+       }
+
+       ret = cuMemAlloc(&td->dev_mem_ptr, total_mem);
+       if (ret != CUDA_SUCCESS) {
+               log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem);
+               return 1;
+       }
+       td->orig_buffer = (void *) td->dev_mem_ptr;
+
+       dprint(FD_MEM, "cuMemAlloc %llu %p\n",                         \
+              (unsigned long long) total_mem, td->orig_buffer);
+       return 0;
+}
+
+static void free_mem_cudamalloc(struct thread_data *td)
+{
+       if ((void *) td->dev_mem_ptr != NULL)
+               cuMemFree(td->dev_mem_ptr);
+
+       if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS)
+               log_err("fio: failed to destroy cuda context\n");
+}
+#endif
+
 /*
  * Set up the buffer area we need for io.
  */
@@ -246,6 +315,10 @@ int allocate_io_mem(struct thread_data *td)
        else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
                 td->o.mem_type == MEM_MMAPSHARED)
                ret = alloc_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+       else if (td->o.mem_type == MEM_CUDA_MALLOC)
+               ret = alloc_mem_cudamalloc(td, total_mem);
+#endif
        else {
                log_err("fio: bad mem type: %d\n", td->o.mem_type);
                ret = 1;
@@ -275,6 +348,10 @@ void free_io_mem(struct thread_data *td)
        else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
                 td->o.mem_type == MEM_MMAPSHARED)
                free_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+       else if (td->o.mem_type == MEM_CUDA_MALLOC)
+               free_mem_cudamalloc(td);
+#endif
        else
                log_err("Bad memory type %u\n", td->o.mem_type);
 
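
alloc_mem_cudamalloc() above follows the usual driver-API lifecycle: cuInit(), select the device given by gpu_dev_id, create a context, then cuMemAlloc() the whole buffer area, with free_mem_cudamalloc() releasing the allocation and the context on teardown. The device pointer is stored in td->orig_buffer, which the rdma ioengine later registers with the HCA (via ibv_reg_mr(); with the NVIDIA peer-memory kernel module loaded, that registration is what enables GPUDirect RDMA). The standalone sketch below mirrors the same sequence and can be used outside fio to confirm that an allocation of the intended size succeeds on the chosen device; device 0 and the 16 MiB size are assumptions, not values taken from the patch.

    /*
     * Minimal sketch of the same allocation lifecycle, outside fio
     * (not part of the patch).  Device 0 and 16 MiB are assumptions.
     */
    #include <stdio.h>
    #include <cuda.h>

    int main(void)
    {
            CUdevice dev;
            CUcontext ctx;
            CUdeviceptr buf;
            size_t len = 16 * 1024 * 1024;
            char name[128];

            if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&dev, 0) != CUDA_SUCCESS)
                    return 1;
            cuDeviceGetName(name, sizeof(name), dev);
            if (cuCtxCreate(&ctx, CU_CTX_MAP_HOST, dev) != CUDA_SUCCESS)
                    return 1;
            if (cuMemAlloc(&buf, len) != CUDA_SUCCESS) {
                    cuCtxDestroy(ctx);
                    return 1;
            }
            /* fio stores the equivalent pointer in td->orig_buffer */
            printf("allocated %zu bytes on %s\n", len, name);
            cuMemFree(buf);
            cuCtxDestroy(ctx);
            return 0;
    }
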
diff --git a/options.c b/options.c
index e0deab0a4d1020eed452078b521341a4c514bc15..85574d7f818a2c33a9fc534d74793431914a4f14 100644 (file)
--- a/options.c
+++ b/options.c
@@ -2603,6 +2603,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .oval = MEM_MMAPHUGE,
                            .help = "Like mmap, but use huge pages",
                          },
+#endif
+#ifdef CONFIG_CUDA
+                         { .ival = "cudamalloc",
+                           .oval = MEM_CUDA_MALLOC,
+                           .help = "Allocate GPU device memory for GPUDirect RDMA",
+                         },
 #endif
                  },
        },
@@ -3562,6 +3568,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .type   = FIO_OPT_UNSUPPORTED,
                .help   = "Build fio with libnuma-dev(el) to enable this option",
        },
+#endif
+#ifdef CONFIG_CUDA
+       {
+               .name   = "gpu_dev_id",
+               .lname  = "GPU device ID",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, gpu_dev_id),
+               .help   = "Set GPU device ID for GPUDirect RDMA",
+               .def    = "0",
+               .category = FIO_OPT_C_GENERAL,
+               .group  = FIO_OPT_G_INVALID,
+       },
 #endif
        {
                .name   = "end_fsync",
diff --git a/thread_options.h b/thread_options.h
index 2b2df3384fb12f5c24c311f32bc259515919d439..393e51ef976f5ead6d45afb3375678d9dc718556 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -20,6 +20,9 @@ enum fio_memtype {
        MEM_MMAP,       /* use anonynomous mmap */
        MEM_MMAPHUGE,   /* memory mapped huge file */
        MEM_MMAPSHARED, /* use mmap with shared flag */
+#ifdef CONFIG_CUDA
+       MEM_CUDA_MALLOC,/* use GPU memory */
+#endif
 };
 
 #define ERROR_STR_MAX  128
@@ -198,6 +201,8 @@ struct thread_options {
        unsigned short numa_mem_mode;
        unsigned int numa_mem_prefer_node;
        char *numa_memnodes;
+       unsigned int gpu_dev_id;
+
        unsigned int iolog;
        unsigned int rwmixcycle;
        unsigned int rwmix[DDIR_RWDIR_CNT];