From: Yufei Ren Date: Wed, 19 Apr 2017 19:42:13 +0000 (-0400) Subject: GPUDirect RDMA support X-Git-Tag: fio-2.20~33^2 X-Git-Url: https://git.kernel.dk/?p=fio.git;a=commitdiff_plain;h=03553853;hp=10c37df111032901e790cd57e7df43a42edd8ba8 GPUDirect RDMA support --- diff --git a/HOWTO b/HOWTO index ffdcb755..d9e881ab 100644 --- a/HOWTO +++ b/HOWTO @@ -1468,6 +1468,9 @@ Buffers and memory **mmapshared** Same as mmap, but use a MMAP_SHARED mapping. + **cudamalloc** + Use GPU memory as the buffers for GPUDirect RDMA benchmark. + The area allocated is a function of the maximum allowed bs size for the job, multiplied by the I/O depth given. Note that for **shmhuge** and **mmaphuge** to work, the system must have free huge pages allocated. This diff --git a/configure b/configure index f42489bb..75d0978d 100755 --- a/configure +++ b/configure @@ -186,6 +186,8 @@ for opt do ;; --disable-pmem) disable_pmem="yes" ;; + --enable-cuda) enable_cuda="yes" + ;; --help) show_help="yes" ;; @@ -206,7 +208,7 @@ if test "$show_help" = "yes" ; then echo "--esx Configure build options for esx" echo "--enable-gfio Enable building of gtk gfio" echo "--disable-numa Disable libnuma even if found" - echo "--disable-rdma Disable RDMA support even if found" + echo "--disable-rdma Disable RDMA support even if found" echo "--disable-gfapi Disable gfapi" echo "--enable-libhdfs Enable hdfs support" echo "--disable-lex Disable use of lex/yacc for math" @@ -214,6 +216,7 @@ if test "$show_help" = "yes" ; then echo "--enable-lex Enable use of lex/yacc for math" echo "--disable-shm Disable SHM support" echo "--disable-optimizations Don't enable compiler optimizations" + echo "--enable-cuda Enable GPUDirect RDMA support" exit $exit_val fi @@ -1990,6 +1993,21 @@ EOF fi echo "march_armv8_a_crc_crypto $march_armv8_a_crc_crypto" +########################################## +# cuda probe +cuda="no" +cat > $TMPC << EOF +#include <cuda.h> +int main(int argc, char **argv) +{ + return cuInit(0); +} +EOF +if test 
"$enable_cuda" == "yes" && compile_prog "" "-lcuda" "cuda"; then + cuda="yes" + LIBS="-lcuda $LIBS" +fi +echo "cuda $cuda" ############################################################################# @@ -2210,10 +2228,12 @@ fi if test "$disable_opt" = "yes" ; then output_sym "CONFIG_DISABLE_OPTIMIZATIONS" fi - if test "$zlib" = "no" ; then echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it." fi +if test "$cuda" = "yes" ; then + output_sym "CONFIG_CUDA" +fi echo "LIBS+=$LIBS" >> $config_host_mak echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio new file mode 100644 index 00000000..1e24624d --- /dev/null +++ b/examples/gpudirect-rdmaio-client.fio @@ -0,0 +1,15 @@ +# Example gpudirect rdma client job +[global] +ioengine=rdma +hostname=[hostname] +port=[port] +verb=[read/write/send/recv] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[sender] +rw=write +iodepth=1 +iodepth_batch_complete=1 diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio new file mode 100644 index 00000000..5fc4950d --- /dev/null +++ b/examples/gpudirect-rdmaio-server.fio @@ -0,0 +1,12 @@ +# Example rdma server job +[global] +ioengine=rdma +port=[port] +mem=cudamalloc +gpu_dev_id=0 +bs=1m +size=100g + +[receiver] +rw=read +iodepth=16 diff --git a/fio.1 b/fio.1 index b59025dc..138bcbb9 100644 --- a/fio.1 +++ b/fio.1 @@ -1309,6 +1309,9 @@ Same as \fBmmap\fR, but use huge files as backing. .TP .B mmapshared Same as \fBmmap\fR, but use a MMAP_SHARED mapping. +.TP +.B cudamalloc +Use GPU memory as the buffers for GPUDirect RDMA benchmark. The ioengine must be \fBrdma\fR. 
.RE .P The amount of memory allocated is the maximum allowed \fBblocksize\fR for the diff --git a/fio.h b/fio.h index b67613e6..6b2b669d 100644 --- a/fio.h +++ b/fio.h @@ -59,6 +59,10 @@ #define MPOL_LOCAL MPOL_MAX #endif +#ifdef CONFIG_CUDA +#include <cuda.h> +#endif + /* * offset generator types */ @@ -408,6 +412,18 @@ struct thread_data { struct steadystate_data ss; char verror[FIO_VERROR_SIZE]; + +#ifdef CONFIG_CUDA + /* * for GPU memory management */ + int gpu_dev_cnt; + int gpu_dev_id; + CUdevice cu_dev; + CUcontext cu_ctx; + CUdeviceptr dev_mem_ptr; +#endif + }; /* diff --git a/io_u.c b/io_u.c index 88f35c91..39d68d1f 100644 --- a/io_u.c +++ b/io_u.c @@ -1654,6 +1654,10 @@ struct io_u *get_io_u(struct thread_data *td) populate_verify_io_u(td, io_u); do_scramble = 0; } +#ifdef CONFIG_CUDA + if (td->o.mem_type == MEM_CUDA_MALLOC) + do_scramble = 0; +#endif } else if (io_u->ddir == DDIR_READ) { /* * Reset the buf_filled parameters so next time if the @@ -1674,8 +1678,10 @@ out: if (!td_io_prep(td, io_u)) { if (!td->o.disable_lat) fio_gettime(&io_u->start_time, NULL); + if (do_scramble) small_content_scramble(io_u); + return io_u; } err_put: @@ -2043,6 +2049,10 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write, { struct thread_options *o = &td->o; +#ifdef CONFIG_CUDA + if (o->mem_type == MEM_CUDA_MALLOC) return; +#endif + if (o->compress_percentage || o->dedupe_percentage) { unsigned int perc = td->o.compress_percentage; struct frand_state *rs; diff --git a/memory.c b/memory.c index 9e73f100..fe657225 100644 --- a/memory.c +++ b/memory.c @@ -207,6 +207,75 @@ static void free_mem_malloc(struct thread_data *td) free(td->orig_buffer); } +#ifdef CONFIG_CUDA + +static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem) +{ + CUresult ret; + char name[128]; + + ret = cuInit(0); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed initialize cuda driver api\n"); + return 1; + } + + ret = cuDeviceGetCount(&td->gpu_dev_cnt); + if 
(ret != CUDA_SUCCESS) { + log_err("fio: failed get device count\n"); + return 1; + } + dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt); + + if (td->gpu_dev_cnt == 0) { + log_err("fio: no GPU device found. " + "Can not perform GPUDirect RDMA.\n"); + return 1; + } + + td->gpu_dev_id = td->o.gpu_dev_id; + ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get GPU device\n"); + return 1; + } + + ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed get device name\n"); + return 1; + } + dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \ + td->gpu_dev_id, name); + + ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev); + if (ret != CUDA_SUCCESS) { + log_err("fio: failed to create cuda context: %d\n", ret); + return 1; + } + + ret = cuMemAlloc(&td->dev_mem_ptr, total_mem); + if (ret != CUDA_SUCCESS) { + log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem); + return 1; + } + td->orig_buffer = (void *) td->dev_mem_ptr; + + dprint(FD_MEM, "cudaMalloc %llu %p\n", \ + (unsigned long long) total_mem, td->orig_buffer); + return 0; +} + +static void free_mem_cudamalloc(struct thread_data *td) +{ + if ((void *) td->dev_mem_ptr != NULL) + cuMemFree(td->dev_mem_ptr); + + if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS) + log_err("fio: failed to destroy cuda context\n"); +} +#endif + /* * Set up the buffer area we need for io. 
*/ @@ -246,6 +315,10 @@ int allocate_io_mem(struct thread_data *td) else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) ret = alloc_mem_mmap(td, total_mem); +#ifdef CONFIG_CUDA + else if (td->o.mem_type == MEM_CUDA_MALLOC) + ret = alloc_mem_cudamalloc(td, total_mem); +#endif else { log_err("fio: bad mem type: %d\n", td->o.mem_type); ret = 1; @@ -275,6 +348,10 @@ void free_io_mem(struct thread_data *td) else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAPSHARED) free_mem_mmap(td, total_mem); +#ifdef CONFIG_CUDA + else if (td->o.mem_type == MEM_CUDA_MALLOC) + free_mem_cudamalloc(td); +#endif else log_err("Bad memory type %u\n", td->o.mem_type); diff --git a/options.c b/options.c index e0deab0a..85574d7f 100644 --- a/options.c +++ b/options.c @@ -2603,6 +2603,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .oval = MEM_MMAPHUGE, .help = "Like mmap, but use huge pages", }, +#endif +#ifdef CONFIG_CUDA + { .ival = "cudamalloc", + .oval = MEM_CUDA_MALLOC, + .help = "Allocate GPU device memory for GPUDirect RDMA", + }, #endif }, }, @@ -3562,6 +3568,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = { .type = FIO_OPT_UNSUPPORTED, .help = "Build fio with libnuma-dev(el) to enable this option", }, +#endif +#ifdef CONFIG_CUDA + { + .name = "gpu_dev_id", + .lname = "GPU device ID", + .type = FIO_OPT_INT, + .off1 = offsetof(struct thread_options, gpu_dev_id), + .help = "Set GPU device ID for GPUDirect RDMA", + .def = "0", + .category = FIO_OPT_C_GENERAL, + .group = FIO_OPT_G_INVALID, + }, #endif { .name = "end_fsync", diff --git a/thread_options.h b/thread_options.h index 2b2df338..393e51ef 100644 --- a/thread_options.h +++ b/thread_options.h @@ -20,6 +20,9 @@ enum fio_memtype { MEM_MMAP, /* use anonynomous mmap */ MEM_MMAPHUGE, /* memory mapped huge file */ MEM_MMAPSHARED, /* use mmap with shared flag */ +#ifdef CONFIG_CUDA + MEM_CUDA_MALLOC,/* use GPU memory */ +#endif 
}; #define ERROR_STR_MAX 128 @@ -198,6 +201,8 @@ struct thread_options { unsigned short numa_mem_mode; unsigned int numa_mem_prefer_node; char *numa_memnodes; + unsigned int gpu_dev_id; + unsigned int iolog; unsigned int rwmixcycle; unsigned int rwmix[DDIR_RWDIR_CNT];