**mmapshared**
Same as mmap, but use a MMAP_SHARED mapping.
+ **cudamalloc**
+	Use GPU memory as the buffers for the GPUDirect RDMA benchmark. The ioengine must be **rdma**.
+
The area allocated is a function of the maximum allowed bs size for the job,
multiplied by the I/O depth given. Note that for **shmhuge** and
**mmaphuge** to work, the system must have free huge pages allocated. This
;;
--disable-pmem) disable_pmem="yes"
;;
+ --enable-cuda) enable_cuda="yes"
+ ;;
--help)
show_help="yes"
;;
echo "--esx Configure build options for esx"
echo "--enable-gfio Enable building of gtk gfio"
echo "--disable-numa Disable libnuma even if found"
- echo "--disable-rdma Disable RDMA support even if found"
+ echo "--disable-rdma Disable RDMA support even if found"
echo "--disable-gfapi Disable gfapi"
echo "--enable-libhdfs Enable hdfs support"
echo "--disable-lex Disable use of lex/yacc for math"
echo "--enable-lex Enable use of lex/yacc for math"
echo "--disable-shm Disable SHM support"
echo "--disable-optimizations Don't enable compiler optimizations"
+ echo "--enable-cuda Enable GPUDirect RDMA support"
exit $exit_val
fi
fi
echo "march_armv8_a_crc_crypto $march_armv8_a_crc_crypto"
+##########################################
+# cuda probe
+cuda="no"
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+  return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+echo "cuda $cuda"
#############################################################################
if test "$disable_opt" = "yes" ; then
output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
fi
-
if test "$zlib" = "no" ; then
echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
fi
+if test "$cuda" = "yes" ; then
+ output_sym "CONFIG_CUDA"
+fi
echo "LIBS+=$LIBS" >> $config_host_mak
echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
--- /dev/null
+# Example GPUDirect RDMA client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
--- /dev/null
+# Example GPUDirect RDMA server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
.TP
.B mmapshared
Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmark. The ioengine must be \fBrdma\fR.
.RE
.P
The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
#define MPOL_LOCAL MPOL_MAX
#endif
+#ifdef CONFIG_CUDA
+#include <cuda.h>
+#endif
+
/*
* offset generator types
*/
struct steadystate_data ss;
char verror[FIO_VERROR_SIZE];
+
+#ifdef CONFIG_CUDA
+ /*
+ * for GPU memory management
+ */
+ int gpu_dev_cnt;
+ int gpu_dev_id;
+ CUdevice cu_dev;
+ CUcontext cu_ctx;
+ CUdeviceptr dev_mem_ptr;
+#endif
+
};
/*
populate_verify_io_u(td, io_u);
do_scramble = 0;
}
+#ifdef CONFIG_CUDA
+ if (td->o.mem_type == MEM_CUDA_MALLOC)
+ do_scramble = 0;
+#endif
} else if (io_u->ddir == DDIR_READ) {
/*
* Reset the buf_filled parameters so next time if the
if (!td_io_prep(td, io_u)) {
if (!td->o.disable_lat)
fio_gettime(&io_u->start_time, NULL);
+
if (do_scramble)
small_content_scramble(io_u);
+
return io_u;
}
err_put:
{
struct thread_options *o = &td->o;
+#ifdef CONFIG_CUDA
+ if (o->mem_type == MEM_CUDA_MALLOC) return;
+#endif
+
if (o->compress_percentage || o->dedupe_percentage) {
unsigned int perc = td->o.compress_percentage;
struct frand_state *rs;
free(td->orig_buffer);
}
+#ifdef CONFIG_CUDA
+
+/*
+ * Allocate the I/O buffer area in GPU device memory through the CUDA
+ * driver API, so the rdma ioengine can perform GPUDirect RDMA against
+ * it. On success td->orig_buffer points at the device allocation and
+ * 0 is returned; on any failure 1 is returned and nothing is leaked.
+ */
+static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem)
+{
+	CUresult ret;
+	char name[128];
+
+	ret = cuInit(0);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to initialize cuda driver api\n");
+		return 1;
+	}
+
+	ret = cuDeviceGetCount(&td->gpu_dev_cnt);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to get device count\n");
+		return 1;
+	}
+	dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt);
+
+	if (td->gpu_dev_cnt == 0) {
+		log_err("fio: no GPU device found. "
+			"Can not perform GPUDirect RDMA.\n");
+		return 1;
+	}
+
+	td->gpu_dev_id = td->o.gpu_dev_id;
+	ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to get GPU device\n");
+		return 1;
+	}
+
+	/* cuDeviceGetName() takes the CUdevice handle, not the ordinal */
+	ret = cuDeviceGetName(name, sizeof(name), td->cu_dev);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to get device name\n");
+		return 1;
+	}
+	dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n",
+	       td->gpu_dev_id, name);
+
+	ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to create cuda context: %d\n", ret);
+		return 1;
+	}
+
+	ret = cuMemAlloc(&td->dev_mem_ptr, total_mem);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem);
+		/* don't leak the context created above */
+		cuCtxDestroy(td->cu_ctx);
+		return 1;
+	}
+	td->orig_buffer = (void *) td->dev_mem_ptr;
+
+	dprint(FD_MEM, "cudaMalloc %llu %p\n",
+	       (unsigned long long) total_mem, td->orig_buffer);
+	return 0;
+}
+
+/*
+ * Release the GPU buffer and CUDA context set up by
+ * alloc_mem_cudamalloc().
+ */
+static void free_mem_cudamalloc(struct thread_data *td)
+{
+	/* CUdeviceptr is an integer handle; no pointer cast needed */
+	if (td->dev_mem_ptr) {
+		cuMemFree(td->dev_mem_ptr);
+		/* clear the handle so a double free is harmless */
+		td->dev_mem_ptr = 0;
+	}
+
+	if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS)
+		log_err("fio: failed to destroy cuda context\n");
+}
+#endif
+
/*
* Set up the buffer area we need for io.
*/
else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
td->o.mem_type == MEM_MMAPSHARED)
ret = alloc_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+ else if (td->o.mem_type == MEM_CUDA_MALLOC)
+ ret = alloc_mem_cudamalloc(td, total_mem);
+#endif
else {
log_err("fio: bad mem type: %d\n", td->o.mem_type);
ret = 1;
else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
td->o.mem_type == MEM_MMAPSHARED)
free_mem_mmap(td, total_mem);
+#ifdef CONFIG_CUDA
+ else if (td->o.mem_type == MEM_CUDA_MALLOC)
+ free_mem_cudamalloc(td);
+#endif
else
log_err("Bad memory type %u\n", td->o.mem_type);
.oval = MEM_MMAPHUGE,
.help = "Like mmap, but use huge pages",
},
+#endif
+#ifdef CONFIG_CUDA
+ { .ival = "cudamalloc",
+ .oval = MEM_CUDA_MALLOC,
+ .help = "Allocate GPU device memory for GPUDirect RDMA",
+ },
#endif
},
},
.type = FIO_OPT_UNSUPPORTED,
.help = "Build fio with libnuma-dev(el) to enable this option",
},
+#endif
+#ifdef CONFIG_CUDA
+ {
+ .name = "gpu_dev_id",
+ .lname = "GPU device ID",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, gpu_dev_id),
+ .help = "Set GPU device ID for GPUDirect RDMA",
+ .def = "0",
+ .category = FIO_OPT_C_GENERAL,
+ .group = FIO_OPT_G_INVALID,
+ },
#endif
{
.name = "end_fsync",
MEM_MMAP, /* use anonynomous mmap */
MEM_MMAPHUGE, /* memory mapped huge file */
MEM_MMAPSHARED, /* use mmap with shared flag */
+#ifdef CONFIG_CUDA
+ MEM_CUDA_MALLOC,/* use GPU memory */
+#endif
};
#define ERROR_STR_MAX 128
unsigned short numa_mem_mode;
unsigned int numa_mem_prefer_node;
char *numa_memnodes;
+ unsigned int gpu_dev_id;
+
unsigned int iolog;
unsigned int rwmixcycle;
unsigned int rwmix[DDIR_RWDIR_CNT];