2 * Copyright (c) 2020 System Fabric Works, Inc. All Rights Reserved.
3 * mailto:info@systemfabricworks.com
5 * License: GPLv2, see COPYING.
9 * fio I/O engine using the NVIDIA cuFile API.
18 #include <sys/resource.h>
21 #include <cuda_runtime.h>
25 #include "../lib/pow2.h"
26 #include "../optgroup.h"
27 #include "../lib/memalign.h"
/* True when v is a multiple of 4 KB (low 12 bits clear); used to warn
   about buffers/offsets that are not aligned for cuFile I/O. */
29 #define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)
/* Bits for libcufile_options.logged: each alignment warning is emitted
   at most once per job to avoid flooding the log. */
31 #define LOGGED_BUFLEN_NOT_ALIGNED 0x01
32 #define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
/* Separator between entries in the gpu_dev_ids option string. */
33 #define GPU_ID_SEP ":"
/*
 * Per-job engine options and state, reached via td->eo.  Holds the
 * values parsed from the job file plus the GPU/CPU buffers this job
 * allocates in iomem_alloc().
 */
40 struct libcufile_options {
41 struct thread_data *td;
42 char *gpu_ids; /* colon-separated list of GPU ids,
44 void *cu_mem_ptr; /* GPU memory */
45 void *junk_buf; /* buffer to simulate cudaMemcpy with
47 int my_gpu_id; /* GPU id to use for this job */
48 unsigned int cuda_io; /* Type of I/O to use with CUDA */
49 size_t total_mem; /* size for cu_mem_ptr and junk_buf */
50 int logged; /* bitmask of log messages that have
51 been output, prevent flood */
/*
 * Per-file engine data: the cuFile descriptor and the handle returned
 * by cuFileHandleRegister().  Attached to the fio_file with
 * FILE_SET_ENG_DATA() in open_file and released in close_file.
 */
54 struct fio_libcufile_data {
55 CUfileDescr_t cf_descr;
56 CUfileHandle_t cf_handle;
/*
 * Engine-specific job options.
 * gpu_dev_ids: colon-separated GPU list, assigned to subjobs
 * round-robin (see fio_libcufile_find_gpu_id()).
 * cuda_io: selects cuFile vs. POSIX transfers (checked as
 * o->cuda_io == IO_CUFILE / IO_POSIX throughout this file).
 */
59 static struct fio_option options[] = {
61 .name = "gpu_dev_ids",
62 .lname = "libcufile engine gpu dev ids",
63 .type = FIO_OPT_STR_STORE,
64 .off1 = offsetof(struct libcufile_options, gpu_ids),
65 .help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
66 .category = FIO_OPT_C_ENGINE,
67 .group = FIO_OPT_G_LIBCUFILE,
71 .lname = "libcufile cuda io",
73 .off1 = offsetof(struct libcufile_options, cuda_io),
74 .help = "Type of I/O to use with CUDA",
79 .help = "libcufile nvidia-fs"
86 .category = FIO_OPT_C_ENGINE,
87 .group = FIO_OPT_G_LIBCUFILE,
/*
 * Process-wide state shared by all worker threads, protected by
 * running_lock.  `running` appears to count live workers (asserted
 * >= 0 in cleanup; increment/decrement sites are not all visible in
 * this excerpt — confirm against the full file).  `cufile_initialized`
 * records whether cuFileDriverOpen() has been done, so the driver is
 * opened by the first worker and closed by the last.
 */
94 static int running = 0;
95 static int cufile_initialized = 0;
96 static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
/*
 * Evaluate the CUDA runtime call `fn`; on failure log the call text,
 * source line, error code and error name.  The outcome is reported
 * through `rc` (the assignment lies on lines not visible in this
 * excerpt).  No comments may be placed inside the body because of the
 * line continuations.
 */
98 #define check_cudaruntimecall(fn, rc) \
100 cudaError_t res = fn; \
101 if (res != cudaSuccess) { \
102 const char *str = cudaGetErrorName(res); \
103 log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
104 #fn, __LINE__, res, str); \
/*
 * Map a CUfileError_t to a printable string for log messages.  Only
 * the cuFile-specific branch is visible here; the fallback for
 * non-cuFile (CUDA driver) errors is on lines outside this excerpt.
 */
110 static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
112 if (IS_CUFILE_ERR(st.err))
113 return cufileop_status_error(st.err);
118 * Assign GPU to subjob roundrobin, similar to how multiple
119 * entries in 'directory' are handled by fio.
/*
 * Returns the GPU id this subjob should use (negative on error,
 * judging by the o->my_gpu_id < 0 check at the call site).
 */
121 static int fio_libcufile_find_gpu_id(struct thread_data *td)
123 struct libcufile_options *o = td->eo;
126 if (o->gpu_ids != NULL) {
127 char *gpu_ids, *pos, *cur;
128 int i, id_count, gpu_idx;
/* Count entries by scanning for the ':' separator. */
130 for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
131 cur = strchr(cur, GPU_ID_SEP[0]);
/* Round-robin: subjob N takes entry N modulo the list length. */
136 gpu_idx = td->subjob_number % id_count;
/* strsep() modifies its argument, so work on a private copy. */
138 pos = gpu_ids = strdup(o->gpu_ids);
139 if (gpu_ids == NULL) {
140 log_err("strdup(gpu_ids): err=%d\n", errno);
/* Walk the copy until the gpu_idx'th token is reached. */
145 while (pos != NULL && i <= gpu_idx) {
147 cur = strsep(&pos, GPU_ID_SEP);
/*
 * Per-thread engine init: opens the cuFile driver exactly once across
 * all workers (guarded by running_lock), picks this subjob's GPU and
 * makes it the current CUDA device.
 */
159 static int fio_libcufile_init(struct thread_data *td)
161 struct libcufile_options *o = td->eo;
162 CUfileError_t status;
166 pthread_mutex_lock(&running_lock);
168 assert(cufile_initialized == 0);
169 if (o->cuda_io == IO_CUFILE) {
170 /* only open the driver if this is the first worker thread */
171 status = cuFileDriverOpen();
172 if (status.err != CU_FILE_SUCCESS)
173 log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
174 fio_libcufile_get_cuda_error(status));
/* Success path: record that the driver is open for later workers. */
176 cufile_initialized = 1;
180 initialized = cufile_initialized;
181 pthread_mutex_unlock(&running_lock);
/* cuFile mode cannot proceed without an open driver. */
183 if (o->cuda_io == IO_CUFILE && !initialized)
186 o->my_gpu_id = fio_libcufile_find_gpu_id(td);
187 if (o->my_gpu_id < 0)
190 dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
/* Bind all subsequent CUDA work in this thread to the chosen GPU. */
191 check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
/*
 * Pre-write hook run before the transfer loop in queue().
 * IO_CUFILE: stage the io_u buffer into GPU memory (verify runs only —
 * see the comment below).  IO_POSIX: perform a device-to-host copy into
 * junk_buf purely to charge the cudaMemcpy cost a real POSIX+CUDA
 * application would pay.  Returns nonzero on failure, judging by the
 * log_err branches (the return statements are outside this excerpt).
 */
198 static inline int fio_libcufile_pre_write(struct thread_data *td,
199 struct libcufile_options *o,
205 if (o->cuda_io == IO_CUFILE) {
208 Data is being verified, copy the io_u buffer to GPU memory.
209 This isn't done in the non-verify case because the data would
210 already be in GPU memory in a normal cuFile application.
212 check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
215 cudaMemcpyHostToDevice), rc);
217 log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
221 } else if (o->cuda_io == IO_POSIX) {
224 POSIX I/O is being used, the data has to be copied out of the
225 GPU into a CPU buffer. GPU memory doesn't contain the actual
226 data to write, copy the data to the junk buffer. The purpose
227 of this is to add the overhead of cudaMemcpy() that would be
228 present in a POSIX I/O CUDA application.
230 check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
231 ((char*) o->cu_mem_ptr) + gpu_offset,
233 cudaMemcpyDeviceToHost), rc);
235 log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
239 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
/*
 * Post-read hook, the mirror of pre_write: after a cuFile read, copy
 * GPU memory back to the CPU buffer so fio can verify it; after a
 * POSIX read, copy the CPU buffer up to GPU memory to model the
 * cudaMemcpy a real application would do.
 */
247 static inline int fio_libcufile_post_read(struct thread_data *td,
248 struct libcufile_options *o,
254 if (o->cuda_io == IO_CUFILE) {
256 /* Copy GPU memory to CPU buffer for verify */
257 check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
258 ((char*) o->cu_mem_ptr) + gpu_offset,
260 cudaMemcpyDeviceToHost), rc);
262 log_err("DDIR_READ cudaMemcpy D2H failed\n");
266 } else if (o->cuda_io == IO_POSIX) {
267 /* POSIX I/O read, copy the CPU buffer to GPU memory */
268 check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
271 cudaMemcpyHostToDevice), rc);
273 log_err("DDIR_READ cudaMemcpy H2D failed\n");
277 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
/*
 * Main I/O entry point.  Handles sync ddirs via fsync/fdatasync, then
 * performs the read/write either through cuFileRead/cuFileWrite (GPU
 * memory) or pread/pwrite (CPU buffer), retrying until the request is
 * fully transferred.  Always returns FIO_Q_COMPLETED (synchronous
 * engine); errors are reported through io_u->error.
 */
285 static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
288 struct libcufile_options *o = td->eo;
289 struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
290 unsigned long long io_offset;
/* cuFile mode requires a registered file handle from open_file(). */
297 if (o->cuda_io == IO_CUFILE && fcd == NULL) {
298 io_u->error = EINVAL;
299 td_verror(td, EINVAL, "xfer");
300 return FIO_Q_COMPLETED;
303 fio_ro_check(td, io_u);
/* Sync ddirs: plain fsync/fdatasync on the underlying fd. */
307 rc = fsync(io_u->file->fd);
310 log_err("fsync: err=%d\n", errno);
315 rc = fdatasync(io_u->file->fd);
318 log_err("fdatasync: err=%d\n", errno);
325 There may be a better way to calculate gpu_offset. The intent is
326 that gpu_offset equals the the difference between io_u->xfer_buf and
327 the page-aligned base address for io_u buffers.
329 gpu_offset = io_u->index * io_u->xfer_buflen;
330 io_offset = io_u->offset;
331 remaining = io_u->xfer_buflen;
336 assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
/* Warn once per job about misaligned lengths/offsets; cuFile prefers
   4 KB alignment but the I/O is still attempted. */
338 if (o->cuda_io == IO_CUFILE) {
339 if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
340 (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
341 log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
342 o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
345 if (!(ALIGNED_4KB(gpu_offset) ||
346 (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
347 log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
348 o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
/* Stage write data (or emulate the copy cost) before transferring. */
352 if (io_u->ddir == DDIR_WRITE)
353 rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
355 if (io_u->error != 0)
/* Transfer loop: short reads/writes advance `xfered` and retry. */
358 while (remaining > 0) {
359 assert(gpu_offset + xfered <= o->total_mem);
360 if (io_u->ddir == DDIR_READ) {
361 if (o->cuda_io == IO_CUFILE) {
362 sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
363 io_offset + xfered, gpu_offset + xfered);
/* sz == -1 with errno set is a POSIX-style failure; other negative
   values are cuFile status codes (printed via -sz). */
366 log_err("cuFileRead: err=%d\n", errno);
369 log_err("cuFileRead: err=%ld:%s\n", sz,
370 cufileop_status_error(-sz));
372 } else if (o->cuda_io == IO_POSIX) {
373 sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
374 remaining, io_offset + xfered);
377 log_err("pread: err=%d\n", errno);
380 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
384 } else if (io_u->ddir == DDIR_WRITE) {
385 if (o->cuda_io == IO_CUFILE) {
386 sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
387 io_offset + xfered, gpu_offset + xfered);
390 log_err("cuFileWrite: err=%d\n", errno);
393 log_err("cuFileWrite: err=%ld:%s\n", sz,
394 cufileop_status_error(-sz));
396 } else if (o->cuda_io == IO_POSIX) {
397 sz = pwrite(io_u->file->fd,
398 ((char*) io_u->xfer_buf) + xfered,
399 remaining, io_offset + xfered);
402 log_err("pwrite: err=%d\n", errno);
405 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
410 log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
416 if (io_u->error != 0)
/* Loop exited with bytes left over (e.g. EOF on read). */
423 log_info("Incomplete %s: %ld bytes remaining\n",
424 io_u->ddir == DDIR_READ? "read" : "write", remaining);
427 if (io_u->error != 0)
/* Copy read data back for verify / emulate the H2D copy. */
430 if (io_u->ddir == DDIR_READ)
431 rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
435 io_u->error = EINVAL;
439 if (io_u->error != 0) {
440 log_err("IO failed\n");
441 td_verror(td, io_u->error, "xfer");
444 return FIO_Q_COMPLETED;
/*
 * Open hook: does the generic open, then (cuFile mode only) registers
 * the fd with cuFileHandleRegister and stashes the handle in the
 * fio_file's engine data.  On registration failure the file is closed
 * again via generic_close_file (error-path lines partly not visible
 * in this excerpt).
 */
447 static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
449 struct libcufile_options *o = td->eo;
450 struct fio_libcufile_data *fcd = NULL;
452 CUfileError_t status;
454 rc = generic_open_file(td, f);
458 if (o->cuda_io == IO_CUFILE) {
459 fcd = calloc(1, sizeof(*fcd));
/* Register the OS fd with the cuFile driver. */
465 fcd->cf_descr.handle.fd = f->fd;
466 fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
467 status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
468 if (status.err != CU_FILE_SUCCESS) {
469 log_err("cufile register: err=%d:%s\n", status.err,
470 fio_libcufile_get_cuda_error(status));
/* fcd may be NULL in POSIX mode; queue() checks for that. */
476 FILE_SET_ENG_DATA(f, fcd);
/* Error path: undo the generic open, preserving the original rc. */
485 int rc2 = generic_close_file(td, f);
487 log_err("generic_close_file: err=%d\n", rc2);
/*
 * Close hook: deregister the cuFile handle (if one was attached in
 * open_file), clear the engine data, then do the generic close.
 */
492 static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
494 struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
498 cuFileHandleDeregister(fcd->cf_handle);
499 FILE_SET_ENG_DATA(f, NULL);
503 rc = generic_close_file(td, f);
/*
 * Allocate the job's I/O memory: a host orig_buffer for fio, a GPU
 * buffer (cudaMalloc) of the same size, plus — POSIX mode only — the
 * junk_buf used to emulate cudaMemcpy overhead.  In cuFile mode the
 * GPU buffer is also registered with cuFileBufRegister.  The tail of
 * the function (not fully visible here) unwinds the allocations on
 * failure.
 */
508 static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
510 struct libcufile_options *o = td->eo;
512 CUfileError_t status;
514 o->total_mem = total_mem;
516 o->cu_mem_ptr = NULL;
518 td->orig_buffer = calloc(1, total_mem);
519 if (!td->orig_buffer) {
520 log_err("orig_buffer calloc failed: err=%d\n", errno);
524 if (o->cuda_io == IO_POSIX) {
525 o->junk_buf = calloc(1, total_mem);
526 if (o->junk_buf == NULL) {
527 log_err("junk_buf calloc failed: err=%d\n", errno);
532 dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
533 check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
/* Fill GPU memory with a known pattern (0xab). */
536 check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
540 if (o->cuda_io == IO_CUFILE) {
541 status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
542 if (status.err != CU_FILE_SUCCESS) {
543 log_err("cuFileBufRegister: err=%d:%s\n", status.err,
544 fio_libcufile_get_cuda_error(status));
/* Error unwinding: release whatever was allocated above. */
552 if (td->orig_buffer) {
553 free(td->orig_buffer);
554 td->orig_buffer = NULL;
561 cudaFree(o->cu_mem_ptr);
562 o->cu_mem_ptr = NULL;
/*
 * Release everything iomem_alloc() created: deregister the GPU buffer
 * from cuFile (cuFile mode), free the GPU memory, and free the host
 * orig_buffer.  (junk_buf release is presumably on lines not visible
 * in this excerpt — confirm against the full file.)
 */
567 static void fio_libcufile_iomem_free(struct thread_data *td)
569 struct libcufile_options *o = td->eo;
576 if (o->cuda_io == IO_CUFILE)
577 cuFileBufDeregister(o->cu_mem_ptr);
578 cudaFree(o->cu_mem_ptr);
579 o->cu_mem_ptr = NULL;
581 if (td->orig_buffer) {
582 free(td->orig_buffer);
583 td->orig_buffer = NULL;
/*
 * Per-thread teardown: under running_lock, the last cuFile worker
 * closes the driver (the cuFileDriverClose() call itself is on a line
 * not visible in this excerpt) and clears cufile_initialized.
 */
587 static void fio_libcufile_cleanup(struct thread_data *td)
589 struct libcufile_options *o = td->eo;
591 pthread_mutex_lock(&running_lock);
593 assert(running >= 0);
595 /* only close the driver if initialized and
596 this is the last worker thread */
597 if (o->cuda_io == IO_CUFILE && cufile_initialized)
599 cufile_initialized = 0;
601 pthread_mutex_unlock(&running_lock);
/*
 * Engine ops table.  queue() always returns FIO_Q_COMPLETED, so this
 * is a synchronous engine; file size and open/close fall back to the
 * generic helpers where no cuFile-specific work is needed.
 */
604 FIO_STATIC struct ioengine_ops ioengine = {
606 .version = FIO_IOOPS_VERSION,
607 .init = fio_libcufile_init,
608 .queue = fio_libcufile_queue,
609 .get_file_size = generic_get_file_size,
610 .open_file = fio_libcufile_open_file,
611 .close_file = fio_libcufile_close_file,
612 .iomem_alloc = fio_libcufile_iomem_alloc,
613 .iomem_free = fio_libcufile_iomem_free,
614 .cleanup = fio_libcufile_cleanup,
617 .option_struct_size = sizeof(struct libcufile_options)
/* Constructor: register the engine with fio at load time. */
620 void fio_init fio_libcufile_register(void)
622 register_ioengine(&ioengine);
/* Destructor: unregister the engine when the module is unloaded. */
625 void fio_exit fio_libcufile_unregister(void)
627 unregister_ioengine(&ioengine);