stat: make add lat percentile functions inline
[fio.git] / engines / libcufile.c
1 /*
2  * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
3  * mailto:info@systemfabricworks.com
4  *
5  * License: GPLv2, see COPYING.
6  *
7  * libcufile engine
8  *
9  * fio I/O engine using the NVIDIA cuFile API.
10  *
11  */
12
13 #include <stdlib.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <string.h>
17 #include <sys/time.h>
18 #include <sys/resource.h>
19 #include <cufile.h>
20 #include <cuda.h>
21 #include <cuda_runtime.h>
22 #include <pthread.h>
23
24 #include "../fio.h"
25 #include "../lib/pow2.h"
26 #include "../optgroup.h"
27 #include "../lib/memalign.h"
28
/* cuFile requires buffers, sizes and offsets to be 4 KiB aligned */
#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0)

/* bits for libcufile_options.logged: each alignment warning is printed once */
#define LOGGED_BUFLEN_NOT_ALIGNED     0x01
#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02
/* separator between entries in the gpu_dev_ids option value */
#define GPU_ID_SEP ":"

/* values for the cuda_io option (see options[] below) */
enum {
        IO_CUFILE    = 1,        /* nvidia-fs cuFileRead/cuFileWrite */
        IO_POSIX     = 2         /* plain pread/pwrite + cudaMemcpy */
};
39
/* per-(sub)job engine options and state */
struct libcufile_options {
        struct thread_data *td;
        char               *gpu_ids;       /* colon-separated list of GPU ids,
                                              one per job (gpu_dev_ids option) */
        void               *cu_mem_ptr;    /* GPU memory, cudaMalloc'd in
                                              iomem_alloc */
        void               *junk_buf;      /* buffer to simulate cudaMemcpy with
                                              posix I/O write */
        int                 my_gpu_id;     /* GPU id to use for this job */
        unsigned int        cuda_io;       /* Type of I/O to use with CUDA
                                              (IO_CUFILE or IO_POSIX) */
        size_t              total_mem;     /* size for cu_mem_ptr and junk_buf */
        int                 logged;        /* bitmask of log messages that have
                                              been output, prevent flood */
};
53
/* per-file engine data: cuFile registration of an open fd */
struct fio_libcufile_data {
        CUfileDescr_t  cf_descr;   /* descriptor passed to cuFileHandleRegister */
        CUfileHandle_t cf_handle;  /* handle used by cuFileRead/cuFileWrite */
};
58
/* engine option table: gpu_dev_ids and cuda_io */
static struct fio_option options[] = {
        {
                .name     = "gpu_dev_ids",
                .lname    = "libcufile engine gpu dev ids",
                .type     = FIO_OPT_STR_STORE,
                .off1     = offsetof(struct libcufile_options, gpu_ids),
                .help     = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
                .category = FIO_OPT_C_ENGINE,
                .group    = FIO_OPT_G_LIBCUFILE,
        },
        {
                .name     = "cuda_io",
                .lname    = "libcufile cuda io",
                .type     = FIO_OPT_STR,
                .off1     = offsetof(struct libcufile_options, cuda_io),
                .help     = "Type of I/O to use with CUDA",
                .def      = "cufile",
                .posval   = {
                            { .ival = "cufile",
                              .oval = IO_CUFILE,
                              .help = "libcufile nvidia-fs"
                            },
                            { .ival = "posix",
                              .oval = IO_POSIX,
                              .help = "POSIX I/O"
                            }
                },
                .category = FIO_OPT_C_ENGINE,
                .group    = FIO_OPT_G_LIBCUFILE,
        },
        {
                .name    = NULL,
        },
};
93
/* number of worker threads currently running; guarded by running_lock */
static int running = 0;
/* nonzero once cuFileDriverOpen() succeeded; guarded by running_lock */
static int cufile_initialized = 0;
static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER;
97
/*
 * Evaluate the CUDA runtime call "fn", log any failure with its error
 * name, and set the int lvalue "rc" to 0 on success or -1 on error.
 */
#define check_cudaruntimecall(fn, rc)                                               \
        do {                                                                        \
                cudaError_t res = fn;                                               \
                if (res != cudaSuccess) {                                           \
                        const char *str = cudaGetErrorName(res);                    \
                        log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
                                #fn, __LINE__, res, str);                           \
                        rc = -1;                                                    \
                } else                                                              \
                        rc = 0;                                                     \
        } while(0)
109
110 static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
111 {
112         if (IS_CUFILE_ERR(st.err))
113                 return cufileop_status_error(st.err);
114         return "unknown";
115 }
116
117 /*
118  * Assign GPU to subjob roundrobin, similar to how multiple
119  * entries in 'directory' are handled by fio.
120  */
121 static int fio_libcufile_find_gpu_id(struct thread_data *td)
122 {
123         struct libcufile_options *o = td->eo;
124         int gpu_id = 0;
125
126         if (o->gpu_ids != NULL) {
127                 char *gpu_ids, *pos, *cur;
128                 int i, id_count, gpu_idx;
129
130                 for (id_count = 0, cur = o->gpu_ids; cur != NULL; id_count++) {
131                         cur = strchr(cur, GPU_ID_SEP[0]);
132                         if (cur != NULL)
133                                 cur++;
134                 }
135
136                 gpu_idx = td->subjob_number % id_count;
137
138                 pos = gpu_ids = strdup(o->gpu_ids);
139                 if (gpu_ids == NULL) {
140                         log_err("strdup(gpu_ids): err=%d\n", errno);
141                         return -1;
142                 }
143
144                 i = 0;
145                 while (pos != NULL && i <= gpu_idx) {
146                         i++;
147                         cur = strsep(&pos, GPU_ID_SEP);
148                 }
149
150                 if (cur)
151                         gpu_id = atoi(cur);
152
153                 free(gpu_ids);
154         }
155
156         return gpu_id;
157 }
158
/*
 * Per-job initialization. The first worker thread opens the cuFile
 * driver (IO_CUFILE only); every job then picks its GPU roundrobin
 * and makes it the current CUDA device. Returns 0 on success, 1 on
 * failure.
 */
static int fio_libcufile_init(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;
	CUfileError_t status;
	int initialized;
	int rc;

	pthread_mutex_lock(&running_lock);
	if (running == 0) {
		assert(cufile_initialized == 0);
		if (o->cuda_io == IO_CUFILE) {
			/* only open the driver if this is the first worker thread */
			status = cuFileDriverOpen();
			if (status.err != CU_FILE_SUCCESS)
				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
					fio_libcufile_get_cuda_error(status));
			else
				cufile_initialized = 1;
		}
	}
	running++;
	/* snapshot the flag while still holding the lock */
	initialized = cufile_initialized;
	pthread_mutex_unlock(&running_lock);

	/* cuFile was requested but the driver failed to open */
	if (o->cuda_io == IO_CUFILE && !initialized)
		return 1;

	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
	if (o->my_gpu_id < 0)
		return 1;

	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
	if (rc != 0)
		return 1;

	return 0;
}
197
198 static inline int fio_libcufile_pre_write(struct thread_data *td,
199                                           struct libcufile_options *o,
200                                           struct io_u *io_u,
201                                           size_t gpu_offset)
202 {
203         int rc = 0;
204
205         if (o->cuda_io == IO_CUFILE) {
206                 if (td->o.verify) {
207                         /*
208                           Data is being verified, copy the io_u buffer to GPU memory.
209                           This isn't done in the non-verify case because the data would
210                           already be in GPU memory in a normal cuFile application.
211                         */
212                         check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
213                                                          io_u->xfer_buf,
214                                                          io_u->xfer_buflen,
215                                                          cudaMemcpyHostToDevice), rc);
216                         if (rc != 0) {
217                                 log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
218                                 io_u->error = EIO;
219                         }
220                 }
221         } else if (o->cuda_io == IO_POSIX) {
222
223                 /*
224                   POSIX I/O is being used, the data has to be copied out of the
225                   GPU into a CPU buffer. GPU memory doesn't contain the actual
226                   data to write, copy the data to the junk buffer. The purpose
227                   of this is to add the overhead of cudaMemcpy() that would be
228                   present in a POSIX I/O CUDA application.
229                 */
230                 check_cudaruntimecall(cudaMemcpy(o->junk_buf + gpu_offset,
231                                                  ((char*) o->cu_mem_ptr) + gpu_offset,
232                                                  io_u->xfer_buflen,
233                                                  cudaMemcpyDeviceToHost), rc);
234                 if (rc != 0) {
235                         log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
236                         io_u->error = EIO;
237                 }
238         } else {
239                 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
240                 assert(0);
241                 rc = EINVAL;
242         }
243
244         return rc;
245 }
246
247 static inline int fio_libcufile_post_read(struct thread_data *td,
248                                           struct libcufile_options *o,
249                                           struct io_u *io_u,
250                                           size_t gpu_offset)
251 {
252         int rc = 0;
253
254         if (o->cuda_io == IO_CUFILE) {
255                 if (td->o.verify) {
256                         /* Copy GPU memory to CPU buffer for verify */
257                         check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
258                                                          ((char*) o->cu_mem_ptr) + gpu_offset,
259                                                          io_u->xfer_buflen,
260                                                          cudaMemcpyDeviceToHost), rc);
261                         if (rc != 0) {
262                                 log_err("DDIR_READ cudaMemcpy D2H failed\n");
263                                 io_u->error = EIO;
264                         }
265                 }
266         } else if (o->cuda_io == IO_POSIX) {
267                 /* POSIX I/O read, copy the CPU buffer to GPU memory */
268                 check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
269                                                  io_u->xfer_buf,
270                                                  io_u->xfer_buflen,
271                                                  cudaMemcpyHostToDevice), rc);
272                 if (rc != 0) {
273                         log_err("DDIR_READ cudaMemcpy H2D failed\n");
274                         io_u->error = EIO;
275                 }
276         } else {
277                 log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
278                 assert(0);
279                 rc = EINVAL;
280         }
281
282         return rc;
283 }
284
/*
 * Synchronous queue entry point (engine sets FIO_SYNCIO): the whole I/O
 * completes here and FIO_Q_COMPLETED is always returned. Short reads and
 * writes are retried in a loop until the full xfer_buflen has moved.
 * Errors are reported through io_u->error / td_verror.
 */
static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
					     struct io_u *io_u)
{
	struct libcufile_options *o = td->eo;
	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
	unsigned long long io_offset;
	ssize_t sz;
	ssize_t remaining;
	size_t xfered;
	size_t gpu_offset;
	int rc;

	/* cuFile I/O needs the registration made in open_file */
	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
		io_u->error = EINVAL;
		td_verror(td, EINVAL, "xfer");
		return FIO_Q_COMPLETED;
	}

	fio_ro_check(td, io_u);

	switch(io_u->ddir) {
	case DDIR_SYNC:
		rc = fsync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fsync: err=%d\n", errno);
		}
		break;

	case DDIR_DATASYNC:
		rc = fdatasync(io_u->file->fd);
		if (rc != 0) {
			io_u->error = errno;
			log_err("fdatasync: err=%d\n", errno);
		}
		break;

	case DDIR_READ:
	case DDIR_WRITE:
		/*
		  There may be a better way to calculate gpu_offset. The intent is
		  that gpu_offset equals the the difference between io_u->xfer_buf and
		  the page-aligned base address for io_u buffers.
		*/
		gpu_offset = io_u->index * io_u->xfer_buflen;
		io_offset = io_u->offset;
		remaining = io_u->xfer_buflen;

		xfered = 0;
		sz = 0;

		/* GPU buffer was sized to total_mem in iomem_alloc */
		assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);

		if (o->cuda_io == IO_CUFILE) {
			/* warn (once per job) about alignments cuFile dislikes */
			if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
			      (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
				log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
				o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
			}

			if (!(ALIGNED_4KB(gpu_offset) ||
			      (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
				log_err("gpu_offset not 4KB-aligned: %lu\n", gpu_offset);
				o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
			}
		}

		/* stage the data for writing; failure shows up in io_u->error */
		if (io_u->ddir == DDIR_WRITE)
			rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);

		if (io_u->error != 0)
			break;

		/* transfer loop: advance xfered until remaining hits 0 */
		while (remaining > 0) {
			assert(gpu_offset + xfered <= o->total_mem);
			if (io_u->ddir == DDIR_READ) {
				if (o->cuda_io == IO_CUFILE) {
					/* -1 => errno-style error, other
					   negatives => cuFile status code */
					sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
							io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileRead: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileRead: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
						   remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pread: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else if (io_u->ddir == DDIR_WRITE) {
				if (o->cuda_io == IO_CUFILE) {
					sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
							 io_offset + xfered, gpu_offset + xfered);
					if (sz == -1) {
						io_u->error = errno;
						log_err("cuFileWrite: err=%d\n", errno);
					} else if (sz < 0) {
						io_u->error = EIO;
						log_err("cuFileWrite: err=%ld:%s\n", sz,
							cufileop_status_error(-sz));
					}
				} else if (o->cuda_io == IO_POSIX) {
					sz = pwrite(io_u->file->fd,
						    ((char*) io_u->xfer_buf) + xfered,
						    remaining, io_offset + xfered);
					if (sz < 0) {
						io_u->error = errno;
						log_err("pwrite: err=%d\n", errno);
					}
				} else {
					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
					io_u->error = -1;
					assert(0);
				}
			} else {
				log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
				io_u->error = -1;
				assert(0);
				break;
			}

			if (io_u->error != 0)
				break;

			remaining -= sz;
			xfered += sz;

			if (remaining != 0)
				log_info("Incomplete %s: %ld bytes remaining\n",
					 io_u->ddir == DDIR_READ? "read" : "write", remaining);
		}

		if (io_u->error != 0)
			break;

		/* move the read data where verify / the GPU expects it */
		if (io_u->ddir == DDIR_READ)
			rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
		break;

	default:
		io_u->error = EINVAL;
		break;
	}

	if (io_u->error != 0) {
		log_err("IO failed\n");
		td_verror(td, io_u->error, "xfer");
	}

	return FIO_Q_COMPLETED;
}
446
447 static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
448 {
449         struct libcufile_options *o = td->eo;
450         struct fio_libcufile_data *fcd = NULL;
451         int rc;
452         CUfileError_t status;
453
454         rc = generic_open_file(td, f);
455         if (rc)
456                 return rc;
457
458         if (o->cuda_io == IO_CUFILE) {
459                 fcd = calloc(1, sizeof(*fcd));
460                 if (fcd == NULL) {
461                         rc = ENOMEM;
462                         goto exit_err;
463                 }
464
465                 fcd->cf_descr.handle.fd = f->fd;
466                 fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
467                 status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
468                 if (status.err != CU_FILE_SUCCESS) {
469                         log_err("cufile register: err=%d:%s\n", status.err,
470                                 fio_libcufile_get_cuda_error(status));
471                         rc = EINVAL;
472                         goto exit_err;
473                 }
474         }
475
476         FILE_SET_ENG_DATA(f, fcd);
477         return 0;
478
479 exit_err:
480         if (fcd) {
481                 free(fcd);
482                 fcd = NULL;
483         }
484         if (f) {
485                 int rc2 = generic_close_file(td, f);
486                 if (rc2)
487                         log_err("generic_close_file: err=%d\n", rc2);
488         }
489         return rc;
490 }
491
492 static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
493 {
494         struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
495         int rc;
496
497         if (fcd != NULL) {
498                 cuFileHandleDeregister(fcd->cf_handle);
499                 FILE_SET_ENG_DATA(f, NULL);
500                 free(fcd);
501         }
502
503         rc = generic_close_file(td, f);
504
505         return rc;
506 }
507
/*
 * Allocate the job's I/O memory: the host orig_buffer fio hands out as
 * io_u buffers, a mirror-sized GPU buffer, and for POSIX I/O a host
 * junk buffer (see pre_write). For cuFile I/O the GPU buffer is also
 * registered with cuFileBufRegister(). Returns 0 on success, 1 on
 * failure with everything allocated so far released again.
 */
static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
{
	struct libcufile_options *o = td->eo;
	int rc;
	CUfileError_t status;

	o->total_mem = total_mem;
	o->logged = 0;
	o->cu_mem_ptr = NULL;
	o->junk_buf = NULL;
	td->orig_buffer = calloc(1, total_mem);
	if (!td->orig_buffer) {
		log_err("orig_buffer calloc failed: err=%d\n", errno);
		goto exit_error;
	}

	if (o->cuda_io == IO_POSIX) {
		o->junk_buf = calloc(1, total_mem);
		if (o->junk_buf == NULL) {
			log_err("junk_buf calloc failed: err=%d\n", errno);
			goto exit_error;
		}
	}

	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
	if (rc != 0)
		goto exit_error;
	/* fill with a known pattern */
	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
	if (rc != 0)
		goto exit_error;

	if (o->cuda_io == IO_CUFILE) {
		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
		if (status.err != CU_FILE_SUCCESS) {
			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
				fio_libcufile_get_cuda_error(status));
			goto exit_error;
		}
	}

	return 0;

exit_error:
	/* unwind partial allocations; pointers are NULL until assigned */
	if (td->orig_buffer) {
		free(td->orig_buffer);
		td->orig_buffer = NULL;
	}
	if (o->junk_buf) {
		free(o->junk_buf);
		o->junk_buf = NULL;
	}
	if (o->cu_mem_ptr) {
		cudaFree(o->cu_mem_ptr);
		o->cu_mem_ptr = NULL;
	}
	return 1;
}
566
567 static void fio_libcufile_iomem_free(struct thread_data *td)
568 {
569         struct libcufile_options *o = td->eo;
570
571         if (o->junk_buf) {
572                 free(o->junk_buf);
573                 o->junk_buf = NULL;
574         }
575         if (o->cu_mem_ptr) {
576                 if (o->cuda_io == IO_CUFILE)
577                         cuFileBufDeregister(o->cu_mem_ptr);
578                 cudaFree(o->cu_mem_ptr);
579                 o->cu_mem_ptr = NULL;
580         }
581         if (td->orig_buffer) {
582                 free(td->orig_buffer);
583                 td->orig_buffer = NULL;
584         }
585 }
586
/*
 * Per-job teardown, mirror of fio_libcufile_init(): the last worker
 * thread to exit closes the cuFile driver if it was opened.
 */
static void fio_libcufile_cleanup(struct thread_data *td)
{
	struct libcufile_options *o = td->eo;

	pthread_mutex_lock(&running_lock);
	running--;
	assert(running >= 0);
	if (running == 0) {
		/* only close the driver if initialized and
		   this is the last worker thread */
		if (o->cuda_io == IO_CUFILE && cufile_initialized)
			cuFileDriverClose();
		cufile_initialized = 0;
	}
	pthread_mutex_unlock(&running_lock);
}
603
/* engine ops: FIO_SYNCIO because queue() completes each I/O inline */
FIO_STATIC struct ioengine_ops ioengine = {
	.name                = "libcufile",
	.version             = FIO_IOOPS_VERSION,
	.init                = fio_libcufile_init,
	.queue               = fio_libcufile_queue,
	.open_file           = fio_libcufile_open_file,
	.close_file          = fio_libcufile_close_file,
	.iomem_alloc         = fio_libcufile_iomem_alloc,
	.iomem_free          = fio_libcufile_iomem_free,
	.cleanup             = fio_libcufile_cleanup,
	.flags               = FIO_SYNCIO,
	.options             = options,
	.option_struct_size  = sizeof(struct libcufile_options)
};
618
/* constructor: register the engine with fio at load time */
void fio_init fio_libcufile_register(void)
{
	register_ioengine(&ioengine);
}

/* destructor: unregister at unload */
void fio_exit fio_libcufile_unregister(void)
{
	unregister_ioengine(&ioengine);
}