Merge branch 'wip-traceinfo' of https://github.com/vears91/fio
[fio.git] / engines / rbd.c
CommitLineData
fc5c0345
DG
1/*
2 * rbd engine
3 *
4 * IO engine using Ceph's librbd to test RADOS Block Devices.
5 *
6 */
7
8#include <rbd/librbd.h>
9
10#include "../fio.h"
d220c761 11#include "../optgroup.h"
a4c4c346 12#ifdef CONFIG_RBD_BLKIN
13#include <zipkin_c.h>
14#endif
fc5c0345
DG
15
16struct fio_rbd_iou {
17 struct io_u *io_u;
d8b64af2 18 rbd_completion_t completion;
d8b64af2 19 int io_seen;
b8ecbef6 20 int io_complete;
a4c4c346 21 struct blkin_trace_info info;
fc5c0345
DG
22};
23
24struct rbd_data {
25 rados_t cluster;
26 rados_ioctx_t io_ctx;
27 rbd_image_t image;
28 struct io_u **aio_events;
6f9961ac 29 struct io_u **sort_events;
fc5c0345
DG
30};
31
/*
 * Engine specific job options. The option table below addresses these
 * members through offsetof().
 */
struct rbd_options {
	/* NOTE(review): looks like padding so no option sits at offset 0 - confirm */
	void *pad;
	char *cluster_name;	/* "clustername" */
	char *rbd_name;		/* "rbdname" */
	char *pool_name;	/* "pool" */
	char *client_name;	/* "clientname" */
	int busy_poll;		/* "busy_poll" */
};
40
41static struct fio_option options[] = {
6e20c6e7
T
42 {
43 .name = "clustername",
44 .lname = "ceph cluster name",
45 .type = FIO_OPT_STR_STORE,
46 .help = "Cluster name for ceph",
47 .off1 = offsetof(struct rbd_options, cluster_name),
48 .category = FIO_OPT_C_ENGINE,
49 .group = FIO_OPT_G_RBD,
50 },
fc5c0345 51 {
d8b64af2
JA
52 .name = "rbdname",
53 .lname = "rbd engine rbdname",
54 .type = FIO_OPT_STR_STORE,
55 .help = "RBD name for RBD engine",
56 .off1 = offsetof(struct rbd_options, rbd_name),
57 .category = FIO_OPT_C_ENGINE,
58 .group = FIO_OPT_G_RBD,
59 },
fc5c0345 60 {
d7d702c7
JA
61 .name = "pool",
62 .lname = "rbd engine pool",
63 .type = FIO_OPT_STR_STORE,
64 .help = "Name of the pool hosting the RBD for the RBD engine",
65 .off1 = offsetof(struct rbd_options, pool_name),
66 .category = FIO_OPT_C_ENGINE,
67 .group = FIO_OPT_G_RBD,
d8b64af2 68 },
fc5c0345 69 {
d7d702c7
JA
70 .name = "clientname",
71 .lname = "rbd engine clientname",
72 .type = FIO_OPT_STR_STORE,
73 .help = "Name of the ceph client to access the RBD for the RBD engine",
74 .off1 = offsetof(struct rbd_options, client_name),
75 .category = FIO_OPT_C_ENGINE,
76 .group = FIO_OPT_G_RBD,
77 },
78 {
79 .name = "busy_poll",
80 .lname = "Busy poll",
81 .type = FIO_OPT_BOOL,
82 .help = "Busy poll for completions instead of sleeping",
fea585d4 83 .off1 = offsetof(struct rbd_options, busy_poll),
d7d702c7
JA
84 .def = "0",
85 .category = FIO_OPT_C_ENGINE,
86 .group = FIO_OPT_G_RBD,
d8b64af2 87 },
fc5c0345 88 {
d8b64af2
JA
89 .name = NULL,
90 },
fc5c0345
DG
91};
92
93static int _fio_setup_rbd_data(struct thread_data *td,
94 struct rbd_data **rbd_data_ptr)
95{
6f9961ac 96 struct rbd_data *rbd;
fc5c0345 97
565e784d 98 if (td->io_ops_data)
fc5c0345
DG
99 return 0;
100
6f9961ac
JA
101 rbd = calloc(1, sizeof(struct rbd_data));
102 if (!rbd)
fc5c0345
DG
103 goto failed;
104
6f9961ac
JA
105 rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
106 if (!rbd->aio_events)
fc5c0345
DG
107 goto failed;
108
6f9961ac
JA
109 rbd->sort_events = calloc(td->o.iodepth, sizeof(struct io_u *));
110 if (!rbd->sort_events)
111 goto failed;
fc5c0345 112
6f9961ac 113 *rbd_data_ptr = rbd;
fc5c0345
DG
114 return 0;
115
116failed:
5a4adfd2
JQ
117 if (rbd) {
118 if (rbd->aio_events)
119 free(rbd->aio_events);
120 if (rbd->sort_events)
121 free(rbd->sort_events);
6f9961ac 122 free(rbd);
5a4adfd2 123 }
fc5c0345
DG
124 return 1;
125
126}
127
128static int _fio_rbd_connect(struct thread_data *td)
129{
565e784d 130 struct rbd_data *rbd = td->io_ops_data;
fc5c0345
DG
131 struct rbd_options *o = td->eo;
132 int r;
133
6e20c6e7
T
134 if (o->cluster_name) {
135 char *client_name = NULL;
136
137 /*
89e5ec96 138 * If we specify cluser name, the rados_create2
6e20c6e7
T
139 * will not assume 'client.'. name is considered
140 * as a full type.id namestr
141 */
89e5ec96
JQ
142 if (o->client_name) {
143 if (!index(o->client_name, '.')) {
144 client_name = calloc(1, strlen("client.") +
145 strlen(o->client_name) + 1);
146 strcat(client_name, "client.");
6ec0dcd1
JQ
147 strcat(client_name, o->client_name);
148 } else {
149 client_name = o->client_name;
89e5ec96 150 }
6e20c6e7 151 }
6ec0dcd1 152
6e20c6e7 153 r = rados_create2(&rbd->cluster, o->cluster_name,
6ec0dcd1
JQ
154 client_name, 0);
155
156 if (client_name && !index(o->client_name, '.'))
157 free(client_name);
6e20c6e7
T
158 } else
159 r = rados_create(&rbd->cluster, o->client_name);
160
fc5c0345
DG
161 if (r < 0) {
162 log_err("rados_create failed.\n");
163 goto failed_early;
164 }
165
6f9961ac 166 r = rados_conf_read_file(rbd->cluster, NULL);
fc5c0345
DG
167 if (r < 0) {
168 log_err("rados_conf_read_file failed.\n");
169 goto failed_early;
170 }
171
6f9961ac 172 r = rados_connect(rbd->cluster);
fc5c0345
DG
173 if (r < 0) {
174 log_err("rados_connect failed.\n");
175 goto failed_shutdown;
176 }
177
6f9961ac 178 r = rados_ioctx_create(rbd->cluster, o->pool_name, &rbd->io_ctx);
fc5c0345
DG
179 if (r < 0) {
180 log_err("rados_ioctx_create failed.\n");
181 goto failed_shutdown;
182 }
183
6f9961ac 184 r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ );
fc5c0345
DG
185 if (r < 0) {
186 log_err("rbd_open failed.\n");
187 goto failed_open;
188 }
189 return 0;
190
191failed_open:
6f9961ac
JA
192 rados_ioctx_destroy(rbd->io_ctx);
193 rbd->io_ctx = NULL;
fc5c0345 194failed_shutdown:
6f9961ac
JA
195 rados_shutdown(rbd->cluster);
196 rbd->cluster = NULL;
fc5c0345
DG
197failed_early:
198 return 1;
199}
200
6f9961ac 201static void _fio_rbd_disconnect(struct rbd_data *rbd)
fc5c0345 202{
6f9961ac 203 if (!rbd)
fc5c0345
DG
204 return;
205
206 /* shutdown everything */
6f9961ac
JA
207 if (rbd->image) {
208 rbd_close(rbd->image);
209 rbd->image = NULL;
fc5c0345
DG
210 }
211
6f9961ac
JA
212 if (rbd->io_ctx) {
213 rados_ioctx_destroy(rbd->io_ctx);
214 rbd->io_ctx = NULL;
fc5c0345
DG
215 }
216
6f9961ac
JA
217 if (rbd->cluster) {
218 rados_shutdown(rbd->cluster);
219 rbd->cluster = NULL;
fc5c0345
DG
220 }
221}
222
d8b64af2 223static void _fio_rbd_finish_aiocb(rbd_completion_t comp, void *data)
fc5c0345 224{
dbf388d2
JA
225 struct fio_rbd_iou *fri = data;
226 struct io_u *io_u = fri->io_u;
d8b64af2 227 ssize_t ret;
fc5c0345 228
d8b64af2
JA
229 /*
230 * Looks like return value is 0 for success, or < 0 for
231 * a specific error. So we have to assume that it can't do
232 * partial completions.
233 */
234 ret = rbd_aio_get_return_value(fri->completion);
235 if (ret < 0) {
236 io_u->error = ret;
237 io_u->resid = io_u->xfer_buflen;
238 } else
239 io_u->error = 0;
20cf5aab
JD
240
241 fri->io_complete = 1;
d8b64af2 242}
fc5c0345 243
d8b64af2
JA
244static struct io_u *fio_rbd_event(struct thread_data *td, int event)
245{
565e784d 246 struct rbd_data *rbd = td->io_ops_data;
fc5c0345 247
6f9961ac 248 return rbd->aio_events[event];
fc5c0345
DG
249}
250
6f9961ac 251static inline int fri_check_complete(struct rbd_data *rbd, struct io_u *io_u,
d8b64af2 252 unsigned int *events)
fc5c0345 253{
d8b64af2 254 struct fio_rbd_iou *fri = io_u->engine_data;
fc5c0345 255
b8ecbef6 256 if (fri->io_complete) {
d8b64af2 257 fri->io_seen = 1;
6f9961ac 258 rbd->aio_events[*events] = io_u;
d8b64af2 259 (*events)++;
fc5c0345 260
d8b64af2
JA
261 rbd_aio_release(fri->completion);
262 return 1;
263 }
fc5c0345 264
d8b64af2 265 return 0;
fc5c0345
DG
266}
267
6f9961ac
JA
268static inline int rbd_io_u_seen(struct io_u *io_u)
269{
270 struct fio_rbd_iou *fri = io_u->engine_data;
271
272 return fri->io_seen;
273}
274
275static void rbd_io_u_wait_complete(struct io_u *io_u)
276{
277 struct fio_rbd_iou *fri = io_u->engine_data;
278
279 rbd_aio_wait_for_complete(fri->completion);
280}
281
282static int rbd_io_u_cmp(const void *p1, const void *p2)
283{
284 const struct io_u **a = (const struct io_u **) p1;
285 const struct io_u **b = (const struct io_u **) p2;
286 uint64_t at, bt;
287
288 at = utime_since_now(&(*a)->start_time);
289 bt = utime_since_now(&(*b)->start_time);
290
291 if (at < bt)
292 return -1;
293 else if (at == bt)
294 return 0;
295 else
296 return 1;
297}
298
d8b64af2
JA
299static int rbd_iter_events(struct thread_data *td, unsigned int *events,
300 unsigned int min_evts, int wait)
82340a9f 301{
565e784d 302 struct rbd_data *rbd = td->io_ops_data;
d8b64af2
JA
303 unsigned int this_events = 0;
304 struct io_u *io_u;
6f9961ac 305 int i, sidx;
82340a9f 306
6f9961ac 307 sidx = 0;
d8b64af2 308 io_u_qiter(&td->io_u_all, io_u, i) {
d8b64af2
JA
309 if (!(io_u->flags & IO_U_F_FLIGHT))
310 continue;
6f9961ac 311 if (rbd_io_u_seen(io_u))
d8b64af2 312 continue;
82340a9f 313
6f9961ac 314 if (fri_check_complete(rbd, io_u, events))
d8b64af2 315 this_events++;
6f9961ac
JA
316 else if (wait)
317 rbd->sort_events[sidx++] = io_u;
318 }
82340a9f 319
6f9961ac
JA
320 if (!wait || !sidx)
321 return this_events;
322
323 /*
324 * Sort events, oldest issue first, then wait on as many as we
325 * need in order of age. If we have enough events, stop waiting,
326 * and just check if any of the older ones are done.
327 */
328 if (sidx > 1)
329 qsort(rbd->sort_events, sidx, sizeof(struct io_u *), rbd_io_u_cmp);
330
331 for (i = 0; i < sidx; i++) {
332 io_u = rbd->sort_events[i];
333
334 if (fri_check_complete(rbd, io_u, events)) {
335 this_events++;
336 continue;
d8b64af2 337 }
6f9961ac
JA
338
339 /*
340 * Stop waiting when we have enough, but continue checking
341 * all pending IOs if they are complete.
342 */
d8b64af2 343 if (*events >= min_evts)
6f9961ac
JA
344 continue;
345
346 rbd_io_u_wait_complete(io_u);
347
348 if (fri_check_complete(rbd, io_u, events))
349 this_events++;
d8b64af2 350 }
fc5c0345 351
d8b64af2 352 return this_events;
fc5c0345
DG
353}
354
355static int fio_rbd_getevents(struct thread_data *td, unsigned int min,
1f440ece 356 unsigned int max, const struct timespec *t)
fc5c0345 357{
d8b64af2 358 unsigned int this_events, events = 0;
d7d702c7 359 struct rbd_options *o = td->eo;
d8b64af2 360 int wait = 0;
fc5c0345
DG
361
362 do {
d8b64af2 363 this_events = rbd_iter_events(td, &events, min, wait);
fc5c0345 364
d8b64af2 365 if (events >= min)
fc5c0345 366 break;
d8b64af2
JA
367 if (this_events)
368 continue;
fc5c0345 369
d7d702c7
JA
370 if (!o->busy_poll)
371 wait = 1;
372 else
373 nop;
fc5c0345
DG
374 } while (1);
375
376 return events;
377}
378
379static int fio_rbd_queue(struct thread_data *td, struct io_u *io_u)
380{
565e784d 381 struct rbd_data *rbd = td->io_ops_data;
d8b64af2
JA
382 struct fio_rbd_iou *fri = io_u->engine_data;
383 int r = -1;
fc5c0345
DG
384
385 fio_ro_check(td, io_u);
386
d8b64af2 387 fri->io_seen = 0;
b8ecbef6 388 fri->io_complete = 0;
d8b64af2 389
dbf388d2 390 r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb,
d8b64af2 391 &fri->completion);
dbf388d2
JA
392 if (r < 0) {
393 log_err("rbd_aio_create_completion failed.\n");
394 goto failed;
395 }
fc5c0345 396
dbf388d2 397 if (io_u->ddir == DDIR_WRITE) {
a4c4c346 398#ifdef CONFIG_RBD_BLKIN
399 blkin_init_trace_info(&fri->info);
400 r = rbd_aio_write_traced(rbd->image, io_u->offset, io_u->xfer_buflen,
401 io_u->xfer_buf, fri->completion, &fri->info);
402#else
6f9961ac
JA
403 r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen,
404 io_u->xfer_buf, fri->completion);
a4c4c346 405#endif
fc5c0345
DG
406 if (r < 0) {
407 log_err("rbd_aio_write failed.\n");
dbf388d2 408 goto failed_comp;
fc5c0345
DG
409 }
410
411 } else if (io_u->ddir == DDIR_READ) {
a4c4c346 412#ifdef CONFIG_RBD_BLKIN
413 blkin_init_trace_info(&fri->info);
414 r = rbd_aio_read_traced(rbd->image, io_u->offset, io_u->xfer_buflen,
415 io_u->xfer_buf, fri->completion, &fri->info);
416#else
6f9961ac
JA
417 r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen,
418 io_u->xfer_buf, fri->completion);
a4c4c346 419#endif
fc5c0345
DG
420
421 if (r < 0) {
422 log_err("rbd_aio_read failed.\n");
dbf388d2 423 goto failed_comp;
fc5c0345 424 }
dbf388d2 425 } else if (io_u->ddir == DDIR_TRIM) {
6f9961ac
JA
426 r = rbd_aio_discard(rbd->image, io_u->offset,
427 io_u->xfer_buflen, fri->completion);
82340a9f 428 if (r < 0) {
dbf388d2
JA
429 log_err("rbd_aio_discard failed.\n");
430 goto failed_comp;
82340a9f 431 }
dbf388d2 432 } else if (io_u->ddir == DDIR_SYNC) {
6f9961ac 433 r = rbd_aio_flush(rbd->image, fri->completion);
fc5c0345
DG
434 if (r < 0) {
435 log_err("rbd_flush failed.\n");
dbf388d2 436 goto failed_comp;
fc5c0345 437 }
fc5c0345
DG
438 } else {
439 dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
440 io_u->ddir);
dbf388d2 441 goto failed_comp;
fc5c0345
DG
442 }
443
444 return FIO_Q_QUEUED;
dbf388d2
JA
445failed_comp:
446 rbd_aio_release(fri->completion);
fc5c0345
DG
447failed:
448 io_u->error = r;
449 td_verror(td, io_u->error, "xfer");
450 return FIO_Q_COMPLETED;
451}
452
/*
 * ->init() hook: connect to the cluster and open the image.
 * Returns 0 on success, 1 if the connection could not be set up.
 */
static int fio_rbd_init(struct thread_data *td)
{
	int r = _fio_rbd_connect(td);

	if (!r)
		return 0;

	log_err("fio_rbd_connect failed, return code: %d .\n", r);
	return 1;
}
468
469static void fio_rbd_cleanup(struct thread_data *td)
470{
565e784d 471 struct rbd_data *rbd = td->io_ops_data;
fc5c0345 472
6f9961ac
JA
473 if (rbd) {
474 _fio_rbd_disconnect(rbd);
475 free(rbd->aio_events);
476 free(rbd->sort_events);
477 free(rbd);
fc5c0345 478 }
fc5c0345
DG
479}
480
481static int fio_rbd_setup(struct thread_data *td)
482{
fc5c0345
DG
483 rbd_image_info_t info;
484 struct fio_file *f;
6f9961ac 485 struct rbd_data *rbd = NULL;
fc5c0345 486 int major, minor, extra;
6f9961ac 487 int r;
fc5c0345
DG
488
489 /* log version of librbd. No cluster connection required. */
490 rbd_version(&major, &minor, &extra);
491 log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra);
492
493 /* allocate engine specific structure to deal with librbd. */
6f9961ac 494 r = _fio_setup_rbd_data(td, &rbd);
fc5c0345
DG
495 if (r) {
496 log_err("fio_setup_rbd_data failed.\n");
497 goto cleanup;
498 }
565e784d 499 td->io_ops_data = rbd;
fc5c0345 500
d8b64af2
JA
501 /* librbd does not allow us to run first in the main thread and later
502 * in a fork child. It needs to be the same process context all the
503 * time.
fc5c0345
DG
504 */
505 td->o.use_thread = 1;
506
507 /* connect in the main thread to determine to determine
508 * the size of the given RADOS block device. And disconnect
509 * later on.
510 */
511 r = _fio_rbd_connect(td);
512 if (r) {
513 log_err("fio_rbd_connect failed.\n");
514 goto cleanup;
515 }
516
517 /* get size of the RADOS block device */
6f9961ac 518 r = rbd_stat(rbd->image, &info, sizeof(info));
fc5c0345
DG
519 if (r < 0) {
520 log_err("rbd_status failed.\n");
521 goto disconnect;
522 }
523 dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size);
524
525 /* taken from "net" engine. Pretend we deal with files,
526 * even if we do not have any ideas about files.
527 * The size of the RBD is set instead of a artificial file.
528 */
529 if (!td->files_index) {
5903e7b7 530 add_file(td, td->o.filename ? : "rbd", 0, 0);
fc5c0345 531 td->o.nr_files = td->o.nr_files ? : 1;
b53f2c54 532 td->o.open_files++;
fc5c0345
DG
533 }
534 f = td->files[0];
535 f->real_file_size = info.size;
536
537 /* disconnect, then we were only connected to determine
538 * the size of the RBD.
539 */
6f9961ac 540 _fio_rbd_disconnect(rbd);
fc5c0345
DG
541 return 0;
542
543disconnect:
6f9961ac 544 _fio_rbd_disconnect(rbd);
fc5c0345
DG
545cleanup:
546 fio_rbd_cleanup(td);
547 return r;
548}
549
/*
 * ->open_file() hook. Nothing is opened per file here (the image is
 * opened by _fio_rbd_connect()), so this always reports success.
 */
static int fio_rbd_open(struct thread_data *td, struct fio_file *f)
{
	const int ok = 0;

	return ok;
}
554
d9b100fc
JA
/*
 * ->invalidate() hook: invalidate the librbd cache of the open image when
 * the build has CONFIG_RBD_INVAL, otherwise do nothing and return 0.
 */
static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f)
{
#if !defined(CONFIG_RBD_INVAL)
	return 0;
#else
	struct rbd_data *rbd = td->io_ops_data;

	return rbd_invalidate_cache(rbd->image);
#endif
}
565
fc5c0345
DG
566static void fio_rbd_io_u_free(struct thread_data *td, struct io_u *io_u)
567{
d8b64af2 568 struct fio_rbd_iou *fri = io_u->engine_data;
fc5c0345 569
d8b64af2 570 if (fri) {
fc5c0345 571 io_u->engine_data = NULL;
d8b64af2 572 free(fri);
fc5c0345
DG
573 }
574}
575
576static int fio_rbd_io_u_init(struct thread_data *td, struct io_u *io_u)
577{
d8b64af2 578 struct fio_rbd_iou *fri;
fc5c0345 579
d8b64af2
JA
580 fri = calloc(1, sizeof(*fri));
581 fri->io_u = io_u;
582 io_u->engine_data = fri;
fc5c0345
DG
583 return 0;
584}
585
10aa136b 586static struct ioengine_ops ioengine = {
d9b100fc
JA
587 .name = "rbd",
588 .version = FIO_IOOPS_VERSION,
589 .setup = fio_rbd_setup,
590 .init = fio_rbd_init,
591 .queue = fio_rbd_queue,
592 .getevents = fio_rbd_getevents,
593 .event = fio_rbd_event,
594 .cleanup = fio_rbd_cleanup,
595 .open_file = fio_rbd_open,
596 .invalidate = fio_rbd_invalidate,
597 .options = options,
598 .io_u_init = fio_rbd_io_u_init,
599 .io_u_free = fio_rbd_io_u_free,
600 .option_struct_size = sizeof(struct rbd_options),
fc5c0345
DG
601};
602
603static void fio_init fio_rbd_register(void)
604{
605 register_ioengine(&ioengine);
606}
607
608static void fio_exit fio_rbd_unregister(void)
609{
610 unregister_ioengine(&ioengine);
611}