Merge branch 'master' of https://github.com/davidzengxhsh/fio
[fio.git] / engines / rbd.c
CommitLineData
fc5c0345
DG
1/*
2 * rbd engine
3 *
4 * IO engine using Ceph's librbd to test RADOS Block Devices.
5 *
6 */
7
#include <errno.h>
#include <rbd/librbd.h>

#include "../fio.h"
#include "../optgroup.h"
fc5c0345
DG
12
13struct fio_rbd_iou {
14 struct io_u *io_u;
d8b64af2 15 rbd_completion_t completion;
d8b64af2 16 int io_seen;
b8ecbef6 17 int io_complete;
fc5c0345
DG
18};
19
20struct rbd_data {
21 rados_t cluster;
22 rados_ioctx_t io_ctx;
23 rbd_image_t image;
24 struct io_u **aio_events;
6f9961ac 25 struct io_u **sort_events;
fc5c0345
DG
26};
27
/*
 * Job options of the rbd engine; filled in through the options[] table.
 */
struct rbd_options {
	void *pad;		/* NOTE(review): presumably keeps real options off offset 0 - confirm */
	char *cluster_name;	/* "clustername" option */
	char *rbd_name;		/* "rbdname" option */
	char *pool_name;	/* "pool" option */
	char *client_name;	/* "clientname" option */
	int busy_poll;		/* "busy_poll" option */
};
36
37static struct fio_option options[] = {
6e20c6e7
T
38 {
39 .name = "clustername",
40 .lname = "ceph cluster name",
41 .type = FIO_OPT_STR_STORE,
42 .help = "Cluster name for ceph",
43 .off1 = offsetof(struct rbd_options, cluster_name),
44 .category = FIO_OPT_C_ENGINE,
45 .group = FIO_OPT_G_RBD,
46 },
fc5c0345 47 {
d8b64af2
JA
48 .name = "rbdname",
49 .lname = "rbd engine rbdname",
50 .type = FIO_OPT_STR_STORE,
51 .help = "RBD name for RBD engine",
52 .off1 = offsetof(struct rbd_options, rbd_name),
53 .category = FIO_OPT_C_ENGINE,
54 .group = FIO_OPT_G_RBD,
55 },
fc5c0345 56 {
d7d702c7
JA
57 .name = "pool",
58 .lname = "rbd engine pool",
59 .type = FIO_OPT_STR_STORE,
60 .help = "Name of the pool hosting the RBD for the RBD engine",
61 .off1 = offsetof(struct rbd_options, pool_name),
62 .category = FIO_OPT_C_ENGINE,
63 .group = FIO_OPT_G_RBD,
d8b64af2 64 },
fc5c0345 65 {
d7d702c7
JA
66 .name = "clientname",
67 .lname = "rbd engine clientname",
68 .type = FIO_OPT_STR_STORE,
69 .help = "Name of the ceph client to access the RBD for the RBD engine",
70 .off1 = offsetof(struct rbd_options, client_name),
71 .category = FIO_OPT_C_ENGINE,
72 .group = FIO_OPT_G_RBD,
73 },
74 {
75 .name = "busy_poll",
76 .lname = "Busy poll",
77 .type = FIO_OPT_BOOL,
78 .help = "Busy poll for completions instead of sleeping",
fea585d4 79 .off1 = offsetof(struct rbd_options, busy_poll),
d7d702c7
JA
80 .def = "0",
81 .category = FIO_OPT_C_ENGINE,
82 .group = FIO_OPT_G_RBD,
d8b64af2 83 },
fc5c0345 84 {
d8b64af2
JA
85 .name = NULL,
86 },
fc5c0345
DG
87};
88
89static int _fio_setup_rbd_data(struct thread_data *td,
90 struct rbd_data **rbd_data_ptr)
91{
6f9961ac 92 struct rbd_data *rbd;
fc5c0345
DG
93
94 if (td->io_ops->data)
95 return 0;
96
6f9961ac
JA
97 rbd = calloc(1, sizeof(struct rbd_data));
98 if (!rbd)
fc5c0345
DG
99 goto failed;
100
6f9961ac
JA
101 rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
102 if (!rbd->aio_events)
fc5c0345
DG
103 goto failed;
104
6f9961ac
JA
105 rbd->sort_events = calloc(td->o.iodepth, sizeof(struct io_u *));
106 if (!rbd->sort_events)
107 goto failed;
fc5c0345 108
6f9961ac 109 *rbd_data_ptr = rbd;
fc5c0345
DG
110 return 0;
111
112failed:
6f9961ac
JA
113 if (rbd)
114 free(rbd);
fc5c0345
DG
115 return 1;
116
117}
118
119static int _fio_rbd_connect(struct thread_data *td)
120{
6f9961ac 121 struct rbd_data *rbd = td->io_ops->data;
fc5c0345
DG
122 struct rbd_options *o = td->eo;
123 int r;
124
6e20c6e7
T
125 if (o->cluster_name) {
126 char *client_name = NULL;
127
128 /*
129 * If we specify cluser name, the rados_creat2
130 * will not assume 'client.'. name is considered
131 * as a full type.id namestr
132 */
133 if (!index(o->client_name, '.')) {
134 client_name = calloc(1, strlen("client.") +
135 strlen(o->client_name) + 1);
136 strcat(client_name, "client.");
137 o->client_name = strcat(client_name, o->client_name);
138 }
139 r = rados_create2(&rbd->cluster, o->cluster_name,
140 o->client_name, 0);
141 } else
142 r = rados_create(&rbd->cluster, o->client_name);
143
fc5c0345
DG
144 if (r < 0) {
145 log_err("rados_create failed.\n");
146 goto failed_early;
147 }
148
6f9961ac 149 r = rados_conf_read_file(rbd->cluster, NULL);
fc5c0345
DG
150 if (r < 0) {
151 log_err("rados_conf_read_file failed.\n");
152 goto failed_early;
153 }
154
6f9961ac 155 r = rados_connect(rbd->cluster);
fc5c0345
DG
156 if (r < 0) {
157 log_err("rados_connect failed.\n");
158 goto failed_shutdown;
159 }
160
6f9961ac 161 r = rados_ioctx_create(rbd->cluster, o->pool_name, &rbd->io_ctx);
fc5c0345
DG
162 if (r < 0) {
163 log_err("rados_ioctx_create failed.\n");
164 goto failed_shutdown;
165 }
166
6f9961ac 167 r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ );
fc5c0345
DG
168 if (r < 0) {
169 log_err("rbd_open failed.\n");
170 goto failed_open;
171 }
172 return 0;
173
174failed_open:
6f9961ac
JA
175 rados_ioctx_destroy(rbd->io_ctx);
176 rbd->io_ctx = NULL;
fc5c0345 177failed_shutdown:
6f9961ac
JA
178 rados_shutdown(rbd->cluster);
179 rbd->cluster = NULL;
fc5c0345
DG
180failed_early:
181 return 1;
182}
183
6f9961ac 184static void _fio_rbd_disconnect(struct rbd_data *rbd)
fc5c0345 185{
6f9961ac 186 if (!rbd)
fc5c0345
DG
187 return;
188
189 /* shutdown everything */
6f9961ac
JA
190 if (rbd->image) {
191 rbd_close(rbd->image);
192 rbd->image = NULL;
fc5c0345
DG
193 }
194
6f9961ac
JA
195 if (rbd->io_ctx) {
196 rados_ioctx_destroy(rbd->io_ctx);
197 rbd->io_ctx = NULL;
fc5c0345
DG
198 }
199
6f9961ac
JA
200 if (rbd->cluster) {
201 rados_shutdown(rbd->cluster);
202 rbd->cluster = NULL;
fc5c0345
DG
203 }
204}
205
d8b64af2 206static void _fio_rbd_finish_aiocb(rbd_completion_t comp, void *data)
fc5c0345 207{
dbf388d2
JA
208 struct fio_rbd_iou *fri = data;
209 struct io_u *io_u = fri->io_u;
d8b64af2 210 ssize_t ret;
fc5c0345 211
d8b64af2
JA
212 /*
213 * Looks like return value is 0 for success, or < 0 for
214 * a specific error. So we have to assume that it can't do
215 * partial completions.
216 */
217 ret = rbd_aio_get_return_value(fri->completion);
218 if (ret < 0) {
219 io_u->error = ret;
220 io_u->resid = io_u->xfer_buflen;
221 } else
222 io_u->error = 0;
20cf5aab
JD
223
224 fri->io_complete = 1;
d8b64af2 225}
fc5c0345 226
d8b64af2
JA
227static struct io_u *fio_rbd_event(struct thread_data *td, int event)
228{
6f9961ac 229 struct rbd_data *rbd = td->io_ops->data;
fc5c0345 230
6f9961ac 231 return rbd->aio_events[event];
fc5c0345
DG
232}
233
6f9961ac 234static inline int fri_check_complete(struct rbd_data *rbd, struct io_u *io_u,
d8b64af2 235 unsigned int *events)
fc5c0345 236{
d8b64af2 237 struct fio_rbd_iou *fri = io_u->engine_data;
fc5c0345 238
b8ecbef6 239 if (fri->io_complete) {
d8b64af2 240 fri->io_seen = 1;
6f9961ac 241 rbd->aio_events[*events] = io_u;
d8b64af2 242 (*events)++;
fc5c0345 243
d8b64af2
JA
244 rbd_aio_release(fri->completion);
245 return 1;
246 }
fc5c0345 247
d8b64af2 248 return 0;
fc5c0345
DG
249}
250
6f9961ac
JA
251static inline int rbd_io_u_seen(struct io_u *io_u)
252{
253 struct fio_rbd_iou *fri = io_u->engine_data;
254
255 return fri->io_seen;
256}
257
258static void rbd_io_u_wait_complete(struct io_u *io_u)
259{
260 struct fio_rbd_iou *fri = io_u->engine_data;
261
262 rbd_aio_wait_for_complete(fri->completion);
263}
264
265static int rbd_io_u_cmp(const void *p1, const void *p2)
266{
267 const struct io_u **a = (const struct io_u **) p1;
268 const struct io_u **b = (const struct io_u **) p2;
269 uint64_t at, bt;
270
271 at = utime_since_now(&(*a)->start_time);
272 bt = utime_since_now(&(*b)->start_time);
273
274 if (at < bt)
275 return -1;
276 else if (at == bt)
277 return 0;
278 else
279 return 1;
280}
281
d8b64af2
JA
282static int rbd_iter_events(struct thread_data *td, unsigned int *events,
283 unsigned int min_evts, int wait)
82340a9f 284{
6f9961ac 285 struct rbd_data *rbd = td->io_ops->data;
d8b64af2
JA
286 unsigned int this_events = 0;
287 struct io_u *io_u;
6f9961ac 288 int i, sidx;
82340a9f 289
6f9961ac 290 sidx = 0;
d8b64af2 291 io_u_qiter(&td->io_u_all, io_u, i) {
d8b64af2
JA
292 if (!(io_u->flags & IO_U_F_FLIGHT))
293 continue;
6f9961ac 294 if (rbd_io_u_seen(io_u))
d8b64af2 295 continue;
82340a9f 296
6f9961ac 297 if (fri_check_complete(rbd, io_u, events))
d8b64af2 298 this_events++;
6f9961ac
JA
299 else if (wait)
300 rbd->sort_events[sidx++] = io_u;
301 }
82340a9f 302
6f9961ac
JA
303 if (!wait || !sidx)
304 return this_events;
305
306 /*
307 * Sort events, oldest issue first, then wait on as many as we
308 * need in order of age. If we have enough events, stop waiting,
309 * and just check if any of the older ones are done.
310 */
311 if (sidx > 1)
312 qsort(rbd->sort_events, sidx, sizeof(struct io_u *), rbd_io_u_cmp);
313
314 for (i = 0; i < sidx; i++) {
315 io_u = rbd->sort_events[i];
316
317 if (fri_check_complete(rbd, io_u, events)) {
318 this_events++;
319 continue;
d8b64af2 320 }
6f9961ac
JA
321
322 /*
323 * Stop waiting when we have enough, but continue checking
324 * all pending IOs if they are complete.
325 */
d8b64af2 326 if (*events >= min_evts)
6f9961ac
JA
327 continue;
328
329 rbd_io_u_wait_complete(io_u);
330
331 if (fri_check_complete(rbd, io_u, events))
332 this_events++;
d8b64af2 333 }
fc5c0345 334
d8b64af2 335 return this_events;
fc5c0345
DG
336}
337
338static int fio_rbd_getevents(struct thread_data *td, unsigned int min,
1f440ece 339 unsigned int max, const struct timespec *t)
fc5c0345 340{
d8b64af2 341 unsigned int this_events, events = 0;
d7d702c7 342 struct rbd_options *o = td->eo;
d8b64af2 343 int wait = 0;
fc5c0345
DG
344
345 do {
d8b64af2 346 this_events = rbd_iter_events(td, &events, min, wait);
fc5c0345 347
d8b64af2 348 if (events >= min)
fc5c0345 349 break;
d8b64af2
JA
350 if (this_events)
351 continue;
fc5c0345 352
d7d702c7
JA
353 if (!o->busy_poll)
354 wait = 1;
355 else
356 nop;
fc5c0345
DG
357 } while (1);
358
359 return events;
360}
361
362static int fio_rbd_queue(struct thread_data *td, struct io_u *io_u)
363{
6f9961ac 364 struct rbd_data *rbd = td->io_ops->data;
d8b64af2
JA
365 struct fio_rbd_iou *fri = io_u->engine_data;
366 int r = -1;
fc5c0345
DG
367
368 fio_ro_check(td, io_u);
369
d8b64af2 370 fri->io_seen = 0;
b8ecbef6 371 fri->io_complete = 0;
d8b64af2 372
dbf388d2 373 r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb,
d8b64af2 374 &fri->completion);
dbf388d2
JA
375 if (r < 0) {
376 log_err("rbd_aio_create_completion failed.\n");
377 goto failed;
378 }
fc5c0345 379
dbf388d2 380 if (io_u->ddir == DDIR_WRITE) {
6f9961ac
JA
381 r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen,
382 io_u->xfer_buf, fri->completion);
fc5c0345
DG
383 if (r < 0) {
384 log_err("rbd_aio_write failed.\n");
dbf388d2 385 goto failed_comp;
fc5c0345
DG
386 }
387
388 } else if (io_u->ddir == DDIR_READ) {
6f9961ac
JA
389 r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen,
390 io_u->xfer_buf, fri->completion);
fc5c0345
DG
391
392 if (r < 0) {
393 log_err("rbd_aio_read failed.\n");
dbf388d2 394 goto failed_comp;
fc5c0345 395 }
dbf388d2 396 } else if (io_u->ddir == DDIR_TRIM) {
6f9961ac
JA
397 r = rbd_aio_discard(rbd->image, io_u->offset,
398 io_u->xfer_buflen, fri->completion);
82340a9f 399 if (r < 0) {
dbf388d2
JA
400 log_err("rbd_aio_discard failed.\n");
401 goto failed_comp;
82340a9f 402 }
dbf388d2 403 } else if (io_u->ddir == DDIR_SYNC) {
6f9961ac 404 r = rbd_aio_flush(rbd->image, fri->completion);
fc5c0345
DG
405 if (r < 0) {
406 log_err("rbd_flush failed.\n");
dbf388d2 407 goto failed_comp;
fc5c0345 408 }
fc5c0345
DG
409 } else {
410 dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
411 io_u->ddir);
dbf388d2 412 goto failed_comp;
fc5c0345
DG
413 }
414
415 return FIO_Q_QUEUED;
dbf388d2
JA
416failed_comp:
417 rbd_aio_release(fri->completion);
fc5c0345
DG
418failed:
419 io_u->error = r;
420 td_verror(td, io_u->error, "xfer");
421 return FIO_Q_COMPLETED;
422}
423
/*
 * ->init hook: establish the cluster connection and open the image.
 * Returns 0 on success, 1 if _fio_rbd_connect() failed.
 */
static int fio_rbd_init(struct thread_data *td)
{
	int r = _fio_rbd_connect(td);

	if (!r)
		return 0;

	log_err("fio_rbd_connect failed, return code: %d .\n", r);
	return 1;
}
439
440static void fio_rbd_cleanup(struct thread_data *td)
441{
6f9961ac 442 struct rbd_data *rbd = td->io_ops->data;
fc5c0345 443
6f9961ac
JA
444 if (rbd) {
445 _fio_rbd_disconnect(rbd);
446 free(rbd->aio_events);
447 free(rbd->sort_events);
448 free(rbd);
fc5c0345 449 }
fc5c0345
DG
450}
451
452static int fio_rbd_setup(struct thread_data *td)
453{
fc5c0345
DG
454 rbd_image_info_t info;
455 struct fio_file *f;
6f9961ac 456 struct rbd_data *rbd = NULL;
fc5c0345 457 int major, minor, extra;
6f9961ac 458 int r;
fc5c0345
DG
459
460 /* log version of librbd. No cluster connection required. */
461 rbd_version(&major, &minor, &extra);
462 log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra);
463
464 /* allocate engine specific structure to deal with librbd. */
6f9961ac 465 r = _fio_setup_rbd_data(td, &rbd);
fc5c0345
DG
466 if (r) {
467 log_err("fio_setup_rbd_data failed.\n");
468 goto cleanup;
469 }
6f9961ac 470 td->io_ops->data = rbd;
fc5c0345 471
d8b64af2
JA
472 /* librbd does not allow us to run first in the main thread and later
473 * in a fork child. It needs to be the same process context all the
474 * time.
fc5c0345
DG
475 */
476 td->o.use_thread = 1;
477
478 /* connect in the main thread to determine to determine
479 * the size of the given RADOS block device. And disconnect
480 * later on.
481 */
482 r = _fio_rbd_connect(td);
483 if (r) {
484 log_err("fio_rbd_connect failed.\n");
485 goto cleanup;
486 }
487
488 /* get size of the RADOS block device */
6f9961ac 489 r = rbd_stat(rbd->image, &info, sizeof(info));
fc5c0345
DG
490 if (r < 0) {
491 log_err("rbd_status failed.\n");
492 goto disconnect;
493 }
494 dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size);
495
496 /* taken from "net" engine. Pretend we deal with files,
497 * even if we do not have any ideas about files.
498 * The size of the RBD is set instead of a artificial file.
499 */
500 if (!td->files_index) {
5903e7b7 501 add_file(td, td->o.filename ? : "rbd", 0, 0);
fc5c0345 502 td->o.nr_files = td->o.nr_files ? : 1;
b53f2c54 503 td->o.open_files++;
fc5c0345
DG
504 }
505 f = td->files[0];
506 f->real_file_size = info.size;
507
508 /* disconnect, then we were only connected to determine
509 * the size of the RBD.
510 */
6f9961ac 511 _fio_rbd_disconnect(rbd);
fc5c0345
DG
512 return 0;
513
514disconnect:
6f9961ac 515 _fio_rbd_disconnect(rbd);
fc5c0345
DG
516cleanup:
517 fio_rbd_cleanup(td);
518 return r;
519}
520
/*
 * ->open_file hook: there is nothing to open per file, so this always
 * reports success.
 */
static int fio_rbd_open(struct thread_data *td, struct fio_file *f)
{
	const int opened_ok = 0;

	return opened_ok;
}
525
d9b100fc
JA
/*
 * ->invalidate hook: when built with CONFIG_RBD_INVAL, return the
 * result of rbd_invalidate_cache() on the open image; otherwise do
 * nothing and return 0.
 */
static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f)
{
#ifdef CONFIG_RBD_INVAL
	struct rbd_data *engine = td->io_ops->data;

	return rbd_invalidate_cache(engine->image);
#else
	return 0;
#endif
}
536
fc5c0345
DG
537static void fio_rbd_io_u_free(struct thread_data *td, struct io_u *io_u)
538{
d8b64af2 539 struct fio_rbd_iou *fri = io_u->engine_data;
fc5c0345 540
d8b64af2 541 if (fri) {
fc5c0345 542 io_u->engine_data = NULL;
d8b64af2 543 free(fri);
fc5c0345
DG
544 }
545}
546
547static int fio_rbd_io_u_init(struct thread_data *td, struct io_u *io_u)
548{
d8b64af2 549 struct fio_rbd_iou *fri;
fc5c0345 550
d8b64af2
JA
551 fri = calloc(1, sizeof(*fri));
552 fri->io_u = io_u;
553 io_u->engine_data = fri;
fc5c0345
DG
554 return 0;
555}
556
10aa136b 557static struct ioengine_ops ioengine = {
d9b100fc
JA
558 .name = "rbd",
559 .version = FIO_IOOPS_VERSION,
560 .setup = fio_rbd_setup,
561 .init = fio_rbd_init,
562 .queue = fio_rbd_queue,
563 .getevents = fio_rbd_getevents,
564 .event = fio_rbd_event,
565 .cleanup = fio_rbd_cleanup,
566 .open_file = fio_rbd_open,
567 .invalidate = fio_rbd_invalidate,
568 .options = options,
569 .io_u_init = fio_rbd_io_u_init,
570 .io_u_free = fio_rbd_io_u_free,
571 .option_struct_size = sizeof(struct rbd_options),
fc5c0345
DG
572};
573
574static void fio_init fio_rbd_register(void)
575{
576 register_ioengine(&ioengine);
577}
578
579static void fio_exit fio_rbd_unregister(void)
580{
581 unregister_ioengine(&ioengine);
582}