/*
 * RDMA I/O engine
 *
 * RDMA I/O engine based on the IB verbs and RDMA/CM user space libraries.
 * Supports both RDMA memory semantics and channel semantics
 * for the InfiniBand, RoCE and iWARP protocols.
 *
 * You will need the Linux RDMA software installed, either
 * from your Linux distributor or directly from openfabrics.org:
 *
 * http://www.openfabrics.org/downloads/OFED/
 *
 * Exchanging steps of RDMA ioengine control messages:
 *	1. client side sends test mode (RDMA_WRITE/RDMA_READ/SEND)
 *	   to server side.
 *	2. server side parses test mode, and sends back confirmation
 *	   to client side. In RDMA WRITE/READ test, this confirmation
 *	   includes memory information, such as rkey, address.
 *	3. client side initiates test loop.
 *	4. In RDMA WRITE/READ test, client side sends a completion
 *	   notification to server side. Server side updates its
 *	   td->done as true.
 */
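
/*
 * Typical usage, as a minimal sketch (option values below are placeholders,
 * loosely following the rdmaio-server/rdmaio-client job files shipped in
 * fio's examples/ directory): the reading job acts as the server and must
 * be started first; the writing job is the client.
 *
 *	; server side
 *	[rdma_server]
 *	rw=read
 *	bs=1m
 *	size=100m
 *	ioengine=rdma
 *	port=8998
 *
 *	; client side
 *	[rdma_client]
 *	rw=write
 *	bs=1m
 *	size=100m
 *	ioengine=rdma
 *	hostname=<server-ip>
 *	port=8998
 *	verb=write
 *	iodepth=1
 */
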
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <sys/poll.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <pthread.h>
#include <inttypes.h>

#include "../fio.h"
#include "../hash.h"
#include "../optgroup.h"

#include <rdma/rdma_cma.h>
#include <infiniband/arch.h>

#define FIO_RDMA_MAX_IO_DEPTH	512

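/*
 * Note: data I/O work requests take wr_id values 0..iodepth-1 (one per
 * io_u, assigned in fio_rdmaio_init()), so FIO_RDMA_MAX_IO_DEPTH doubles
 * as the reserved wr_id that marks the control-message send/recv work
 * requests set up in fio_rdmaio_setup_control_msg_buffers() and matched
 * in cq_event_handler().
 */
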
enum rdma_io_mode {
	FIO_RDMA_UNKNOWN = 0,
	FIO_RDMA_MEM_WRITE,
	FIO_RDMA_MEM_READ,
	FIO_RDMA_CHA_SEND,
	FIO_RDMA_CHA_RECV
};

struct rdmaio_options {
	struct thread_data *td;
	unsigned int port;
	enum rdma_io_mode verb;
};

static int str_hostname_cb(void *data, const char *input)
{
	struct rdmaio_options *o = data;

	if (o->td->o.filename)
		free(o->td->o.filename);
	o->td->o.filename = strdup(input);
	return 0;
}

static struct fio_option options[] = {
	{
		.name = "hostname",
		.lname = "rdma engine hostname",
		.type = FIO_OPT_STR_STORE,
		.cb = str_hostname_cb,
		.help = "Hostname for RDMA IO engine",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_RDMA,
	},
	{
		.name = "port",
		.lname = "rdma engine port",
		.type = FIO_OPT_INT,
		.off1 = offsetof(struct rdmaio_options, port),
		.minval = 1,
		.maxval = 65535,
		.help = "Port to use for RDMA connections",
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_RDMA,
	},
	{
		.name = "verb",
		.lname = "RDMA engine verb",
		.alias = "proto",
		.type = FIO_OPT_STR,
		.off1 = offsetof(struct rdmaio_options, verb),
		.help = "RDMA engine verb",
		.def = "write",
		.posval = {
			  { .ival = "write",
			    .oval = FIO_RDMA_MEM_WRITE,
			    .help = "Memory Write",
			  },
			  { .ival = "read",
			    .oval = FIO_RDMA_MEM_READ,
			    .help = "Memory Read",
			  },
			  { .ival = "send",
			    .oval = FIO_RDMA_CHA_SEND,
			    .help = "Posted Send",
			  },
			  { .ival = "recv",
			    .oval = FIO_RDMA_CHA_RECV,
			    .help = "Posted Receive",
			  },
		},
		.category = FIO_OPT_C_ENGINE,
		.group = FIO_OPT_G_RDMA,
	},
	{
		.name = NULL,
	},
};

struct remote_u {
	uint64_t buf;
	uint32_t rkey;
	uint32_t size;
};

struct rdma_info_blk {
	uint32_t mode;		/* channel semantic or memory semantic */
	uint32_t nr;		/* client: io depth
				   server: number of records for memory semantic
				 */
	uint32_t max_bs;	/* maximum block size */
	struct remote_u rmt_us[FIO_RDMA_MAX_IO_DEPTH];
};
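
/*
 * All rdma_info_blk fields travel in network byte order: mode, nr, max_bs
 * and each rkey/size go through htonl()/ntohl(), and the 64-bit buffer
 * addresses through htonll()/ntohll().  In the memory-semantic modes the
 * server fills one rmt_us[] entry per registered io_u buffer, and the
 * client picks a random entry for each request in fio_rdmaio_send().
 */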

struct rdma_io_u_data {
	uint64_t wr_id;
	struct ibv_send_wr sq_wr;
	struct ibv_recv_wr rq_wr;
	struct ibv_sge rdma_sgl;
};

struct rdmaio_data {
	int is_client;
	enum rdma_io_mode rdma_protocol;
	char host[64];
	struct sockaddr_in addr;

	struct ibv_recv_wr rq_wr;
	struct ibv_sge recv_sgl;
	struct rdma_info_blk recv_buf;
	struct ibv_mr *recv_mr;

	struct ibv_send_wr sq_wr;
	struct ibv_sge send_sgl;
	struct rdma_info_blk send_buf;
	struct ibv_mr *send_mr;

	struct ibv_comp_channel *channel;
	struct ibv_cq *cq;
	struct ibv_pd *pd;
	struct ibv_qp *qp;

	pthread_t cmthread;
	struct rdma_event_channel *cm_channel;
	struct rdma_cm_id *cm_id;
	struct rdma_cm_id *child_cm_id;

	int cq_event_num;

	struct remote_u *rmt_us;
	int rmt_nr;
	struct io_u **io_us_queued;
	int io_u_queued_nr;
	struct io_u **io_us_flight;
	int io_u_flight_nr;
	struct io_u **io_us_completed;
	int io_u_completed_nr;

	struct frand_state rand_state;
};

static int client_recv(struct thread_data *td, struct ibv_wc *wc)
{
	struct rdmaio_data *rd = td->io_ops->data;
	unsigned int max_bs;

	if (wc->byte_len != sizeof(rd->recv_buf)) {
		log_err("Received bogus data, size %d\n", wc->byte_len);
		return 1;
	}

	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
	if (max_bs > ntohl(rd->recv_buf.max_bs)) {
		log_err("fio: Server's block size (%d) must be greater than or "
			"equal to the client's block size (%d)!\n",
			ntohl(rd->recv_buf.max_bs), max_bs);
		return 1;
	}

	/* store mr info for MEMORY semantic */
	if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
	    (rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
		/* struct flist_head *entry; */
		int i = 0;

		rd->rmt_nr = ntohl(rd->recv_buf.nr);

		for (i = 0; i < rd->rmt_nr; i++) {
			rd->rmt_us[i].buf = ntohll(rd->recv_buf.rmt_us[i].buf);
			rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey);
			rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size);

			dprint(FD_IO,
			       "fio: Received rkey %x addr %" PRIx64
			       " len %d from peer\n", rd->rmt_us[i].rkey,
			       rd->rmt_us[i].buf, rd->rmt_us[i].size);
		}
	}

	return 0;
}

static int server_recv(struct thread_data *td, struct ibv_wc *wc)
{
	struct rdmaio_data *rd = td->io_ops->data;
	unsigned int max_bs;

	if (wc->wr_id == FIO_RDMA_MAX_IO_DEPTH) {
		rd->rdma_protocol = ntohl(rd->recv_buf.mode);

		/* CHANNEL semantic, do nothing */
		if (rd->rdma_protocol == FIO_RDMA_CHA_SEND)
			rd->rdma_protocol = FIO_RDMA_CHA_RECV;

		max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
		if (max_bs < ntohl(rd->recv_buf.max_bs)) {
			log_err("fio: Server's block size (%d) must be greater than or "
				"equal to the client's block size (%d)!\n",
				max_bs, ntohl(rd->recv_buf.max_bs));
			return 1;
		}
	}

	return 0;
}

static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_wc wc;
	struct rdma_io_u_data *r_io_u_d;
	int ret;
	int compevnum = 0;
	int i;

	while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) {
		ret = 0;
		compevnum++;

		if (wc.status) {
			log_err("fio: cq completion status %d(%s)\n",
				wc.status, ibv_wc_status_str(wc.status));
			return -1;
		}

		switch (wc.opcode) {

		case IBV_WC_RECV:
			if (rd->is_client == 1)
				ret = client_recv(td, &wc);
			else
				ret = server_recv(td, &wc);

			if (ret)
				return -1;

			if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
				break;

			for (i = 0; i < rd->io_u_flight_nr; i++) {
				r_io_u_d = rd->io_us_flight[i]->engine_data;

				if (wc.wr_id == r_io_u_d->rq_wr.wr_id) {
					rd->io_us_flight[i]->resid =
					    rd->io_us_flight[i]->buflen
					    - wc.byte_len;

					rd->io_us_flight[i]->error = 0;

					rd->io_us_completed[rd->
							    io_u_completed_nr]
					    = rd->io_us_flight[i];
					rd->io_u_completed_nr++;
					break;
				}
			}
			if (i == rd->io_u_flight_nr)
				log_err("fio: recv wr %" PRId64 " not found\n",
					wc.wr_id);
			else {
				/* put the last one into middle of the list */
				rd->io_us_flight[i] =
				    rd->io_us_flight[rd->io_u_flight_nr - 1];
				rd->io_u_flight_nr--;
			}

			break;

		case IBV_WC_SEND:
		case IBV_WC_RDMA_WRITE:
		case IBV_WC_RDMA_READ:
			if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
				break;

			for (i = 0; i < rd->io_u_flight_nr; i++) {
				r_io_u_d = rd->io_us_flight[i]->engine_data;

				if (wc.wr_id == r_io_u_d->sq_wr.wr_id) {
					rd->io_us_completed[rd->
							    io_u_completed_nr]
					    = rd->io_us_flight[i];
					rd->io_u_completed_nr++;
					break;
				}
			}
			if (i == rd->io_u_flight_nr)
				log_err("fio: send wr %" PRId64 " not found\n",
					wc.wr_id);
			else {
				/* put the last one into middle of the list */
				rd->io_us_flight[i] =
				    rd->io_us_flight[rd->io_u_flight_nr - 1];
				rd->io_u_flight_nr--;
			}

			break;

		default:
			log_info("fio: unknown completion event %d\n",
				 wc.opcode);
			return -1;
		}
		rd->cq_event_num++;
	}

	if (ret) {
		log_err("fio: poll error %d\n", ret);
		return 1;
	}

	return compevnum;
}

/*
 * Returns -1 on error, or the (positive) number of events reaped.
 */
static int rdma_poll_wait(struct thread_data *td, enum ibv_wc_opcode opcode)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	int ret;

	if (rd->cq_event_num > 0) {	/* previous left */
		rd->cq_event_num--;
		return 0;
	}

again:
	if (ibv_get_cq_event(rd->channel, &ev_cq, &ev_ctx) != 0) {
		log_err("fio: Failed to get cq event!\n");
		return -1;
	}
	if (ev_cq != rd->cq) {
		log_err("fio: Unknown CQ!\n");
		return -1;
	}
	if (ibv_req_notify_cq(rd->cq, 0) != 0) {
		log_err("fio: Failed to set notify!\n");
		return -1;
	}

	ret = cq_event_handler(td, opcode);
	if (ret == 0)
		goto again;

	ibv_ack_cq_events(rd->cq, ret);

	rd->cq_event_num--;

	return ret;
}

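/*
 * Note: rdma_poll_wait() above and fio_rdmaio_getevents() below both
 * follow the standard verbs completion-channel pattern:
 *
 *	ibv_get_cq_event(channel, &ev_cq, &ev_ctx);	(block for an event)
 *	ibv_req_notify_cq(cq, 0);			(re-arm the CQ first)
 *	while (ibv_poll_cq(cq, 1, &wc) == 1)		(then drain it)
 *		handle each wc;
 *	ibv_ack_cq_events(cq, nevents);			(ack what was reaped)
 *
 * Re-arming before draining avoids losing a completion that arrives
 * between the final poll and the next notification request.
 */
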
static int fio_rdmaio_setup_qp(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_qp_init_attr init_attr;
	int qp_depth = td->o.iodepth * 2;	/* twice the io depth */

	if (rd->is_client == 0)
		rd->pd = ibv_alloc_pd(rd->child_cm_id->verbs);
	else
		rd->pd = ibv_alloc_pd(rd->cm_id->verbs);

	if (rd->pd == NULL) {
		log_err("fio: ibv_alloc_pd fail: %m\n");
		return 1;
	}

	if (rd->is_client == 0)
		rd->channel = ibv_create_comp_channel(rd->child_cm_id->verbs);
	else
		rd->channel = ibv_create_comp_channel(rd->cm_id->verbs);
	if (rd->channel == NULL) {
		log_err("fio: ibv_create_comp_channel fail: %m\n");
		goto err1;
	}

	if (qp_depth < 16)
		qp_depth = 16;

	if (rd->is_client == 0)
		rd->cq = ibv_create_cq(rd->child_cm_id->verbs,
				       qp_depth, rd, rd->channel, 0);
	else
		rd->cq = ibv_create_cq(rd->cm_id->verbs,
				       qp_depth, rd, rd->channel, 0);
	if (rd->cq == NULL) {
		log_err("fio: ibv_create_cq failed: %m\n");
		goto err2;
	}

	if (ibv_req_notify_cq(rd->cq, 0) != 0) {
		log_err("fio: ibv_req_notify_cq failed: %m\n");
		goto err3;
	}

	/* create queue pair */
	memset(&init_attr, 0, sizeof(init_attr));
	init_attr.cap.max_send_wr = qp_depth;
	init_attr.cap.max_recv_wr = qp_depth;
	init_attr.cap.max_recv_sge = 1;
	init_attr.cap.max_send_sge = 1;
	init_attr.qp_type = IBV_QPT_RC;
	init_attr.send_cq = rd->cq;
	init_attr.recv_cq = rd->cq;

	if (rd->is_client == 0) {
		if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) {
			log_err("fio: rdma_create_qp failed: %m\n");
			goto err3;
		}
		rd->qp = rd->child_cm_id->qp;
	} else {
		if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) {
			log_err("fio: rdma_create_qp failed: %m\n");
			goto err3;
		}
		rd->qp = rd->cm_id->qp;
	}

	return 0;

err3:
	ibv_destroy_cq(rd->cq);
err2:
	ibv_destroy_comp_channel(rd->channel);
err1:
	ibv_dealloc_pd(rd->pd);

	return 1;
}

static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;

	rd->recv_mr = ibv_reg_mr(rd->pd, &rd->recv_buf, sizeof(rd->recv_buf),
				 IBV_ACCESS_LOCAL_WRITE);
	if (rd->recv_mr == NULL) {
		log_err("fio: recv_buf reg_mr failed: %m\n");
		return 1;
	}

	rd->send_mr = ibv_reg_mr(rd->pd, &rd->send_buf, sizeof(rd->send_buf),
				 0);
	if (rd->send_mr == NULL) {
		log_err("fio: send_buf reg_mr failed: %m\n");
		ibv_dereg_mr(rd->recv_mr);
		return 1;
	}

	/* setup work request */
	/* recv wq */
	rd->recv_sgl.addr = (uint64_t) (unsigned long)&rd->recv_buf;
	rd->recv_sgl.length = sizeof(rd->recv_buf);
	rd->recv_sgl.lkey = rd->recv_mr->lkey;
	rd->rq_wr.sg_list = &rd->recv_sgl;
	rd->rq_wr.num_sge = 1;
	rd->rq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH;

	/* send wq */
	rd->send_sgl.addr = (uint64_t) (unsigned long)&rd->send_buf;
	rd->send_sgl.length = sizeof(rd->send_buf);
	rd->send_sgl.lkey = rd->send_mr->lkey;

	rd->sq_wr.opcode = IBV_WR_SEND;
	rd->sq_wr.send_flags = IBV_SEND_SIGNALED;
	rd->sq_wr.sg_list = &rd->send_sgl;
	rd->sq_wr.num_sge = 1;
	rd->sq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH;

	return 0;
}

static int get_next_channel_event(struct thread_data *td,
				  struct rdma_event_channel *channel,
				  enum rdma_cm_event_type wait_event)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdma_cm_event *event;
	int ret;

	ret = rdma_get_cm_event(channel, &event);
	if (ret) {
		log_err("fio: rdma_get_cm_event: %d\n", ret);
		return 1;
	}

	if (event->event != wait_event) {
		log_err("fio: event is %s instead of %s\n",
			rdma_event_str(event->event),
			rdma_event_str(wait_event));
		/* ack before bailing out, or the event stays pending */
		rdma_ack_cm_event(event);
		return 1;
	}

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		rd->child_cm_id = event->id;
		break;
	default:
		break;
	}

	rdma_ack_cm_event(event);

	return 0;
}

static int fio_rdmaio_prep(struct thread_data *td, struct io_u *io_u)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdma_io_u_data *r_io_u_d;

	r_io_u_d = io_u->engine_data;

	switch (rd->rdma_protocol) {
	case FIO_RDMA_MEM_WRITE:
	case FIO_RDMA_MEM_READ:
		r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf;
		r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey;
		r_io_u_d->sq_wr.wr_id = r_io_u_d->wr_id;
		r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED;
		r_io_u_d->sq_wr.sg_list = &r_io_u_d->rdma_sgl;
		r_io_u_d->sq_wr.num_sge = 1;
		break;
	case FIO_RDMA_CHA_SEND:
		r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf;
		r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey;
		r_io_u_d->rdma_sgl.length = io_u->buflen;
		r_io_u_d->sq_wr.wr_id = r_io_u_d->wr_id;
		r_io_u_d->sq_wr.opcode = IBV_WR_SEND;
		r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED;
		r_io_u_d->sq_wr.sg_list = &r_io_u_d->rdma_sgl;
		r_io_u_d->sq_wr.num_sge = 1;
		break;
	case FIO_RDMA_CHA_RECV:
		r_io_u_d->rdma_sgl.addr = (uint64_t) (unsigned long)io_u->buf;
		r_io_u_d->rdma_sgl.lkey = io_u->mr->lkey;
		r_io_u_d->rdma_sgl.length = io_u->buflen;
		r_io_u_d->rq_wr.wr_id = r_io_u_d->wr_id;
		r_io_u_d->rq_wr.sg_list = &r_io_u_d->rdma_sgl;
		r_io_u_d->rq_wr.num_sge = 1;
		break;
	default:
		log_err("fio: unknown rdma protocol - %d\n", rd->rdma_protocol);
		break;
	}

	return 0;
}

static struct io_u *fio_rdmaio_event(struct thread_data *td, int event)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct io_u *io_u;
	int i;

	io_u = rd->io_us_completed[0];
	for (i = 0; i < rd->io_u_completed_nr - 1; i++)
		rd->io_us_completed[i] = rd->io_us_completed[i + 1];

	rd->io_u_completed_nr--;

	dprint_io_u(io_u, "fio_rdmaio_event");

	return io_u;
}

static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min,
				unsigned int max, const struct timespec *t)
{
	struct rdmaio_data *rd = td->io_ops->data;
	enum ibv_wc_opcode comp_opcode = IBV_WC_RDMA_WRITE;
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	int ret, r = 0;

	switch (rd->rdma_protocol) {
	case FIO_RDMA_MEM_WRITE:
		comp_opcode = IBV_WC_RDMA_WRITE;
		break;
	case FIO_RDMA_MEM_READ:
		comp_opcode = IBV_WC_RDMA_READ;
		break;
	case FIO_RDMA_CHA_SEND:
		comp_opcode = IBV_WC_SEND;
		break;
	case FIO_RDMA_CHA_RECV:
		comp_opcode = IBV_WC_RECV;
		break;
	default:
		log_err("fio: unknown rdma protocol - %d\n", rd->rdma_protocol);
		break;
	}

	if (rd->cq_event_num > 0) {	/* previous left */
		rd->cq_event_num--;
		return 0;
	}

again:
	if (ibv_get_cq_event(rd->channel, &ev_cq, &ev_ctx) != 0) {
		log_err("fio: Failed to get cq event!\n");
		return -1;
	}
	if (ev_cq != rd->cq) {
		log_err("fio: Unknown CQ!\n");
		return -1;
	}
	if (ibv_req_notify_cq(rd->cq, 0) != 0) {
		log_err("fio: Failed to set notify!\n");
		return -1;
	}

	ret = cq_event_handler(td, comp_opcode);
	if (ret < 1)
		goto again;

	ibv_ack_cq_events(rd->cq, ret);

	r += ret;
	if (r < min)
		goto again;

	rd->cq_event_num -= r;

	return r;
}

static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
			   unsigned int nr)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_send_wr *bad_wr;
#if 0
	enum ibv_wc_opcode comp_opcode;
	comp_opcode = IBV_WC_RDMA_WRITE;
#endif
	int i;
	long index;
	struct rdma_io_u_data *r_io_u_d;

	r_io_u_d = NULL;

	for (i = 0; i < nr; i++) {
		/* RDMA_WRITE or RDMA_READ */
		switch (rd->rdma_protocol) {
		case FIO_RDMA_MEM_WRITE:
			/* compose work request */
			r_io_u_d = io_us[i]->engine_data;
			index = __rand(&rd->rand_state) % rd->rmt_nr;
			r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_WRITE;
			r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
			r_io_u_d->sq_wr.wr.rdma.remote_addr = \
				rd->rmt_us[index].buf;
			r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen;
			break;
		case FIO_RDMA_MEM_READ:
			/* compose work request */
			r_io_u_d = io_us[i]->engine_data;
			index = __rand(&rd->rand_state) % rd->rmt_nr;
			r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_READ;
			r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
			r_io_u_d->sq_wr.wr.rdma.remote_addr = \
				rd->rmt_us[index].buf;
			r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen;
			break;
		case FIO_RDMA_CHA_SEND:
			r_io_u_d = io_us[i]->engine_data;
			r_io_u_d->sq_wr.opcode = IBV_WR_SEND;
			r_io_u_d->sq_wr.send_flags = IBV_SEND_SIGNALED;
			break;
		default:
			log_err("fio: unknown rdma protocol - %d\n",
				rd->rdma_protocol);
			break;
		}

		if (ibv_post_send(rd->qp, &r_io_u_d->sq_wr, &bad_wr) != 0) {
			log_err("fio: ibv_post_send fail: %m\n");
			return -1;
		}

		dprint_io_u(io_us[i], "fio_rdmaio_send");
	}

	/* wait for completion
	   rdma_poll_wait(td, comp_opcode); */

	return i;
}

static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us,
			   unsigned int nr)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_recv_wr *bad_wr;
	struct rdma_io_u_data *r_io_u_d;
	int i;

	i = 0;
	if (rd->rdma_protocol == FIO_RDMA_CHA_RECV) {
		/* post io_u into recv queue */
		for (i = 0; i < nr; i++) {
			r_io_u_d = io_us[i]->engine_data;
			if (ibv_post_recv(rd->qp, &r_io_u_d->rq_wr, &bad_wr) !=
			    0) {
				log_err("fio: ibv_post_recv fail: %m\n");
				return 1;
			}
		}
	} else if ((rd->rdma_protocol == FIO_RDMA_MEM_READ)
		   || (rd->rdma_protocol == FIO_RDMA_MEM_WRITE)) {
		/* re-post the rq_wr */
		if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
			log_err("fio: ibv_post_recv fail: %m\n");
			return 1;
		}

		rdma_poll_wait(td, IBV_WC_RECV);

		dprint(FD_IO, "fio: recv FINISH message\n");
		td->done = 1;
		return 0;
	}

	return i;
}

static int fio_rdmaio_queue(struct thread_data *td, struct io_u *io_u)
{
	struct rdmaio_data *rd = td->io_ops->data;

	fio_ro_check(td, io_u);

	if (rd->io_u_queued_nr == (int)td->o.iodepth)
		return FIO_Q_BUSY;

	rd->io_us_queued[rd->io_u_queued_nr] = io_u;
	rd->io_u_queued_nr++;

	dprint_io_u(io_u, "fio_rdmaio_queue");

	return FIO_Q_QUEUED;
}

static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
			      unsigned int nr)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct timeval now;
	unsigned int i;

	if (!fio_fill_issue_time(td))
		return;

	fio_gettime(&now, NULL);

	for (i = 0; i < nr; i++) {
		struct io_u *io_u = io_us[i];

		/* queued -> flight */
		rd->io_us_flight[rd->io_u_flight_nr] = io_u;
		rd->io_u_flight_nr++;

		memcpy(&io_u->issue_time, &now, sizeof(now));
		io_u_queued(td, io_u);
	}
}

static int fio_rdmaio_commit(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct io_u **io_us;
	int ret;

	if (!rd->io_us_queued)
		return 0;

	io_us = rd->io_us_queued;
	do {
		/* RDMA_WRITE or RDMA_READ */
		if (rd->is_client)
			ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr);
		else
			ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr);

		if (ret > 0) {
			fio_rdmaio_queued(td, io_us, ret);
			io_u_mark_submit(td, ret);
			rd->io_u_queued_nr -= ret;
			io_us += ret;
			ret = 0;
		} else
			break;
	} while (rd->io_u_queued_nr);

	return ret;
}

static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdma_conn_param conn_param;
	struct ibv_send_wr *bad_wr;

	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.responder_resources = 1;
	conn_param.initiator_depth = 1;
	conn_param.retry_count = 10;

	if (rdma_connect(rd->cm_id, &conn_param) != 0) {
		log_err("fio: rdma_connect fail: %m\n");
		return 1;
	}

	if (get_next_channel_event
	    (td, rd->cm_channel, RDMA_CM_EVENT_ESTABLISHED) != 0) {
		log_err("fio: wait for RDMA_CM_EVENT_ESTABLISHED\n");
		return 1;
	}

	/* send task request */
	rd->send_buf.mode = htonl(rd->rdma_protocol);
	rd->send_buf.nr = htonl(td->o.iodepth);

	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
		log_err("fio: ibv_post_send fail: %m\n");
		return 1;
	}

	if (rdma_poll_wait(td, IBV_WC_SEND) < 0)
		return 1;

	/* wait for remote MR info from server side */
	if (rdma_poll_wait(td, IBV_WC_RECV) < 0)
		return 1;

	/* In a SEND/RECV test, it is good practice to set the iodepth of
	 * the RECV side deeper than that of the SEND side to
	 * avoid RNR (receiver not ready) errors. The
	 * SEND side may send so many unsolicited messages before the
	 * RECV side has committed sufficient recv buffers into the recv
	 * queue that an RNR error results. Here, the SEND side pauses for
	 * a while, during which the RECV side commits sufficient recv
	 * buffers.
	 */
	usleep(500000);

	return 0;
}

static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdma_conn_param conn_param;
	struct ibv_send_wr *bad_wr;
	int ret = 0;

	/* rdma_accept() - then wait for accept success */
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.responder_resources = 1;
	conn_param.initiator_depth = 1;

	if (rdma_accept(rd->child_cm_id, &conn_param) != 0) {
		log_err("fio: rdma_accept: %m\n");
		return 1;
	}

	if (get_next_channel_event
	    (td, rd->cm_channel, RDMA_CM_EVENT_ESTABLISHED) != 0) {
		log_err("fio: wait for RDMA_CM_EVENT_ESTABLISHED\n");
		return 1;
	}

	/* wait for request */
	ret = rdma_poll_wait(td, IBV_WC_RECV) < 0;

	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
		log_err("fio: ibv_post_send fail: %m\n");
		return 1;
	}

	if (rdma_poll_wait(td, IBV_WC_SEND) < 0)
		return 1;

	return ret;
}

static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f)
{
	if (td_read(td))
		return fio_rdmaio_accept(td, f);
	else
		return fio_rdmaio_connect(td, f);
}

static int fio_rdmaio_close_file(struct thread_data *td, struct fio_file *f)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_send_wr *bad_wr;

	/* unregister rdma buffer */

	/*
	 * Client sends notification to the server side
	 */
	/* refer to: http://linux.die.net/man/7/rdma_cm */
	if ((rd->is_client == 1) && ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE)
				     || (rd->rdma_protocol ==
					 FIO_RDMA_MEM_READ))) {
		if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
			log_err("fio: ibv_post_send fail: %m\n");
			return 1;
		}

		dprint(FD_IO, "fio: close information sent successfully\n");
		rdma_poll_wait(td, IBV_WC_SEND);
	}

	if (rd->is_client == 1)
		rdma_disconnect(rd->cm_id);
	else {
		rdma_disconnect(rd->child_cm_id);
#if 0
		rdma_disconnect(rd->cm_id);
#endif
	}

#if 0
	if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0) {
		log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n");
		return 1;
	}
#endif

	/* destroy the QP before the CQ it is attached to */
	ibv_destroy_qp(rd->qp);
	ibv_destroy_cq(rd->cq);

	if (rd->is_client == 1)
		rdma_destroy_id(rd->cm_id);
	else {
		rdma_destroy_id(rd->child_cm_id);
		rdma_destroy_id(rd->cm_id);
	}

	ibv_destroy_comp_channel(rd->channel);
	ibv_dealloc_pd(rd->pd);

	return 0;
}

static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host,
				    unsigned short port)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_recv_wr *bad_wr;
	int err;

	rd->addr.sin_family = AF_INET;
	rd->addr.sin_port = htons(port);

	if (inet_aton(host, &rd->addr.sin_addr) != 1) {
		struct hostent *hent;

		hent = gethostbyname(host);
		if (!hent) {
			td_verror(td, errno, "gethostbyname");
			return 1;
		}

		memcpy(&rd->addr.sin_addr, hent->h_addr, 4);
	}

	/* resolve addr */
	err = rdma_resolve_addr(rd->cm_id, NULL, (struct sockaddr *)&rd->addr, 2000);
	if (err != 0) {
		log_err("fio: rdma_resolve_addr: %d\n", err);
		return 1;
	}

	err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
	if (err != 0) {
		log_err("fio: get_next_channel_event: %d\n", err);
		return 1;
	}

	/* resolve route */
	err = rdma_resolve_route(rd->cm_id, 2000);
	if (err != 0) {
		log_err("fio: rdma_resolve_route: %d\n", err);
		return 1;
	}

	err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
	if (err != 0) {
		log_err("fio: get_next_channel_event: %d\n", err);
		return 1;
	}

	/* create qp and buffer */
	if (fio_rdmaio_setup_qp(td) != 0)
		return 1;

	if (fio_rdmaio_setup_control_msg_buffers(td) != 0)
		return 1;

	/* post recv buf */
	err = ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr);
	if (err != 0) {
		log_err("fio: ibv_post_recv fail: %d\n", err);
		return 1;
	}

	return 0;
}

static int fio_rdmaio_setup_listen(struct thread_data *td, short port)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_recv_wr *bad_wr;
	int state = td->runstate;

	td_set_runstate(td, TD_SETTING_UP);

	rd->addr.sin_family = AF_INET;
	rd->addr.sin_addr.s_addr = htonl(INADDR_ANY);
	rd->addr.sin_port = htons(port);

	/* rdma_listen */
	if (rdma_bind_addr(rd->cm_id, (struct sockaddr *)&rd->addr) != 0) {
		log_err("fio: rdma_bind_addr fail: %m\n");
		return 1;
	}

	if (rdma_listen(rd->cm_id, 3) != 0) {
		log_err("fio: rdma_listen fail: %m\n");
		return 1;
	}

	log_info("fio: waiting for connection\n");

	/* wait for CONNECT_REQUEST */
	if (get_next_channel_event
	    (td, rd->cm_channel, RDMA_CM_EVENT_CONNECT_REQUEST) != 0) {
		log_err("fio: wait for RDMA_CM_EVENT_CONNECT_REQUEST\n");
		return 1;
	}

	if (fio_rdmaio_setup_qp(td) != 0)
		return 1;

	if (fio_rdmaio_setup_control_msg_buffers(td) != 0)
		return 1;

	/* post recv buf */
	if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
		log_err("fio: ibv_post_recv fail: %m\n");
		return 1;
	}

	td_set_runstate(td, state);
	return 0;
}

static int check_set_rlimits(struct thread_data *td)
{
#ifdef CONFIG_RLIMIT_MEMLOCK
	struct rlimit rl;

	/* check RLIMIT_MEMLOCK */
	if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
		log_err("fio: getrlimit fail: %d(%s)\n",
			errno, strerror(errno));
		return 1;
	}

	/* soft limit */
	if ((rl.rlim_cur != RLIM_INFINITY)
	    && (rl.rlim_cur < td->orig_buffer_size)) {
		log_err("fio: soft RLIMIT_MEMLOCK is: %" PRId64 "\n",
			rl.rlim_cur);
		log_err("fio: total block size is: %zd\n",
			td->orig_buffer_size);
		/* try to set larger RLIMIT_MEMLOCK */
		rl.rlim_cur = rl.rlim_max;
		if (setrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
			log_err("fio: setrlimit fail: %d(%s)\n",
				errno, strerror(errno));
			log_err("fio: you may try to increase MEMLOCK as root\n");
			log_err("# ulimit -l unlimited\n");
			return 1;
		}
	}
#endif

	return 0;
}

static int compat_options(struct thread_data *td)
{
	// The original RDMA engine had an ugly '/' separator
	// on the filename for its options. This function
	// retains backwards compatibility with it.
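	//
	// A legacy filename of the form (values here are placeholders)
	//
	//	filename=10.0.0.1/8998/rdma_write
	//
	// maps onto the hostname, port and verb options parsed below.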

	struct rdmaio_options *o = td->eo;
	char *modep, *portp;
	char *filename = td->o.filename;

	if (!filename)
		return 0;

	portp = strchr(filename, '/');
	if (portp == NULL)
		return 0;

	*portp = '\0';
	portp++;

	o->port = strtol(portp, NULL, 10);
	if (!o->port || o->port > 65535)
		goto bad_host;

	modep = strchr(portp, '/');
	if (modep != NULL) {
		*modep = '\0';
		modep++;
	}

	if (modep) {
		if (!strncmp("rdma_write", modep, strlen(modep)) ||
		    !strncmp("RDMA_WRITE", modep, strlen(modep)))
			o->verb = FIO_RDMA_MEM_WRITE;
		else if (!strncmp("rdma_read", modep, strlen(modep)) ||
			 !strncmp("RDMA_READ", modep, strlen(modep)))
			o->verb = FIO_RDMA_MEM_READ;
		else if (!strncmp("send", modep, strlen(modep)) ||
			 !strncmp("SEND", modep, strlen(modep)))
			o->verb = FIO_RDMA_CHA_SEND;
		else
			goto bad_host;
	} else
		o->verb = FIO_RDMA_MEM_WRITE;

	return 0;

bad_host:
	log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename);
	return 1;
}

static int fio_rdmaio_init(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct rdmaio_options *o = td->eo;
	unsigned int max_bs;
	int ret, i;

	if (td_rw(td)) {
		log_err("fio: rdma connections must be read OR write\n");
		return 1;
	}
	if (td_random(td)) {
		log_err("fio: RDMA network IO can't be random\n");
		return 1;
	}

	if (compat_options(td))
		return 1;

	if (!o->port) {
		log_err("fio: no port has been specified which is required "
			"for the rdma engine\n");
		return 1;
	}

	if (check_set_rlimits(td))
		return 1;

	rd->rdma_protocol = o->verb;
	rd->cq_event_num = 0;

	rd->cm_channel = rdma_create_event_channel();
	if (!rd->cm_channel) {
		log_err("fio: rdma_create_event_channel fail: %m\n");
		return 1;
	}

	ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP);
	if (ret) {
		log_err("fio: rdma_create_id fail: %m\n");
		return 1;
	}

	if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
	    (rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
		rd->rmt_us =
			malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
		memset(rd->rmt_us, 0,
			FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
		rd->rmt_nr = 0;
	}

	rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_queued_nr = 0;

	rd->io_us_flight = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_flight, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_flight_nr = 0;

	rd->io_us_completed = malloc(td->o.iodepth * sizeof(struct io_u *));
	memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *));
	rd->io_u_completed_nr = 0;

	if (td_read(td)) {	/* READ as the server */
		rd->is_client = 0;
		td->flags |= TD_F_NO_PROGRESS;
		/* server rd->rdma_buf_len will be setup after got request */
		ret = fio_rdmaio_setup_listen(td, o->port);
	} else {		/* WRITE as the client */
		rd->is_client = 1;
		ret = fio_rdmaio_setup_connect(td, td->o.filename, o->port);
	}

	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
	rd->send_buf.max_bs = htonl(max_bs);

	/* register each io_u in the free list */
	for (i = 0; i < td->io_u_freelist.nr; i++) {
		struct io_u *io_u = td->io_u_freelist.io_us[i];

		io_u->engine_data = malloc(sizeof(struct rdma_io_u_data));
		memset(io_u->engine_data, 0, sizeof(struct rdma_io_u_data));
		((struct rdma_io_u_data *)io_u->engine_data)->wr_id = i;

		io_u->mr = ibv_reg_mr(rd->pd, io_u->buf, max_bs,
				      IBV_ACCESS_LOCAL_WRITE |
				      IBV_ACCESS_REMOTE_READ |
				      IBV_ACCESS_REMOTE_WRITE);
		if (io_u->mr == NULL) {
			log_err("fio: ibv_reg_mr io_u failed: %m\n");
			return 1;
		}

		rd->send_buf.rmt_us[i].buf =
		    htonll((uint64_t) (unsigned long)io_u->buf);
		rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
		rd->send_buf.rmt_us[i].size = htonl(max_bs);

#if 0
		log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n",
			 io_u->mr->rkey, io_u->buf, max_bs);
#endif
	}

	rd->send_buf.nr = htonl(i);

	return ret;
}

static void fio_rdmaio_cleanup(struct thread_data *td)
{
	struct rdmaio_data *rd = td->io_ops->data;

	if (rd)
		free(rd);
}

static int fio_rdmaio_setup(struct thread_data *td)
{
	struct rdmaio_data *rd;

	if (!td->files_index) {
		add_file(td, td->o.filename ?: "rdma", 0, 0);
		td->o.nr_files = td->o.nr_files ?: 1;
		td->o.open_files++;
	}

	if (!td->io_ops->data) {
		rd = malloc(sizeof(*rd));

		memset(rd, 0, sizeof(*rd));
		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0);
		td->io_ops->data = rd;
	}

	return 0;
}

static struct ioengine_ops ioengine_rw = {
	.name			= "rdma",
	.version		= FIO_IOOPS_VERSION,
	.setup			= fio_rdmaio_setup,
	.init			= fio_rdmaio_init,
	.prep			= fio_rdmaio_prep,
	.queue			= fio_rdmaio_queue,
	.commit			= fio_rdmaio_commit,
	.getevents		= fio_rdmaio_getevents,
	.event			= fio_rdmaio_event,
	.cleanup		= fio_rdmaio_cleanup,
	.open_file		= fio_rdmaio_open_file,
	.close_file		= fio_rdmaio_close_file,
	.flags			= FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
	.options		= options,
	.option_struct_size	= sizeof(struct rdmaio_options),
};

static void fio_init fio_rdmaio_register(void)
{
	register_ioengine(&ioengine_rw);
}

static void fio_exit fio_rdmaio_unregister(void)
{
	unregister_ioengine(&ioengine_rw);
}