/*
- * rdma engine
+ * RDMA I/O engine
*
- * RDMA IO engine using OFED library.
- * Support both RDMA memory semantic and channel semantic
- * in InfiniBand, RoCE and iWarp environment.
+ * RDMA I/O engine based on the IB verbs and RDMA/CM user space libraries.
+ * Supports both RDMA memory semantics and channel semantics
+ * for the InfiniBand, RoCE and iWARP protocols.
*
- * This is currently disabled. To enable it, execute:
+ * This I/O engine is disabled by default. To enable it, execute:
*
- * $ export EXTFLAGS="-DFIO_HAVE_RDMA"
- * $ export EXTLIBS="-libverbs -lrdmacm"
+ * $ export EXTFLAGS+=" -DFIO_HAVE_RDMA "
+ * $ export EXTLIBS+=" -libverbs -lrdmacm "
*
- * before running make. You'll need the OFED as well:
+ * before running make. You will need the Linux RDMA software as well, either
+ * from your Linux distributor or directly from openfabrics.org:
*
* http://www.openfabrics.org/downloads/OFED/
*
+ * Steps for exchanging RDMA ioengine control messages:
+ * 1. The client side sends the test mode (RDMA_WRITE/RDMA_READ/SEND)
+ * to the server side.
+ * 2. The server side parses the test mode and sends a confirmation back
+ * to the client side. In an RDMA WRITE/READ test, this confirmation
+ * includes memory information, such as the rkey and address.
+ * 3. The client side initiates the test loop.
+ * 4. In an RDMA WRITE/READ test, the client side sends a completion
+ * notification to the server side. The server side then sets its
+ * td->done to true.
+ *
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "../fio.h"
+#include "../hash.h"
#ifdef FIO_HAVE_RDMA
#include <rdma/rdma_cma.h>
#include <infiniband/arch.h>
-#define FIO_RDMA_MAX_IO_DPETH 128
+#define FIO_RDMA_MAX_IO_DEPTH 512
enum rdma_io_mode {
FIO_RDMA_UNKNOWN = 0,
uint32_t nr; /* client: io depth
server: number of records for memory semantic
*/
- struct remote_u rmt_us[FIO_RDMA_MAX_IO_DPETH];
+ struct remote_u rmt_us[FIO_RDMA_MAX_IO_DEPTH];
};
struct rdma_io_u_data {
int io_u_flight_nr;
struct io_u **io_us_completed;
int io_u_completed_nr;
+
+ struct frand_state rand_state;
};
static int client_recv(struct thread_data *td, struct ibv_wc *wc)
struct rdmaio_data *rd = td->io_ops->data;
if (wc->byte_len != sizeof(rd->recv_buf)) {
- fprintf(stderr, "Received bogus data, size %d\n", wc->byte_len);
+ log_err("Received bogus data, size %d\n", wc->byte_len);
return 1;
}
{
struct rdmaio_data *rd = td->io_ops->data;
- if (wc->wr_id == FIO_RDMA_MAX_IO_DPETH) {
+ if (wc->wr_id == FIO_RDMA_MAX_IO_DEPTH) {
rd->rdma_protocol = ntohl(rd->recv_buf.mode);
/* CHANNEL semantic, do nothing */
else
server_recv(td, &wc);
- if (wc.wr_id == FIO_RDMA_MAX_IO_DPETH)
+ if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
break;
for (i = 0; i < rd->io_u_flight_nr; i++) {
}
}
if (i == rd->io_u_flight_nr)
- log_err("fio: recv wr %ld not found\n",
+ log_err("fio: recv wr %" PRId64 " not found\n",
wc.wr_id);
else {
/* put the last one into middle of the list */
case IBV_WC_SEND:
case IBV_WC_RDMA_WRITE:
case IBV_WC_RDMA_READ:
- if (wc.wr_id == FIO_RDMA_MAX_IO_DPETH)
+ if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
break;
for (i = 0; i < rd->io_u_flight_nr; i++) {
}
}
if (i == rd->io_u_flight_nr)
- log_err("fio: send wr %ld not found\n",
+ log_err("fio: send wr %" PRId64 " not found\n",
wc.wr_id);
else {
/* put the last one into middle of the list */
rd->recv_sgl.lkey = rd->recv_mr->lkey;
rd->rq_wr.sg_list = &rd->recv_sgl;
rd->rq_wr.num_sge = 1;
- rd->rq_wr.wr_id = FIO_RDMA_MAX_IO_DPETH;
+ rd->rq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH;
/* send wq */
rd->send_sgl.addr = (uint64_t) (unsigned long)&rd->send_buf;
rd->sq_wr.send_flags = IBV_SEND_SIGNALED;
rd->sq_wr.sg_list = &rd->send_sgl;
rd->sq_wr.num_sge = 1;
- rd->sq_wr.wr_id = FIO_RDMA_MAX_IO_DPETH;
+ rd->sq_wr.wr_id = FIO_RDMA_MAX_IO_DEPTH;
return 0;
}
{
struct rdmaio_data *rd = td->io_ops->data;
struct ibv_send_wr *bad_wr;
+#if 0
enum ibv_wc_opcode comp_opcode;
comp_opcode = IBV_WC_RDMA_WRITE;
- int i, index;
+#endif
+ int i;
+ long index;
struct rdma_io_u_data *r_io_u_d;
r_io_u_d = NULL;
case FIO_RDMA_MEM_WRITE:
/* compose work request */
r_io_u_d = io_us[i]->engine_data;
- index = rand() % rd->rmt_nr;
+ if (td->o.use_os_rand)
+ index = os_random_long(&td->random_state) % rd->rmt_nr;
+ else
+ index = __rand(&rd->rand_state) % rd->rmt_nr;
r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_WRITE;
r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
- r_io_u_d->sq_wr.wr.rdma.remote_addr =
- rd->rmt_us[index].buf;
+ r_io_u_d->sq_wr.wr.rdma.remote_addr = \
+ rd->rmt_us[index].buf;
r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen;
break;
case FIO_RDMA_MEM_READ:
/* compose work request */
r_io_u_d = io_us[i]->engine_data;
- index = rand() % rd->rmt_nr;
+ if (td->o.use_os_rand)
+ index = os_random_long(&td->random_state) % rd->rmt_nr;
+ else
+ index = __rand(&rd->rand_state) % rd->rmt_nr;
r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_READ;
r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
- r_io_u_d->sq_wr.wr.rdma.remote_addr =
- rd->rmt_us[index].buf;
+ r_io_u_d->sq_wr.wr.rdma.remote_addr = \
+ rd->rmt_us[index].buf;
r_io_u_d->sq_wr.sg_list->length = io_us[i]->buflen;
break;
case FIO_RDMA_CHA_SEND:
rdma_poll_wait(td, IBV_WC_RECV);
dprint(FD_IO, "fio: recv FINISH message\n");
- exit(0);
+ td->done = 1;
+ return 0;
}
return i;
/* wait for remote MR info from server side */
rdma_poll_wait(td, IBV_WC_RECV);
+ /* In a SEND/RECV test, it is good practice to set up the iodepth
+ * of the RECV side deeper than that of the SEND side to
+ * avoid RNR (receiver not ready) errors. The
+ * SEND side may send many unsolicited messages before the
+ * RECV side commits sufficient recv buffers into the recv queue.
+ * This may lead to an RNR error. Here, the SEND side pauses for a while,
+ * during which the RECV side commits sufficient recv buffers.
+ */
+ usleep(500000);
+
return 0;
}
return 1;
}*/
- ibv_destroy_qp(rd->qp);
ibv_destroy_cq(rd->cq);
+ ibv_destroy_qp(rd->qp);
if (rd->is_client == 1)
rdma_destroy_id(rd->cm_id);
/* soft limit */
if ((rl.rlim_cur != RLIM_INFINITY)
&& (rl.rlim_cur < td->orig_buffer_size)) {
- log_err("fio: soft RLIMIT_MEMLOCK is: %ld\n", rl.rlim_cur);
- log_err("fio: total block size is: %ld\n",
+ log_err("fio: soft RLIMIT_MEMLOCK is: %" PRId64 "\n",
+ rl.rlim_cur);
+ log_err("fio: total block size is: %zd\n",
td->orig_buffer_size);
/* try to set larger RLIMIT_MEMLOCK */
rl.rlim_cur = rl.rlim_max;
if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
(rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
rd->rmt_us =
- malloc(FIO_RDMA_MAX_IO_DPETH * sizeof(struct remote_u));
+ malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
memset(rd->rmt_us, 0,
- FIO_RDMA_MAX_IO_DPETH * sizeof(struct remote_u));
+ FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
rd->rmt_nr = 0;
}
rd = malloc(sizeof(*rd));;
memset(rd, 0, sizeof(*rd));
+ init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME);
td->io_ops->data = rd;
}
}
static struct ioengine_ops ioengine_rw = {
- .name = "rdma",
- .version = FIO_IOOPS_VERSION,
- .setup = fio_rdmaio_setup,
- .init = fio_rdmaio_init,
- .prep = fio_rdmaio_prep,
- .queue = fio_rdmaio_queue,
- .commit = fio_rdmaio_commit,
- .getevents = fio_rdmaio_getevents,
- .event = fio_rdmaio_event,
- .cleanup = fio_rdmaio_cleanup,
- .open_file = fio_rdmaio_open_file,
- .close_file = fio_rdmaio_close_file,
- .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+ .name = "rdma",
+ .version = FIO_IOOPS_VERSION,
+ .setup = fio_rdmaio_setup,
+ .init = fio_rdmaio_init,
+ .prep = fio_rdmaio_prep,
+ .queue = fio_rdmaio_queue,
+ .commit = fio_rdmaio_commit,
+ .getevents = fio_rdmaio_getevents,
+ .event = fio_rdmaio_event,
+ .cleanup = fio_rdmaio_cleanup,
+ .open_file = fio_rdmaio_open_file,
+ .close_file = fio_rdmaio_close_file,
+ .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
};
#else /* FIO_HAVE_RDMA */
log_err(" make sure OFED is installed,\n");
log_err(" $ ofed_info\n");
log_err(" then try to make fio as follows:\n");
- log_err(" $ export EXTFLAGS=\"-DFIO_HAVE_RDMA\"\n");
- log_err(" $ export EXTLIBS=\"-libverbs -lrdmacm\"\n");
+ log_err(" $ export EXTFLAGS+=\" -DFIO_HAVE_RDMA \"\n");
+ log_err(" $ export EXTLIBS+=\" -libverbs -lrdmacm \"\n");
log_err(" $ make clean && make\n");
return 1;
}
static struct ioengine_ops ioengine_rw = {
- .name = "rdma",
- .version = FIO_IOOPS_VERSION,
- .init = fio_rdmaio_init,
- .queue = fio_rdmaio_queue,
- .open_file = fio_rdmaio_open_file,
- .close_file = fio_rdmaio_close_file,
- .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+ .name = "rdma",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_rdmaio_init,
+ .queue = fio_rdmaio_queue,
+ .open_file = fio_rdmaio_open_file,
+ .close_file = fio_rdmaio_close_file,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
};
#endif