/* fio.git: engines/librpma_fio.c */

/*
 * librpma_fio: librpma_apm and librpma_gpspm engines' common part.
 *
 * Copyright 2021, Intel Corporation
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include "librpma_fio.h"

#include <libpmem.h>

/*
 * Engine-specific options shared by the librpma-based engines.
 * Every entry stores its value in struct librpma_fio_options_values at
 * the offset given by .off1. The table is terminated by an entry having
 * .name set to NULL.
 */
struct fio_option librpma_fio_options[] = {
	{
		/* address of the server (stored as a string) */
		.name	= "serverip",
		.lname	= "rpma_server_ip",
		.type	= FIO_OPT_STR_STORE,
		.off1	= offsetof(struct librpma_fio_options_values, server_ip),
		.help	= "IP address the server is listening on",
		.def	= "",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBRPMA,
	},
	{
		/*
		 * base port (stored as a string); librpma_fio_td_port() derives
		 * the per-thread port from it
		 */
		.name	= "port",
		.lname	= "rpma_server port",
		.type	= FIO_OPT_STR_STORE,
		.off1	= offsetof(struct librpma_fio_options_values, port),
		.help	= "port the server is listening on",
		.def	= "7204",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBRPMA,
	},
	{
		/*
		 * NOTE(review): this boolean option has an empty default string
		 * whereas busy_wait_polling below uses "1" - confirm the option
		 * parser treats "" as false.
		 */
		.name	= "direct_write_to_pmem",
		.lname	= "Direct Write to PMem (via RDMA) from the remote host is possible",
		.type	= FIO_OPT_BOOL,
		.off1	= offsetof(struct librpma_fio_options_values,
					direct_write_to_pmem),
		.help	= "Set to true ONLY when Direct Write to PMem from the remote host is possible (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)",
		.def	= "",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBRPMA,
	},
	{
		/* enabled by default */
		.name	= "busy_wait_polling",
		.lname	= "Set to 0 to wait for completion instead of busy-wait polling completion.",
		.type	= FIO_OPT_BOOL,
		.off1	= offsetof(struct librpma_fio_options_values,
					busy_wait_polling),
		.help	= "Set to false if you want to reduce CPU usage",
		.def	= "1",
		.category = FIO_OPT_C_ENGINE,
		.group	= FIO_OPT_G_LIBRPMA,
	},
	{
		/* terminator */
		.name	= NULL,
	},
};

/*
 * librpma_fio_td_port -- compute the port string of the given thread
 *
 * port_base_str - the base port written as a decimal number
 * td            - the thread; (td->thread_number - 1) is added to the base
 * port_out      - output buffer; it is set to an empty string up front so it
 *                 holds a valid string also when the function fails
 *
 * Returns 0 on success. Returns -1 when the base port cannot be parsed
 * (no digits or out of range) or when the resulting port number is
 * UINT_MAX or greater.
 */
int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
		char *port_out)
{
	unsigned long int port_ul;
	unsigned int port_new;
	char *end = NULL;
	int err;

	port_out[0] = '\0';

	/*
	 * strtoul() signals a failure only via errno, so errno has to be
	 * cleared first. Otherwise a stale value could be reported.
	 */
	errno = 0;
	port_ul = strtoul(port_base_str, &end, 10);
	err = errno;

	/* no digits at all - do not silently fall back to port 0 */
	if (!err && end == port_base_str)
		err = EINVAL;

	/* ULONG_MAX is never a valid port; make sure a non-zero code is reported */
	if (!err && port_ul == ULONG_MAX)
		err = ERANGE;

	if (err) {
		td_verror(td, err, "strtoul");
		return -1;
	}

	port_ul += td->thread_number - 1;
	if (port_ul >= UINT_MAX) {
		log_err("[%u] port number (%lu) bigger than UINT_MAX\n",
			td->thread_number, port_ul);
		return -1;
	}

	port_new = port_ul;
	snprintf(port_out, LIBRPMA_FIO_PORT_STR_LEN_MAX - 1, "%u", port_new);

	return 0;
}

/*
 * librpma_fio_allocate_dram -- allocate a page-aligned DRAM buffer
 *
 * On success the buffer is recorded in *mem with size_mmap == 0 and
 * its address is returned. On failure the error is reported via td and
 * NULL is returned; *mem is left untouched.
 */
char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
	struct librpma_fio_mem *mem)
{
	void *buf = NULL;
	int err;

	err = posix_memalign(&buf, page_size, size);
	if (err != 0) {
		log_err("fio: posix_memalign() failed\n");
		td_verror(td, err, "posix_memalign");
		return NULL;
	}

	/* size_mmap == 0 tells librpma_fio_free() to use free() */
	mem->size_mmap = 0;
	mem->mem_ptr = buf;

	return buf;
}

/*
 * librpma_fio_allocate_pmem -- map the file f and return the thread's part
 *
 * The size has to be a multiple of the page size. For a character device
 * each thread gets its own (thread_number - 1) * size offset within
 * the mapping; for any other file type the offset is 0.
 *
 * On success the whole mapping is recorded in *mem and the address of
 * the thread's workspace is returned. On failure NULL is returned and
 * nothing stays mapped.
 */
char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
		size_t size, struct librpma_fio_mem *mem)
{
	size_t mapped_len = 0;
	size_t ws_offset;
	size_t ws_end;
	char *base;
	int is_pmem = 0;

	if (size % page_size != 0) {
		log_err("fio: size (%zu) is not aligned to page size (%zu)\n",
			size, page_size);
		return NULL;
	}

	if (f->file_name == NULL) {
		log_err("fio: filename is not set\n");
		return NULL;
	}

	/*
	 * Threads share a single DeviceDAX, each using its own offset.
	 * With FileSystemDAX each thread has a file of its own.
	 */
	ws_offset = (f->filetype == FIO_TYPE_CHAR) ?
			(td->thread_number - 1) * size : 0;
	ws_end = ws_offset + size;

	base = pmem_map_file(f->file_name, 0 /* len */, 0 /* flags */,
			0 /* mode */, &mapped_len, &is_pmem);
	if (base == NULL) {
		log_err("fio: pmem_map_file(%s) failed\n", f->file_name);
		/* errno is set by pmem_map_file() */
		td_verror(td, errno, "pmem_map_file");
		return NULL;
	}

	if (!is_pmem) {
		/* only persistent memory is acceptable */
		log_err("fio: %s is not located in persistent memory\n",
			f->file_name);
		(void) pmem_unmap(base, mapped_len);
		return NULL;
	}

	if (mapped_len < ws_end) {
		/* the mapping cannot accommodate this thread's workspace */
		log_err(
			"fio: %s is too small to handle so many threads (%zu < %zu)\n",
			f->file_name, mapped_len, ws_end);
		(void) pmem_unmap(base, mapped_len);
		return NULL;
	}

	log_info("fio: size of memory mapped from the file %s: %zu\n",
		f->file_name, mapped_len);

	/* remember the whole mapping so it can be unmapped later */
	mem->size_mmap = mapped_len;
	mem->mem_ptr = base;

	return base + ws_offset;
}

/*
 * librpma_fio_free -- release the memory described by *mem
 *
 * A zero size_mmap means the memory is released with free();
 * otherwise it is unmapped with pmem_unmap().
 */
void librpma_fio_free(struct librpma_fio_mem *mem)
{
	if (mem->size_mmap == 0) {
		free(mem->mem_ptr);
		return;
	}

	(void) pmem_unmap(mem->mem_ptr, mem->size_mmap);
}

/* the maximum number of connection attempts and the delay between them */
#define LIBRPMA_FIO_RETRY_MAX_NO	10
#define LIBRPMA_FIO_RETRY_DELAY_S	5

/*
 * librpma_fio_client_init -- common part of the clients' initialization
 *
 * Allocates the client's data along with its three io_u queues (each
 * td->o.iodepth entries long), creates the peer, connects to the server
 * (making up to LIBRPMA_FIO_RETRY_MAX_NO attempts, LIBRPMA_FIO_RETRY_DELAY_S
 * seconds apart, as long as the connection is rejected) and reads
 * the description of the server's workspace from the connection's private
 * data.
 *
 * td  - the thread; td->eo provides the server_ip and port options
 * cfg - the connection configuration passed to rpma_conn_req_new()
 *
 * Returns 0 on success; the client's data is then stored in
 * td->io_ops_data and td->o.mem_align is forced to the page size.
 * Returns -1 on failure after releasing everything acquired so far.
 */
int librpma_fio_client_init(struct thread_data *td,
		struct rpma_conn_cfg *cfg)
{
	struct librpma_fio_client_data *ccd;
	struct librpma_fio_options_values *o = td->eo;
	struct ibv_context *dev = NULL;
	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
	struct rpma_conn_req *req = NULL;
	enum rpma_conn_event event;
	struct rpma_conn_private_data pdata;
	enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
	int remote_flush_type;
	int retry;
	int ret;

	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
#ifdef FIO_INC_DEBUG
	if ((1UL << FD_NET) & fio_debug)
		log_level_aux = RPMA_LOG_LEVEL_INFO;
#endif

	/* configure logging thresholds to see more details */
	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);

	/* obtain an IBV context for a remote IP address */
	if ((ret = rpma_utils_get_ibv_context(o->server_ip,
			RPMA_UTIL_IBV_CONTEXT_REMOTE, &dev))) {
		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
		return -1;
	}

	/* allocate client's data */
	ccd = calloc(1, sizeof(*ccd));
	if (ccd == NULL) {
		td_verror(td, errno, "calloc");
		return -1;
	}

	/* allocate all in-memory queues */
	ccd->io_us_queued = calloc(td->o.iodepth, sizeof(*ccd->io_us_queued));
	if (ccd->io_us_queued == NULL) {
		td_verror(td, errno, "calloc");
		goto err_free_ccd;
	}

	ccd->io_us_flight = calloc(td->o.iodepth, sizeof(*ccd->io_us_flight));
	if (ccd->io_us_flight == NULL) {
		td_verror(td, errno, "calloc");
		goto err_free_io_u_queues;
	}

	ccd->io_us_completed = calloc(td->o.iodepth,
			sizeof(*ccd->io_us_completed));
	if (ccd->io_us_completed == NULL) {
		td_verror(td, errno, "calloc");
		goto err_free_io_u_queues;
	}

	/* create a new peer object */
	if ((ret = rpma_peer_new(dev, &ccd->peer))) {
		librpma_td_verror(td, ret, "rpma_peer_new");
		goto err_free_io_u_queues;
	}

	/* create a connection request */
	if (librpma_fio_td_port(o->port, td, port_td))
		goto err_peer_delete;

	for (retry = 0; retry < LIBRPMA_FIO_RETRY_MAX_NO; retry++) {
		if ((ret = rpma_conn_req_new(ccd->peer, o->server_ip, port_td,
				cfg, &req))) {
			librpma_td_verror(td, ret, "rpma_conn_req_new");
			goto err_peer_delete;
		}

		/*
		 * Connect the connection request
		 * and obtain the connection object.
		 */
		if ((ret = rpma_conn_req_connect(&req, NULL, &ccd->conn))) {
			librpma_td_verror(td, ret, "rpma_conn_req_connect");
			goto err_req_delete;
		}

		/* wait for the connection to establish */
		if ((ret = rpma_conn_next_event(ccd->conn, &event))) {
			librpma_td_verror(td, ret, "rpma_conn_next_event");
			goto err_conn_delete;
		} else if (event == RPMA_CONN_ESTABLISHED) {
			break;
		} else if (event == RPMA_CONN_REJECTED) {
			/* drop the rejected connection before trying again */
			(void) rpma_conn_disconnect(ccd->conn);
			(void) rpma_conn_delete(&ccd->conn);
			if (retry < LIBRPMA_FIO_RETRY_MAX_NO - 1) {
				log_err("Thread [%d]: Retrying (#%i) ...\n",
					td->thread_number, retry + 1);
				sleep(LIBRPMA_FIO_RETRY_DELAY_S);
			} else {
				log_err(
					"Thread [%d]: The maximum number of retries exceeded. Closing.\n",
					td->thread_number);
			}
		} else {
			log_err(
				"rpma_conn_next_event returned an unexptected event: (%s != RPMA_CONN_ESTABLISHED)\n",
				rpma_utils_conn_event_2str(event));
			goto err_conn_delete;
		}
	}

	/*
	 * No connection left means all the attempts have been rejected.
	 * This has to be checked before announcing a successful retry.
	 */
	if (ccd->conn == NULL)
		goto err_peer_delete;

	if (retry > 0)
		log_err("Thread [%d]: Connected after retry #%i\n",
			td->thread_number, retry);

	/* get the connection's private data sent from the server */
	if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) {
		librpma_td_verror(td, ret, "rpma_conn_get_private_data");
		goto err_conn_delete;
	}

	/* get the server's workspace representation */
	ccd->ws = pdata.ptr;

	/* create the server's memory representation */
	if ((ret = rpma_mr_remote_from_descriptor(&ccd->ws->descriptor[0],
			ccd->ws->mr_desc_size, &ccd->server_mr))) {
		librpma_td_verror(td, ret, "rpma_mr_remote_from_descriptor");
		goto err_conn_delete;
	}

	/*
	 * From now on the remote memory region's representation exists and
	 * has to be deleted on failure as well.
	 */

	/* get the total size of the shared server memory */
	if ((ret = rpma_mr_remote_get_size(ccd->server_mr, &ccd->ws_size))) {
		librpma_td_verror(td, ret, "rpma_mr_remote_get_size");
		goto err_mr_remote_delete;
	}

	/* get flush type of the remote node */
	if ((ret = rpma_mr_remote_get_flush_type(ccd->server_mr,
			&remote_flush_type))) {
		librpma_td_verror(td, ret, "rpma_mr_remote_get_flush_type");
		goto err_mr_remote_delete;
	}

	ccd->server_mr_flush_type =
		(remote_flush_type & RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT) ?
		RPMA_FLUSH_TYPE_PERSISTENT : RPMA_FLUSH_TYPE_VISIBILITY;

	/*
	 * Assure an io_us buffer allocation is page-size-aligned which is required
	 * to register for RDMA. User-provided value is intentionally ignored.
	 */
	td->o.mem_align = page_size;

	td->io_ops_data = ccd;

	return 0;

err_mr_remote_delete:
	(void) rpma_mr_remote_delete(&ccd->server_mr);

err_conn_delete:
	(void) rpma_conn_disconnect(ccd->conn);
	(void) rpma_conn_delete(&ccd->conn);

err_req_delete:
	(void) rpma_conn_req_delete(&req);

err_peer_delete:
	(void) rpma_peer_delete(&ccd->peer);

err_free_io_u_queues:
	/* not yet allocated queues are NULL thanks to calloc() of ccd */
	free(ccd->io_us_queued);
	free(ccd->io_us_flight);
	free(ccd->io_us_completed);

err_free_ccd:
	free(ccd);

	return -1;
}

void librpma_fio_client_cleanup(struct thread_data *td)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	enum rpma_conn_event ev;
	int ret;

	if (ccd == NULL)
		return;

	/* delete the iou's memory registration */
	if ((ret = rpma_mr_dereg(&ccd->orig_mr)))
		librpma_td_verror(td, ret, "rpma_mr_dereg");
	/* delete the iou's memory registration */
	if ((ret = rpma_mr_remote_delete(&ccd->server_mr)))
		librpma_td_verror(td, ret, "rpma_mr_remote_delete");
	/* initiate disconnection */
	if ((ret = rpma_conn_disconnect(ccd->conn)))
		librpma_td_verror(td, ret, "rpma_conn_disconnect");
	/* wait for disconnection to end up */
	if ((ret = rpma_conn_next_event(ccd->conn, &ev))) {
		librpma_td_verror(td, ret, "rpma_conn_next_event");
	} else if (ev != RPMA_CONN_CLOSED) {
		log_err(
			"client_cleanup received an unexpected event (%s != RPMA_CONN_CLOSED)\n",
			rpma_utils_conn_event_2str(ev));
	}
	/* delete the connection */
	if ((ret = rpma_conn_delete(&ccd->conn)))
		librpma_td_verror(td, ret, "rpma_conn_delete");
	/* delete the peer */
	if ((ret = rpma_peer_delete(&ccd->peer)))
		librpma_td_verror(td, ret, "rpma_peer_delete");
	/* free the software queues */
	free(ccd->io_us_queued);
	free(ccd->io_us_flight);
	free(ccd->io_us_completed);
	free(ccd);
	td->io_ops_data = NULL; /* zero ccd */
}

/*
 * librpma_fio_file_nop -- a file hook which does nothing
 *
 * Both arguments are ignored. Always returns 0.
 */
int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f)
{
	(void) td;
	(void) f;

	return 0;
}

/*
 * librpma_fio_client_post_init -- register the io_us' buffer for RDMA
 *
 * Returns the value returned by rpma_mr_reg(); a non-zero value is also
 * reported via td.
 */
int librpma_fio_client_post_init(struct thread_data *td)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	const int usage = RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
			RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
			RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
	size_t io_us_size;
	int ret;

	/*
	 * td->orig_buffer itself is not aligned whereas the engine needs
	 * aligned io_us. The address is aligned up the same way FIO does it.
	 */
	ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
			td->o.mem_align;

	/*
	 * Only the space really occupied by io_us is registered.
	 * The paddings included in td->orig_buffer_size are left out.
	 */
	io_us_size = (unsigned long long)td_max_bs(td) *
			(unsigned long long)td->o.iodepth;

	ret = rpma_mr_reg(ccd->peer, ccd->orig_buffer_aligned, io_us_size,
			usage, &ccd->orig_mr);
	if (ret)
		librpma_td_verror(td, ret, "rpma_mr_reg");

	return ret;
}

int librpma_fio_client_get_file_size(struct thread_data *td,
		struct fio_file *f)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;

	f->real_file_size = ccd->ws_size;
	fio_file_set_size_known(f);

	return 0;
}

/*
 * client_queue_sync -- execute a single io_u synchronously
 *
 * Posts the operation, busy-polls for completions until one other than
 * a SEND completion arrives, verifies it belongs to the io_u and finally
 * collects all outstanding SEND completions.
 *
 * Always returns FIO_Q_COMPLETED; a failure is signalled by
 * io_u->error set to -1.
 */
static enum fio_q_status client_queue_sync(struct thread_data *td,
		struct io_u *io_u)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	struct rpma_completion cmpl;
	unsigned io_u_index;
	int ret;

	/* post the operation */
	switch (io_u->ddir) {
	case DDIR_READ:
		/* an RDMA read always generating a completion */
		if (librpma_fio_client_io_read(td, io_u,
				RPMA_F_COMPLETION_ALWAYS))
			goto err;
		break;
	case DDIR_WRITE:
		/* an RDMA write followed by a flush covering just this io_u */
		if (librpma_fio_client_io_write(td, io_u) ||
		    ccd->flush(td, io_u, io_u, io_u->xfer_buflen))
			goto err;
		break;
	default:
		log_err("unsupported IO mode: %s\n", io_ddir_name(io_u->ddir));
		goto err;
	}

	/* poll until the completion of the operation itself shows up */
	for (;;) {
		ret = rpma_conn_completion_get(ccd->conn, &cmpl);
		if (ret == RPMA_E_NO_COMPLETION) {
			/* nothing yet - keep polling */
			continue;
		}
		if (ret != 0) {
			librpma_td_verror(td, ret, "rpma_conn_completion_get");
			goto err;
		}

		/* the operation has failed */
		if (cmpl.op_status != IBV_WC_SUCCESS)
			goto err;

		if (cmpl.op != RPMA_OP_SEND) {
			if (cmpl.op == RPMA_OP_RECV)
				++ccd->op_recv_completed;
			break;
		}

		/* SEND completions are only counted */
		++ccd->op_send_completed;
	}

	if (ccd->get_io_u_index(&cmpl, &io_u_index) != 1)
		goto err;

	if (io_u_index != io_u->index) {
		log_err(
			"no matching io_u for received completion found (io_u_index=%u)\n",
			io_u_index);
		goto err;
	}

	/* clean up the SQ - all SENDs have to be completed before leaving */
	if (librpma_fio_client_io_complete_all_sends(td))
		goto err;

	return FIO_Q_COMPLETED;

err:
	io_u->error = -1;
	return FIO_Q_COMPLETED;
}

/*
 * librpma_fio_client_queue -- accept an io_u for execution
 *
 * Returns FIO_Q_BUSY when the queue already holds td->o.iodepth io_us.
 * With td->o.sync_io set the io_u is executed right away by
 * client_queue_sync(). Otherwise it is appended to the queued[] array
 * and FIO_Q_QUEUED is returned.
 */
enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
		struct io_u *io_u)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;

	if (ccd->io_u_queued_nr == (int)td->o.iodepth)
		return FIO_Q_BUSY;

	if (!td->o.sync_io) {
		/* append the io_u to queued[] */
		ccd->io_us_queued[ccd->io_u_queued_nr++] = io_u;
		return FIO_Q_QUEUED;
	}

	return client_queue_sync(td, io_u);
}

/*
 * librpma_fio_client_commit -- post all queued io_us
 *
 * Posts an RDMA operation for every io_u stored in queued[], issues flushes
 * for the writes and then moves all the io_us from queued[] to flight[].
 *
 * Returns 0 on success. Returns -1 when the queue is not allocated, posting
 * any operation fails or an io_u has an unsupported direction. In these
 * cases the queues are left as they were.
 */
int librpma_fio_client_commit(struct thread_data *td)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	/*
	 * NOTE: once switched to RPMA_F_COMPLETION_ALWAYS the flags are not
	 * switched back for the reads posted later during the same call.
	 */
	int flags = RPMA_F_COMPLETION_ON_ERROR;
	struct timespec now;
	bool fill_time;
	int i;
	/* the first io_u and the total length of the not yet flushed writes */
	struct io_u *flush_first_io_u = NULL;
	unsigned long long int flush_len = 0;

	if (!ccd->io_us_queued)
		return -1;

	/* execute all io_us from queued[] */
	for (i = 0; i < ccd->io_u_queued_nr; i++) {
		struct io_u *io_u = ccd->io_us_queued[i];

		if (io_u->ddir == DDIR_READ) {
			/*
			 * request a completion for the last queued io_u and
			 * for a read directly followed by a write
			 */
			if (i + 1 == ccd->io_u_queued_nr ||
			    ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE)
				flags = RPMA_F_COMPLETION_ALWAYS;
			/* post an RDMA read operation */
			if (librpma_fio_client_io_read(td, io_u, flags))
				return -1;
		} else if (io_u->ddir == DDIR_WRITE) {
			/* post an RDMA write operation */
			if (librpma_fio_client_io_write(td, io_u))
				return -1;

			/* cache the first io_u in the sequence */
			if (flush_first_io_u == NULL)
				flush_first_io_u = io_u;

			/*
			 * the flush length is the sum of all io_u's creating
			 * the sequence
			 */
			flush_len += io_u->xfer_buflen;

			/*
			 * if io_u's are random the rpma_flush is required
			 * after each one of them
			 */
			if (!td_random(td)) {
				/*
				 * When the io_u's are sequential and
				 * the current io_u is not the last one and
				 * the next one is also a write operation
				 * the flush can be postponed by one io_u and
				 * cover all of them which build a continuous
				 * sequence.
				 */
				if ((i + 1 < ccd->io_u_queued_nr) &&
				    (ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE))
					continue;
			}

			/* flush all writes which build a continuous sequence */
			if (ccd->flush(td, flush_first_io_u, io_u, flush_len))
				return -1;

			/*
			 * reset the flush parameters in preparation for
			 * the next one
			 */
			flush_first_io_u = NULL;
			flush_len = 0;
		} else {
			log_err("unsupported IO mode: %s\n",
				io_ddir_name(io_u->ddir));
			return -1;
		}
	}

	/* a single timestamp is shared by all io_us of this commit */
	if ((fill_time = fio_fill_issue_time(td)))
		fio_gettime(&now, NULL);

	/* move executed io_us from queued[] to flight[] */
	for (i = 0; i < ccd->io_u_queued_nr; i++) {
		struct io_u *io_u = ccd->io_us_queued[i];

		/* FIO does not do this if the engine is asynchronous */
		if (fill_time)
			memcpy(&io_u->issue_time, &now, sizeof(now));

		/* move executed io_us from queued[] to flight[] */
		ccd->io_us_flight[ccd->io_u_flight_nr] = io_u;
		ccd->io_u_flight_nr++;

		/*
		 * FIO says:
		 * If an engine has the commit hook
		 * it has to call io_u_queued() itself.
		 */
		io_u_queued(td, io_u);
	}

	/* FIO does not do this if an engine has the commit hook. */
	io_u_mark_submit(td, ccd->io_u_queued_nr);
	/* queued[] is empty again */
	ccd->io_u_queued_nr = 0;

	return 0;
}

/*
 * client_getevent_process -- process at most one completion
 *
 * A completion of an io_u completes also all the io_us preceding it in
 * the flight[] queue. All of them are moved to the completed[] queue.
 *
 * RETURN VALUE
 * - > 0  - a number of completed io_us
 * -   0  - when no completion has been received
 * - (-1) - when an error occurred
 * - the value returned by ccd->get_io_u_index() when it is other than 1
 */
static int client_getevent_process(struct thread_data *td)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	struct rpma_completion cmpl;
	/* io_u->index of the io_u the completion belongs to */
	unsigned int io_u_index;
	/* # of io_us completed by this completion */
	int completed;
	int pos;
	int ret;

	/* try to collect a single completion */
	ret = rpma_conn_completion_get(ccd->conn, &cmpl);
	if (ret != 0) {
		/* having no completion at hand is not an error */
		if (ret == RPMA_E_NO_COMPLETION)
			return 0;

		librpma_td_verror(td, ret, "rpma_conn_completion_get");
		return -1;
	}

	/* the operation has failed */
	if (cmpl.op_status != IBV_WC_SUCCESS) {
		td->error = cmpl.op_status;
		return -1;
	}

	/* keep track of messaging completions */
	if (cmpl.op == RPMA_OP_SEND)
		++ccd->op_send_completed;
	else if (cmpl.op == RPMA_OP_RECV)
		++ccd->op_recv_completed;

	ret = ccd->get_io_u_index(&cmpl, &io_u_index);
	if (ret != 1)
		return ret;

	/* find the position of the completed io_u in flight[] */
	for (pos = 0; pos < ccd->io_u_flight_nr; pos++) {
		if (ccd->io_us_flight[pos]->index == io_u_index)
			break;
	}

	if (pos == ccd->io_u_flight_nr) {
		/* the completion does not match any io_u in flight */
		log_err(
			"no matching io_u for received completion found (io_u_index=%u)\n",
			io_u_index);
		return -1;
	}

	completed = pos + 1;

	/* append the head of flight[] to completed[] */
	for (pos = 0; pos < completed; pos++)
		ccd->io_us_completed[ccd->io_u_completed_nr++] =
				ccd->io_us_flight[pos];

	/* shift the rest of flight[] to the front */
	for (pos = completed; pos < ccd->io_u_flight_nr; pos++)
		ccd->io_us_flight[pos - completed] = ccd->io_us_flight[pos];
	ccd->io_u_flight_nr -= completed;

	return completed;
}

/*
 * librpma_fio_client_getevents -- collect completed io_us
 *
 * Busy-polls for completions until at least max io_us are collected, or
 * until no completion is available while at least min io_us have been
 * collected. In both cases the polling continues as long as fewer SENDs
 * than RECVs are completed. The timeout t is not used.
 *
 * Returns the number of collected io_us or -1 on error.
 */
int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
		unsigned int max, const struct timespec *t)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	/* # of io_us collected so far */
	int cmpl_num_total = 0;
	/* # of io_us collected by a single step */
	int cmpl_num;

	for (;;) {
		cmpl_num = client_getevent_process(td);
		if (cmpl_num < 0) {
			/* an error occurred */
			return -1;
		}

		if (cmpl_num == 0) {
			/*
			 * Nothing new has arrived. It is fine to stop as long
			 * as the minimum is reached and CQEs for SENDs keep up
			 * with CQEs for RECVs.
			 *
			 * Note: rpma_conn_completion_wait() could be used to
			 * reduce CPU consumption but it greatly increases
			 * the latency and makes the results less stable
			 * whereas the bandwidth stays more or less the same.
			 */
			if (cmpl_num_total >= min &&
			    ccd->op_send_completed >= ccd->op_recv_completed)
				break;
		} else {
			/* new completions collected */
			cmpl_num_total += cmpl_num;
		}

		/*
		 * The expected max may get exceeded when CQEs for RECVs come
		 * up faster than CQEs for SENDs since the loop must not end
		 * before SENDs catch up with RECVs.
		 */
		if (cmpl_num_total >= max &&
		    ccd->op_send_completed >= ccd->op_recv_completed)
			break;
	}

	/*
	 * When all posted SENDs and their responses (RECVs) are completed
	 * the situation equals the initial one so the counters are reset.
	 */
	if (ccd->op_send_posted == ccd->op_send_completed &&
			ccd->op_send_completed == ccd->op_recv_completed) {
		ccd->op_send_posted = 0;
		ccd->op_send_completed = 0;
		ccd->op_recv_completed = 0;
	}

	return cmpl_num_total;
}

/*
 * librpma_fio_client_event -- pop the oldest completed io_u
 *
 * Returns the first element of the completed[] queue and shifts
 * the remaining ones by one position. The event argument is not used.
 * The queue is not checked for being empty.
 */
struct io_u *librpma_fio_client_event(struct thread_data *td, int event)
{
	struct librpma_fio_client_data *ccd = td->io_ops_data;
	struct io_u *head = ccd->io_us_completed[0];
	int pos;

	/* close the gap left by the head of the queue */
	for (pos = 0; pos + 1 < ccd->io_u_completed_nr; pos++)
		ccd->io_us_completed[pos] = ccd->io_us_completed[pos + 1];
	ccd->io_u_completed_nr--;

	dprint_io_u(head, "client_event");

	return head;
}

char *librpma_fio_client_errdetails(struct io_u *io_u)
{
	/* get the string representation of an error */
	enum ibv_wc_status status = io_u->error;
	const char *status_str = ibv_wc_status_str(status);

	char *details = strdup(status_str);
	if (details == NULL) {
		fprintf(stderr, "Error: %s\n", status_str);
		fprintf(stderr, "Fatal error: out of memory. Aborting.\n");
		abort();
	}

	/* FIO frees the returned string when it becomes obsolete */
	return details;
}

int librpma_fio_server_init(struct thread_data *td)
{
	struct librpma_fio_options_values *o = td->eo;
	struct librpma_fio_server_data *csd;
	struct ibv_context *dev = NULL;
	enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
	int ret = -1;

	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
#ifdef FIO_INC_DEBUG
	if ((1UL << FD_NET) & fio_debug)
		log_level_aux = RPMA_LOG_LEVEL_INFO;
#endif

	/* configure logging thresholds to see more details */
	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);


	/* obtain an IBV context for a remote IP address */
	if ((ret = rpma_utils_get_ibv_context(o->server_ip,
			RPMA_UTIL_IBV_CONTEXT_LOCAL, &dev))) {
		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
		return -1;
	}

	/* allocate server's data */
	csd = calloc(1, sizeof(*csd));
	if (csd == NULL) {
		td_verror(td, errno, "calloc");
		return -1;
	}

	/* create a new peer object */
	if ((ret = rpma_peer_new(dev, &csd->peer))) {
		librpma_td_verror(td, ret, "rpma_peer_new");
		goto err_free_csd;
	}

	td->io_ops_data = csd;

	return 0;

err_free_csd:
	free(csd);

	return -1;
}

/*
 * librpma_fio_server_cleanup -- release the server's data
 *
 * Does nothing when td->io_ops_data is NULL. A failure to delete the peer
 * is only reported via td. td->io_ops_data itself is left unchanged.
 */
void librpma_fio_server_cleanup(struct thread_data *td)
{
	struct librpma_fio_server_data *server = td->io_ops_data;
	int ret;

	if (!server)
		return;

	/* delete the peer object */
	ret = rpma_peer_delete(&server->peer);
	if (ret)
		librpma_td_verror(td, ret, "rpma_peer_delete");

	free(server);
}

int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
		struct rpma_conn_cfg *cfg)
{
	struct librpma_fio_server_data *csd = td->io_ops_data;
	struct librpma_fio_options_values *o = td->eo;
	enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
	struct librpma_fio_workspace ws = {0};
	struct rpma_conn_private_data pdata;
	uint32_t max_msg_num;
	struct rpma_conn_req *conn_req;
	struct rpma_conn *conn;
	struct rpma_mr_local *mr;
	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
	struct rpma_ep *ep;
	size_t mem_size = td->o.size;
	size_t mr_desc_size;
	void *ws_ptr;
	bool is_dram;
	int usage_mem_type;
	int ret;

	if (!f->file_name) {
		log_err("fio: filename is not set\n");
		return -1;
	}

	/* start a listening endpoint at addr:port */
	if (librpma_fio_td_port(o->port, td, port_td))
		return -1;

	if ((ret = rpma_ep_listen(csd->peer, o->server_ip, port_td, &ep))) {
		librpma_td_verror(td, ret, "rpma_ep_listen");
		return -1;
	}

	is_dram = !strcmp(f->file_name, "malloc");
	if (is_dram) {
		/* allocation from DRAM using posix_memalign() */
		ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
	} else {
		/* allocation from PMEM using pmem_map_file() */
		ws_ptr = librpma_fio_allocate_pmem(td, f, mem_size, &csd->mem);
		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
	}

	if (ws_ptr == NULL)
		goto err_ep_shutdown;

	f->real_file_size = mem_size;

	if ((ret = rpma_mr_reg(csd->peer, ws_ptr, mem_size,
			RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
			RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
			usage_mem_type, &mr))) {
		librpma_td_verror(td, ret, "rpma_mr_reg");
		goto err_free;
	}

	if (!is_dram && f->filetype == FIO_TYPE_FILE) {
		ret = rpma_mr_advise(mr, 0, mem_size,
				IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
				IBV_ADVISE_MR_FLAG_FLUSH);
		if (ret) {
			librpma_td_verror(td, ret, "rpma_mr_advise");
			/* an invalid argument is an error */
			if (ret == RPMA_E_INVAL)
				goto err_mr_dereg;

			/* log_err used instead of log_info to avoid corruption of the JSON output */
			log_err("Note: having rpma_mr_advise(3) failed because of RPMA_E_NOSUPP or RPMA_E_PROVIDER may come with a performance penalty, but it is not a blocker for running the benchmark.\n");
		}
	}

	/* get size of the memory region's descriptor */
	if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
		librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
		goto err_mr_dereg;
	}

	/* verify size of the memory region's descriptor */
	if (mr_desc_size > LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE) {
		log_err(
			"size of the memory region's descriptor is too big (max=%i)\n",
			LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE);
		goto err_mr_dereg;
	}

	/* get the memory region's descriptor */
	if ((ret = rpma_mr_get_descriptor(mr, &ws.descriptor[0]))) {
		librpma_td_verror(td, ret, "rpma_mr_get_descriptor");
		goto err_mr_dereg;
	}

	if (cfg != NULL) {
		if ((ret = rpma_conn_cfg_get_rq_size(cfg, &max_msg_num))) {
			librpma_td_verror(td, ret, "rpma_conn_cfg_get_rq_size");
			goto err_mr_dereg;
		}

		/* verify whether iodepth fits into uint16_t */
		if (max_msg_num > UINT16_MAX) {
			log_err("fio: iodepth too big (%u > %u)\n",
				max_msg_num, UINT16_MAX);
			return -1;
		}

		ws.max_msg_num = max_msg_num;
	}

	/* prepare a workspace description */
	ws.direct_write_to_pmem = o->direct_write_to_pmem;
	ws.mr_desc_size = mr_desc_size;
	pdata.ptr = &ws;
	pdata.len = sizeof(ws);

	/* receive an incoming connection request */
	if ((ret = rpma_ep_next_conn_req(ep, cfg, &conn_req))) {
		librpma_td_verror(td, ret, "rpma_ep_next_conn_req");
		goto err_mr_dereg;
	}

	if (csd->prepare_connection && csd->prepare_connection(td, conn_req))
		goto err_req_delete;

	/* accept the connection request and obtain the connection object */
	if ((ret = rpma_conn_req_connect(&conn_req, &pdata, &conn))) {
		librpma_td_verror(td, ret, "rpma_conn_req_connect");
		goto err_req_delete;
	}

	/* wait for the connection to be established */
	if ((ret = rpma_conn_next_event(conn, &conn_event))) {
		librpma_td_verror(td, ret, "rpma_conn_next_event");
		goto err_conn_delete;
	} else if (conn_event != RPMA_CONN_ESTABLISHED) {
		log_err("rpma_conn_next_event returned an unexptected event\n");
		goto err_conn_delete;
	}

	/* end-point is no longer needed */
	(void) rpma_ep_shutdown(&ep);

	csd->ws_mr = mr;
	csd->ws_ptr = ws_ptr;
	csd->conn = conn;

	return 0;

err_conn_delete:
	(void) rpma_conn_delete(&conn);

err_req_delete:
	(void) rpma_conn_req_delete(&conn_req);

err_mr_dereg:
	(void) rpma_mr_dereg(&mr);

err_free:
	librpma_fio_free(&csd->mem);

err_ep_shutdown:
	(void) rpma_ep_shutdown(&ep);

	return -1;
}

int librpma_fio_server_close_file(struct thread_data *td, struct fio_file *f)
{
	struct librpma_fio_server_data *csd = td->io_ops_data;
	enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
	int rv = 0;
	int ret;

	/* wait for the connection to be closed */
	ret = rpma_conn_next_event(csd->conn, &conn_event);
	if (!ret && conn_event != RPMA_CONN_CLOSED) {
		log_err("rpma_conn_next_event returned an unexptected event\n");
		rv = -1;
	}

	if ((ret = rpma_conn_disconnect(csd->conn))) {
		librpma_td_verror(td, ret, "rpma_conn_disconnect");
		rv = -1;
	}

	if ((ret = rpma_conn_delete(&csd->conn))) {
		librpma_td_verror(td, ret, "rpma_conn_delete");
		rv = -1;
	}

	if ((ret = rpma_mr_dereg(&csd->ws_mr))) {
		librpma_td_verror(td, ret, "rpma_mr_dereg");
		rv = -1;
	}

	librpma_fio_free(&csd->mem);

	return rv;
}