the job to exit. The same format as rate is used for read vs
write seperation.
+max_latency=int If set, fio will exit the job if it exceeds this maximum
+ latency. It will exit with an ETIME error.
+
ratecycle=int Average bandwidth for 'rate' and 'ratemin' over this number
of milliseconds.
allows a range of CPUs. Say you wanted a binding to CPUs
1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
+numa_cpu_nodes=str Set this job running on specified NUMA nodes' CPUs. The
+ arguments allow comma delimited list of cpu numbers,
+ A-B ranges, or 'all'. Note, to enable numa options support,
+ export the following environment variables,
+ export EXTFLAGS+=" -DFIO_HAVE_LIBNUMA "
+ export EXTLIBS+=" -lnuma "
+
+numa_mem_policy=str Set this job's memory policy and corresponding NUMA
+		nodes. Format of the arguments:
+ <mode>[:<nodelist>]
+		`mode' is one of the following memory policies:
+ default, prefer, bind, interleave, local
+		For `default' and `local' memory policy, no node
+		needs to be specified.
+ For `prefer', only one node is allowed.
+		For `bind' and `interleave', it allows a comma delimited
+ list of numbers, A-B ranges, or 'all'.
+
startdelay=time Start this job the specified number of seconds after fio
has started. Only useful if the job file contains several
jobs, and you want to delay starting some jobs to a certain
---- ---
P Thread setup, but not started.
C Thread created.
-I Thread initialized, waiting.
+I Thread initialized, waiting or generating necessary data.
p Thread running pre-reading file(s).
R Running, doing sequential reads.
r Running, doing random reads.
--- /dev/null
+As specified by the COPYING file, fio is free software published under version
+2 of the GPL license. That covers the copying part of the license. By using fio,
+you are also promising to uphold the following moral obligations:
+
+- If you publish results that are done using fio, it must be clearly stated
+ that fio was used. The specific version should also be listed.
+
+- If you develop features or bug fixes for fio, they should be sent upstream
+ for inclusion into the main repository. This isn't specific to fio, that
+ is a general rule for any open source project. It's just the Right Thing
+ to do. Plus it means that you don't have to maintain the feature or change
+ internally. In the long run, this is saving you a lot of time.
+
+I would consider the above to fall under "common courtesy", but since
+people tend to have differing opinions of that, it doesn't hurt to spell out
+my expectations clearly.
+
-CC ?= gcc
+ifneq ($(origin CC), environment)
+CC = gcc
+endif
DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG
CPPFLAGS= -D_GNU_SOURCE -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 \
$(DEBUGFLAGS)
lib/num2str.c lib/ieee754.c $(wildcard crc/*.c) engines/cpu.c \
engines/mmap.c engines/sync.c engines/null.c engines/net.c \
memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
- cconv.c lib/prio_tree.c json.c
+ cconv.c lib/prio_tree.c lib/zipf.c json.c gettime-thread.c
ifeq ($(UNAME), Linux)
SOURCE += diskutil.c fifo.c blktrace.c helpers.c cgroup.c trim.c \
gclient.o gcompat.o cairo_text_helpers.o printing.o
T_SMALLOC_OBJS = t/stest.o
-T_SMALLOC_OBJS += mutex.o smalloc.o t/log.o gettime.o time.o
+T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o
T_SMALLOC_PROGS = t/stest
T_IEEE_OBJS = t/ieee754.o
T_IEEE_OBJS += lib/ieee754.o
T_IEEE_PROGS = t/ieee754
+T_ZIPF_OBJS = t/genzipf.o
+T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o
+T_ZIPF_PROGS = t/genzipf
+
T_OBJS = $(T_SMALLOC_OBJS)
T_OBJS += $(T_IEEE_OBJS)
+T_OBJS += $(T_ZIPF_OBJS)
+
+T_PROGS = $(T_SMALLOC_PROGS)
+T_PROGS += $(T_IEEE_PROGS)
+T_PROGS += $(T_ZIPF_PROGS)
ifneq ($(findstring $(MAKEFLAGS),s),s)
ifndef V
gfio: $(GFIO_OBJS)
$(QUIET_CC)$(CC) $(LIBS) -o gfio $(GFIO_OBJS) $(LIBS) $(GTK_LDFLAGS)
+t/genzipf: $(T_ZIPF_OBJS)
+ $(QUIET_CC)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS) $(LDFLAGS)
+
.depend: $(SOURCE)
$(QUIET_DEP)$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SOURCE) 1> .depend
readv/writev (with queuing emulation) mmap for mmap'ed
io, syslet-rw for syslet driven read/write, splice for
using splice/vmsplice, sg for direct SG_IO io, net
- for network io, or cpuio for a cycler burner load. sg
- only works on Linux on SCSI (or SCSI-like devices, such
- as usb-storage or sata/libata driven) devices. Fio also
- has a null io engine, which is mainly used for testing
+ for network io, rdma for RDMA io, or cpuio for a
+ cycler burner load. sg only works on Linux on
+ SCSI (or SCSI-like devices, such as usb-storage or
+ sata/libata driven) devices. Fio also has a null
+ io engine, which is mainly used for testing
fio itself.
iodepth=x For async io, allow 'x' ios in flight
ratecycle=x ratemin averaged over x msecs
cpumask=x Only allow job to run on CPUs defined by mask.
cpus_allowed=x Like 'cpumask', but allow text setting of CPU affinity.
+ numa_cpu_nodes=x,y-z Allow job to run on specified NUMA nodes' CPU.
+ numa_mem_policy=m:x,y-z Setup numa memory allocation policy.
+			'm' stands for policy, such as default, interleave,
+			bind, prefer, local. 'x, y-z' are numa node(s) for
+ memory allocation according to policy.
fsync=x If writing with buffered IO, fsync after every
'x' blocks have been written.
end_fsync=x If 'x', run fsync() after end-of-job.
1) A description of what you think the bug is
2) Environment (Linux distro version, kernel version). This is mostly
needed if it's a build bug.
-3) The fio version. The most useful is the git top level commit, you
- can find out by doing a cat .git/HEAD from the top level fio directory.
+3) The output from fio --version.
4) How to reproduce. Please include a full list of the parameters
passed to fio and the job file used (if any).
int groupid = 0;
unsigned int thread_number = 0;
+unsigned int stat_number = 0;
int shm_id = 0;
int temp_stall_ts;
unsigned long done_secs = 0;
int ret2, full;
enum fio_ddir ddir;
- if (td->terminate)
+ if (td->terminate || td->done)
break;
update_tv_cache(td);
if (ret < 0)
break;
- if (!ddir_rw_sum(bytes_done))
+ if (!ddir_rw_sum(bytes_done) && !(td->io_ops->flags & FIO_NOIO))
continue;
if (!in_ramp_time(td) && should_check_rate(td, bytes_done)) {
if (fio_pin_memory(td))
goto err;
+#ifdef FIO_HAVE_LIBNUMA
+ /* numa node setup */
+ if (td->o.numa_cpumask_set || td->o.numa_memmask_set) {
+ int ret;
+
+ if (numa_available() < 0) {
+ td_verror(td, errno, "Does not support NUMA API\n");
+ goto err;
+ }
+
+ if (td->o.numa_cpumask_set) {
+ ret = numa_run_on_node_mask(td->o.numa_cpunodesmask);
+ if (ret == -1) {
+ td_verror(td, errno, \
+ "numa_run_on_node_mask failed\n");
+ goto err;
+ }
+ }
+
+ if (td->o.numa_memmask_set) {
+
+ switch (td->o.numa_mem_mode) {
+ case MPOL_INTERLEAVE:
+ numa_set_interleave_mask(td->o.numa_memnodesmask);
+ break;
+ case MPOL_BIND:
+ numa_set_membind(td->o.numa_memnodesmask);
+ break;
+ case MPOL_LOCAL:
+ numa_set_localalloc();
+ break;
+ case MPOL_PREFERRED:
+ numa_set_preferred(td->o.numa_mem_prefer_node);
+ break;
+ case MPOL_DEFAULT:
+ default:
+ break;
+ }
+
+ }
+ }
+#endif
+
/*
* May alter parameters that init_io_u() will use, so we need to
* do this first.
o->softrandommap = le32_to_cpu(top->softrandommap);
o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
o->fsync_on_close = le32_to_cpu(top->fsync_on_close);
+ o->random_distribution = le32_to_cpu(top->random_distribution);
+ o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i));
+ o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i));
o->hugepage_size = le32_to_cpu(top->hugepage_size);
o->rw_min_bs = le32_to_cpu(top->rw_min_bs);
o->thinktime = le32_to_cpu(top->thinktime);
o->loops = le32_to_cpu(top->loops);
o->mem_type = le32_to_cpu(top->mem_type);
o->mem_align = le32_to_cpu(top->mem_align);
+ o->max_latency = le32_to_cpu(top->max_latency);
o->stonewall = le32_to_cpu(top->stonewall);
o->new_group = le32_to_cpu(top->new_group);
o->numjobs = le32_to_cpu(top->numjobs);
top->softrandommap = cpu_to_le32(o->softrandommap);
top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
top->fsync_on_close = cpu_to_le32(o->fsync_on_close);
+ top->random_distribution = cpu_to_le32(o->random_distribution);
+ top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f));
+ top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f));
top->hugepage_size = cpu_to_le32(o->hugepage_size);
top->rw_min_bs = cpu_to_le32(o->rw_min_bs);
top->thinktime = cpu_to_le32(o->thinktime);
top->loops = cpu_to_le32(o->loops);
top->mem_type = cpu_to_le32(o->mem_type);
top->mem_align = cpu_to_le32(o->mem_align);
+ top->max_latency = cpu_to_le32(o->max_latency);
top->stonewall = cpu_to_le32(o->stonewall);
top->new_group = cpu_to_le32(o->new_group);
top->numjobs = cpu_to_le32(o->numjobs);
int sum_stat_clients;
static int sum_stat_nr;
+static int do_output_all_clients;
#define FIO_CLIENT_HASH_BITS 7
#define FIO_CLIENT_HASH_SZ (1 << FIO_CLIENT_HASH_BITS)
if (client->ini_file)
free(client->ini_file);
+ if (!client->did_stat)
+ sum_stat_clients -= client->nr_stat;
+
free(client);
}
client->ops->removed(client);
nr_clients--;
- sum_stat_clients--;
-
fio_put_client(client);
}
struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
show_thread_status(&p->ts, &p->rs);
+ client->did_stat = 1;
- if (sum_stat_clients == 1)
+ if (!do_output_all_clients)
return;
sum_thread_stats(&client_ts, &p->ts, sum_stat_nr);
struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload;
client->state = Client_started;
- client->jobs = pdu->jobs;
+ client->jobs = le32_to_cpu(pdu->jobs);
+ client->nr_stat = le32_to_cpu(pdu->stat_outputs);
+
+ if (sum_stat_clients > 1)
+ do_output_all_clients = 1;
+
+ sum_stat_clients += client->nr_stat;
}
static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd)
pfds = malloc(nr_clients * sizeof(struct pollfd));
- sum_stat_clients = nr_clients;
init_thread_stat(&client_ts);
init_group_run_stat(&client_gs);
int is_sock;
int disk_stats_shown;
unsigned int jobs;
+ unsigned int nr_stat;
int error;
int signal;
int ipv6;
int sent_job;
+ int did_stat;
uint32_t type;
uint32_t thread_number;
#define __must_check
#endif
-#define uninitialized_var(x) x = x
-
#ifndef _weak
#ifndef __CYGWIN__
#define _weak __attribute__((weak))
ret = ioctl(f->fd, EXT4_IOC_MOVE_EXT, &me);
len = me.moved_len * ed->bsz;
- if (io_u->file && len >= 0 && ddir_rw(io_u->ddir))
+ if (io_u->file && len && ddir_rw(io_u->ddir))
io_u->file->file_pos = io_u->offset + len;
if (len > io_u->xfer_buflen)
*
* This I/O engine is disabled by default. To enable it, execute:
*
- * $ export EXTFLAGS="-DFIO_HAVE_RDMA"
- * $ export EXTLIBS="-libverbs -lrdmacm"
+ * $ export EXTFLAGS+=" -DFIO_HAVE_RDMA "
+ * $ export EXTLIBS+=" -libverbs -lrdmacm "
*
* before running make. You will need the Linux RDMA software as well, either
* from your Linux distributor or directly from openfabrics.org:
*
* http://www.openfabrics.org/downloads/OFED/
*
+ * Exchanging steps of RDMA ioengine control messages:
+ * 1. client side sends test mode (RDMA_WRITE/RDMA_READ/SEND)
+ * to server side.
+ * 2. server side parses test mode, and sends back confirmation
+ * to client side. In RDMA WRITE/READ test, this confirmation
+ * includes memory information, such as rkey, address.
+ * 3. client side initiates test loop.
+ * 4. In RDMA WRITE/READ test, client side sends a completion
+ * notification to server side. Server side updates its
+ * td->done as true.
+ *
*/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "../fio.h"
+#include "../hash.h"
#ifdef FIO_HAVE_RDMA
#include <rdma/rdma_cma.h>
#include <infiniband/arch.h>
-#define FIO_RDMA_MAX_IO_DEPTH 128
+#define FIO_RDMA_MAX_IO_DEPTH 512
enum rdma_io_mode {
FIO_RDMA_UNKNOWN = 0,
int io_u_flight_nr;
struct io_u **io_us_completed;
int io_u_completed_nr;
+
+ struct frand_state rand_state;
};
static int client_recv(struct thread_data *td, struct ibv_wc *wc)
rd->pd = ibv_alloc_pd(rd->child_cm_id->verbs);
else
rd->pd = ibv_alloc_pd(rd->cm_id->verbs);
+
if (rd->pd == NULL) {
log_err("fio: ibv_alloc_pd fail\n");
return 1;
/* setup work request */
/* recv wq */
rd->recv_sgl.addr = (uint64_t) (unsigned long)&rd->recv_buf;
- rd->recv_sgl.length = sizeof rd->recv_buf;
+ rd->recv_sgl.length = sizeof(rd->recv_buf);
rd->recv_sgl.lkey = rd->recv_mr->lkey;
rd->rq_wr.sg_list = &rd->recv_sgl;
rd->rq_wr.num_sge = 1;
/* send wq */
rd->send_sgl.addr = (uint64_t) (unsigned long)&rd->send_buf;
- rd->send_sgl.length = sizeof rd->send_buf;
+ rd->send_sgl.length = sizeof(rd->send_buf);
rd->send_sgl.lkey = rd->send_mr->lkey;
rd->sq_wr.opcode = IBV_WR_SEND;
enum rdma_cm_event_type wait_event)
{
struct rdmaio_data *rd = td->io_ops->data;
-
- int ret;
struct rdma_cm_event *event;
+ int ret;
ret = rdma_get_cm_event(channel, &event);
if (ret) {
- log_err("fio: rdma_get_cm_event");
+ log_err("fio: rdma_get_cm_event: %d\n", ret);
return 1;
}
int i;
io_u = rd->io_us_completed[0];
- for (i = 0; i < rd->io_u_completed_nr - 1; i++) {
+ for (i = 0; i < rd->io_u_completed_nr - 1; i++)
rd->io_us_completed[i] = rd->io_us_completed[i + 1];
- }
+
rd->io_u_completed_nr--;
dprint_io_u(io_u, "fio_rdmaio_event");
unsigned int max, struct timespec *t)
{
struct rdmaio_data *rd = td->io_ops->data;
- int r;
enum ibv_wc_opcode comp_opcode;
comp_opcode = IBV_WC_RDMA_WRITE;
struct ibv_cq *ev_cq;
void *ev_ctx;
- int ret;
-
- r = 0;
+ int ret, r = 0;
switch (rd->rdma_protocol) {
case FIO_RDMA_MEM_WRITE:
enum ibv_wc_opcode comp_opcode;
comp_opcode = IBV_WC_RDMA_WRITE;
#endif
- int i, index;
+ int i;
+ long index;
struct rdma_io_u_data *r_io_u_d;
r_io_u_d = NULL;
case FIO_RDMA_MEM_WRITE:
/* compose work request */
r_io_u_d = io_us[i]->engine_data;
- index = rand() % rd->rmt_nr;
+ index = __rand(&rd->rand_state) % rd->rmt_nr;
r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_WRITE;
r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
r_io_u_d->sq_wr.wr.rdma.remote_addr = \
case FIO_RDMA_MEM_READ:
/* compose work request */
r_io_u_d = io_us[i]->engine_data;
- index = rand() % rd->rmt_nr;
+ index = __rand(&rd->rand_state) % rd->rmt_nr;
r_io_u_d->sq_wr.opcode = IBV_WR_RDMA_READ;
r_io_u_d->sq_wr.wr.rdma.rkey = rd->rmt_us[index].rkey;
r_io_u_d->sq_wr.wr.rdma.remote_addr = \
io_us = rd->io_us_queued;
do {
/* RDMA_WRITE or RDMA_READ */
- if (rd->is_client) {
+ if (rd->is_client)
ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr);
- } else if (!rd->is_client) {
+ else if (!rd->is_client)
ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr);
- } else
+ else
ret = 0; /* must be a SYNC */
if (ret > 0) {
struct rdma_conn_param conn_param;
struct ibv_send_wr *bad_wr;
- memset(&conn_param, 0, sizeof conn_param);
+ memset(&conn_param, 0, sizeof(conn_param));
conn_param.responder_resources = 1;
conn_param.initiator_depth = 1;
conn_param.retry_count = 10;
/* wait for remote MR info from server side */
rdma_poll_wait(td, IBV_WC_RECV);
+	/* In SEND/RECV test, it's a good practice to setup the iodepth
+	 * of the RECV side deeper than that of the SEND side to
+	 * avoid RNR (receiver not ready) error. The
+	 * SEND side may send so many unsolicited messages before
+	 * RECV side commits sufficient recv buffers into recv queue.
+	 * This may lead to RNR error. Here, SEND side pauses for a while
+	 * during which RECV side commits sufficient recv buffers.
+	 */
+ usleep(500000);
+
return 0;
}
struct ibv_send_wr *bad_wr;
/* rdma_accept() - then wait for accept success */
- memset(&conn_param, 0, sizeof conn_param);
+ memset(&conn_param, 0, sizeof(conn_param));
conn_param.responder_resources = 1;
conn_param.initiator_depth = 1;
rdma_disconnect(rd->cm_id);
else {
rdma_disconnect(rd->child_cm_id);
-/* rdma_disconnect(rd->cm_id); */
+#if 0
+ rdma_disconnect(rd->cm_id);
+#endif
}
-/* if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0)
- {
- log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n");
- return 1;
- }*/
+#if 0
+ if (get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_DISCONNECTED) != 0) {
+ log_err("fio: wait for RDMA_CM_EVENT_DISCONNECTED\n");
+ return 1;
+ }
+#endif
- ibv_destroy_qp(rd->qp);
ibv_destroy_cq(rd->cq);
+ ibv_destroy_qp(rd->qp);
if (rd->is_client == 1)
rdma_destroy_id(rd->cm_id);
{
struct rdmaio_data *rd = td->io_ops->data;
struct ibv_recv_wr *bad_wr;
+ int err;
rd->addr.sin_family = AF_INET;
rd->addr.sin_port = htons(port);
}
/* resolve route */
- if (rdma_resolve_addr(rd->cm_id, NULL,
- (struct sockaddr *)&rd->addr, 2000) != 0) {
- log_err("fio: rdma_resolve_addr");
+ err = rdma_resolve_addr(rd->cm_id, NULL, (struct sockaddr *)&rd->addr, 2000);
+ if (err != 0) {
+ log_err("fio: rdma_resolve_addr: %d\n", err);
return 1;
}
- if (get_next_channel_event
- (td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED)
- != 0) {
- log_err("fio: get_next_channel_event");
+ err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
+ if (err != 0) {
+ log_err("fio: get_next_channel_event: %d\n", err);
return 1;
}
/* resolve route */
- if (rdma_resolve_route(rd->cm_id, 2000) != 0) {
- log_err("fio: rdma_resolve_route");
+ err = rdma_resolve_route(rd->cm_id, 2000);
+ if (err != 0) {
+ log_err("fio: rdma_resolve_route: %d\n", err);
return 1;
}
- if (get_next_channel_event
- (td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED) != 0) {
- log_err("fio: get_next_channel_event");
+ err = get_next_channel_event(td, rd->cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
+ if (err != 0) {
+ log_err("fio: get_next_channel_event: %d\n", err);
return 1;
}
return 1;
/* post recv buf */
- if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
- log_err("fio: ibv_post_recv fail\n");
+ err = ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr);
+ if (err != 0) {
+ log_err("fio: ibv_post_recv fail: %d\n", err);
return 1;
}
static int fio_rdmaio_init(struct thread_data *td)
{
struct rdmaio_data *rd = td->io_ops->data;
+ struct flist_head *entry;
+ unsigned int max_bs;
unsigned int port;
char host[64], buf[128];
char *sep, *portp, *modep;
- int ret;
+ int ret, i = 0;
struct rlimit rl;
if (td_rw(td)) {
ret = fio_rdmaio_setup_connect(td, host, port);
}
- struct flist_head *entry;
- unsigned int max_bs;
max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
/* register each io_u in the free list */
- int i = 0;
flist_for_each(entry, &td->io_u_freelist) {
struct io_u *io_u = flist_entry(entry, struct io_u, list);
rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
rd->send_buf.rmt_us[i].size = htonl(max_bs);
-/* log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n",
- io_u->mr->rkey, io_u->buf, max_bs); */
+#if 0
+		log_info("fio: Send rkey %x addr %" PRIx64 " len %d to client\n", io_u->mr->rkey, io_u->buf, max_bs);
+#endif
i++;
}
{
struct rdmaio_data *rd = td->io_ops->data;
- if (rd) {
-/* if (nd->listenfd != -1)
- close(nd->listenfd);
- if (nd->pipes[0] != -1)
- close(nd->pipes[0]);
- if (nd->pipes[1] != -1)
- close(nd->pipes[1]);
-*/
+ if (rd)
free(rd);
- }
}
static int fio_rdmaio_setup(struct thread_data *td)
struct rdmaio_data *rd;
if (!td->io_ops->data) {
- rd = malloc(sizeof(*rd));;
+ rd = malloc(sizeof(*rd));
memset(rd, 0, sizeof(*rd));
+ init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME);
td->io_ops->data = rd;
}
log_err(" make sure OFED is installed,\n");
log_err(" $ ofed_info\n");
log_err(" then try to make fio as follows:\n");
- log_err(" $ export EXTFLAGS=\"-DFIO_HAVE_RDMA\"\n");
- log_err(" $ export EXTLIBS=\"-libverbs -lrdmacm\"\n");
+ log_err(" $ export EXTFLAGS+=\" -DFIO_HAVE_RDMA \"\n");
+ log_err(" $ export EXTLIBS+=\" -libverbs -lrdmacm \"\n");
log_err(" $ make clean && make\n");
return 1;
}
static int fio_spliceio_queue(struct thread_data *td, struct io_u *io_u)
{
struct spliceio_data *sd = td->io_ops->data;
- int uninitialized_var(ret);
+ int ret = 0;
fio_ro_check(td, io_u);
c = 'C';
break;
case TD_INITIALIZED:
+ case TD_SETTING_UP:
c = 'I';
break;
case TD_NOT_CREATED:
} else if (td->runstate == TD_RAMP) {
je->nr_running++;
je->nr_ramp++;
- } else if (td->runstate < TD_RUNNING)
+ } else if (td->runstate == TD_SETTING_UP)
+ je->nr_running++;
+ else if (td->runstate < TD_RUNNING)
je->nr_pending++;
if (je->elapsed_sec >= 3)
--- /dev/null
+[global]
+ioengine=cpuio
+time_based
+runtime=10
+
+[burn50percent]
+cpuload=50
+
--- /dev/null
+; setup numa policy for each thread
+; 'numactl --show' to determine the maximum numa nodes
+[global]
+ioengine=libaio
+buffered=0
+rw=randread
+bs=512K
+iodepth=16
+size=512m
+filename=/dev/sdb1
+
+; Fix memory blocks (512K * 16) in numa node 0
+[job1]
+numa_cpu_nodes=0
+numa_mem_policy=bind:0
+
+; Interleave memory blocks (512K * 16) in numa node 0 and 1
+[job2]
+numa_cpu_nodes=0-1
+numa_mem_policy=interleave:0-1
+
--- /dev/null
+# Example job file for using a zipf distribution instead
+# of a purely random workload where each block is read
+# or written once.
+[job]
+ioengine=null
+rw=randread
+norandommap
+size=1280m
+bs=4k
+random_distribution=zipf:0.5
#include "compiler/compiler.h"
#include "io_ddir.h"
#include "flist.h"
+#include "lib/zipf.h"
/*
* The type of object we are working on
unsigned long last_free_lookup;
unsigned failed_rands;
+ /*
+ * Used for zipf random distribution
+ */
+ struct zipf_state zipf;
+
int references;
enum fio_file_flags flags;
#include "smalloc.h"
#include "filehash.h"
#include "os/os.h"
+#include "hash.h"
#ifdef FIO_HAVE_LINUX_FALLOCATE
#include <linux/falloc.h>
return 1;
}
+static int __init_rand_distribution(struct thread_data *td, struct fio_file *f)
+{
+ unsigned int range_size, seed;
+ unsigned long nranges;
+
+ range_size = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]);
+
+ nranges = (f->real_file_size + range_size - 1) / range_size;
+
+ seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
+ if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+ zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
+ else
+ pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+
+ return 1;
+}
+
+static int init_rand_distribution(struct thread_data *td)
+{
+ struct fio_file *f;
+ unsigned int i;
+ int state;
+
+ if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
+ return 0;
+
+ state = td->runstate;
+ td_set_runstate(td, TD_SETTING_UP);
+ for_each_file(td, f, i)
+ __init_rand_distribution(td, f);
+ td_set_runstate(td, state);
+
+ return 1;
+}
+
int init_random_map(struct thread_data *td)
{
unsigned long long blocks, num_maps;
struct fio_file *f;
unsigned int i;
+ if (init_rand_distribution(td))
+ return 0;
if (td->o.norandommap || !td_random(td))
return 0;
Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of
milliseconds. Default: 1000ms.
.TP
+.BI max_latency \fR=\fPint
+If set, fio will exit the job if it exceeds this maximum latency. It will exit
+with an ETIME error.
+.TP
.BI cpumask \fR=\fPint
Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job
may run on. See \fBsched_setaffinity\fR\|(2).
.BI cpus_allowed \fR=\fPstr
Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
.TP
+.BI numa_cpu_nodes \fR=\fPstr
+Set this job running on specified NUMA nodes' CPUs. The arguments allow
+comma delimited list of cpu numbers, A-B ranges, or 'all'.
+.TP
+.BI numa_mem_policy \fR=\fPstr
+Set this job's memory policy and corresponding NUMA nodes. Format of
+the arguments:
+.RS
+.TP
+.B <mode>[:<nodelist>]
+.TP
+.B mode
+is one of the following memory policies:
+.TP
+.B default, prefer, bind, interleave, local
+.TP
+.RE
+For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR
+needs to be specified. For \fBprefer\fR, only one node is
+allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows
+comma delimited list of numbers, A-B ranges, or 'all'.
+.TP
.BI startdelay \fR=\fPint
Delay start of job for the specified number of seconds.
.TP
#include <sys/asynch.h>
#endif
+#ifdef FIO_HAVE_LIBNUMA
+#include <linux/mempolicy.h>
+#include <numa.h>
+
+/*
+ * "local" is pseudo-policy
+ */
+#define MPOL_LOCAL MPOL_MAX
+#endif
+
/*
* offset generator types
*/
extern int exitall_on_terminate;
extern unsigned int thread_number;
+extern unsigned int stat_number;
extern int shm_id;
extern int groupid;
extern int output_format;
#define REAL_MAX_JOBS 2048
-static inline enum error_type td_error_type(enum fio_ddir ddir, int err)
+static inline enum error_type_bit td_error_type(enum fio_ddir ddir, int err)
{
if (err == EILSEQ)
return ERROR_TYPE_VERIFY_BIT;
TD_NOT_CREATED = 0,
TD_CREATED,
TD_INITIALIZED,
+ TD_SETTING_UP,
TD_RAMP,
TD_RUNNING,
TD_PRE_READING,
FIO_OUTPUT_NORMAL,
};
+enum {
+ FIO_RAND_DIST_RANDOM = 0,
+ FIO_RAND_DIST_ZIPF,
+ FIO_RAND_DIST_PARETO,
+};
+
#endif
--- /dev/null
+#include <unistd.h>
+#include <math.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include "fio.h"
+#include "smalloc.h"
+
+struct timeval *fio_tv;
+int fio_gtod_offload = 0;
+int fio_gtod_cpu = -1;
+static pthread_t gtod_thread;
+
+void fio_gtod_init(void)
+{
+ fio_tv = smalloc(sizeof(struct timeval));
+ assert(fio_tv);
+}
+
+static void fio_gtod_update(void)
+{
+ gettimeofday(fio_tv, NULL);
+}
+
+static void *gtod_thread_main(void *data)
+{
+ struct fio_mutex *mutex = data;
+
+ fio_mutex_up(mutex);
+
+ /*
+ * As long as we have jobs around, update the clock. It would be nice
+ * to have some way of NOT hammering that CPU with gettimeofday(),
+ * but I'm not sure what to use outside of a simple CPU nop to relax
+ * it - we don't want to lose precision.
+ */
+ while (threads) {
+ fio_gtod_update();
+ nop;
+ }
+
+ return NULL;
+}
+
+int fio_start_gtod_thread(void)
+{
+ struct fio_mutex *mutex;
+ pthread_attr_t attr;
+ int ret;
+
+ mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
+ if (!mutex)
+ return 1;
+
+ pthread_attr_init(&attr);
+ pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
+ ret = pthread_create(>od_thread, &attr, gtod_thread_main, NULL);
+ pthread_attr_destroy(&attr);
+ if (ret) {
+ log_err("Can't create gtod thread: %s\n", strerror(ret));
+ goto err;
+ }
+
+ ret = pthread_detach(gtod_thread);
+ if (ret) {
+ log_err("Can't detatch gtod thread: %s\n", strerror(ret));
+ goto err;
+ }
+
+ dprint(FD_MUTEX, "wait on startup_mutex\n");
+ fio_mutex_down(mutex);
+ dprint(FD_MUTEX, "done waiting on startup_mutex\n");
+err:
+ fio_mutex_remove(mutex);
+ return ret;
+}
+
+
static struct timeval last_tv;
static int last_tv_valid;
-static struct timeval *fio_tv;
-int fio_gtod_offload = 0;
-int fio_gtod_cpu = -1;
-static pthread_t gtod_thread;
-
enum fio_cs fio_clock_source = FIO_PREFERRED_CLOCK_SOURCE;
#ifdef FIO_DEBUG_TIME
calibrate_cpu_clock();
}
-void fio_gtod_init(void)
+unsigned long long utime_since(struct timeval *s, struct timeval *e)
{
- fio_tv = smalloc(sizeof(struct timeval));
- assert(fio_tv);
+ long sec, usec;
+ unsigned long long ret;
+
+ sec = e->tv_sec - s->tv_sec;
+ usec = e->tv_usec - s->tv_usec;
+ if (sec > 0 && usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+
+ /*
+ * time warp bug on some kernels?
+ */
+ if (sec < 0 || (sec == 0 && usec < 0))
+ return 0;
+
+ ret = sec * 1000000ULL + usec;
+
+ return ret;
}
-static void fio_gtod_update(void)
+unsigned long long utime_since_now(struct timeval *s)
{
- gettimeofday(fio_tv, NULL);
+ struct timeval t;
+
+ fio_gettime(&t, NULL);
+ return utime_since(s, &t);
}
-static void *gtod_thread_main(void *data)
+unsigned long mtime_since(struct timeval *s, struct timeval *e)
{
- struct fio_mutex *mutex = data;
+ long sec, usec, ret;
- fio_mutex_up(mutex);
-
- /*
- * As long as we have jobs around, update the clock. It would be nice
- * to have some way of NOT hammering that CPU with gettimeofday(),
- * but I'm not sure what to use outside of a simple CPU nop to relax
- * it - we don't want to lose precision.
- */
- while (threads) {
- fio_gtod_update();
- nop;
+ sec = e->tv_sec - s->tv_sec;
+ usec = e->tv_usec - s->tv_usec;
+ if (sec > 0 && usec < 0) {
+ sec--;
+ usec += 1000000;
}
- return NULL;
+ if (sec < 0 || (sec == 0 && usec < 0))
+ return 0;
+
+ sec *= 1000UL;
+ usec /= 1000UL;
+ ret = sec + usec;
+
+ return ret;
}
-int fio_start_gtod_thread(void)
+unsigned long mtime_since_now(struct timeval *s)
{
- struct fio_mutex *mutex;
- pthread_attr_t attr;
- int ret;
-
- mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
- if (!mutex)
- return 1;
-
- pthread_attr_init(&attr);
- pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
- ret = pthread_create(>od_thread, &attr, gtod_thread_main, NULL);
- pthread_attr_destroy(&attr);
- if (ret) {
- log_err("Can't create gtod thread: %s\n", strerror(ret));
- goto err;
- }
+ struct timeval t;
+ void *p = __builtin_return_address(0);
- ret = pthread_detach(gtod_thread);
- if (ret) {
- log_err("Can't detatch gtod thread: %s\n", strerror(ret));
- goto err;
- }
+ fio_gettime(&t, p);
+ return mtime_since(s, &t);
+}
- dprint(FD_MUTEX, "wait on startup_mutex\n");
- fio_mutex_down(mutex);
- dprint(FD_MUTEX, "done waiting on startup_mutex\n");
-err:
- fio_mutex_remove(mutex);
- return ret;
+unsigned long time_since_now(struct timeval *s)
+{
+ return mtime_since_now(s) / 1000;
}
extern void fio_clock_init(void);
extern int fio_start_gtod_thread(void);
+extern struct timeval *fio_tv;
+
#endif
#error Define GOLDEN_RATIO_PRIME for your wordsize.
#endif
-static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+#define GR_PRIME_64 0x9e37fffffffc0001UL
+
+static inline unsigned long __hash_long(unsigned long val)
{
unsigned long hash = val;
hash *= GOLDEN_RATIO_PRIME;
#endif
+ return hash;
+}
+
+static inline unsigned long hash_long(unsigned long val, unsigned int bits)
+{
/* High bits are more random, so use them. */
- return hash >> (BITS_PER_LONG - bits);
+ return __hash_long(val) >> (BITS_PER_LONG - bits);
+}
+
+static inline uint64_t __hash_u64(uint64_t val)
+{
+ return val * GR_PRIME_64;
}
static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
profile_add_hooks(td);
td->thread_number = thread_number;
+
+ if (!parent || !parent->o.group_reporting)
+ stat_number++;
+
return td;
}
return 1;
}
-static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
- enum fio_ddir ddir, unsigned long long *b)
+static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
+ enum fio_ddir ddir, unsigned long long *b)
{
unsigned long long rmax, r, lastb;
int loops = 5;
return 0;
}
+static int __get_next_rand_offset_zipf(struct thread_data *td,
+ struct fio_file *f, enum fio_ddir ddir,
+ unsigned long long *b)
+{
+ *b = zipf_next(&f->zipf);
+ return 0;
+}
+
+static int __get_next_rand_offset_pareto(struct thread_data *td,
+ struct fio_file *f, enum fio_ddir ddir,
+ unsigned long long *b)
+{
+ *b = pareto_next(&f->zipf);
+ return 0;
+}
+
+static int get_next_rand_offset(struct thread_data *td, struct fio_file *f,
+ enum fio_ddir ddir, unsigned long long *b)
+{
+ if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
+ return __get_next_rand_offset(td, f, ddir, b);
+ else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+ return __get_next_rand_offset_zipf(td, f, ddir, b);
+ else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
+ return __get_next_rand_offset_pareto(td, f, ddir, b);
+
+ log_err("fio: unknown random distribution: %d\n", td->o.random_distribution);
+ return 1;
+}
+
static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
enum fio_ddir ddir, unsigned long long *b)
{
static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u)
{
const int ddir = io_u->ddir;
- unsigned int uninitialized_var(buflen);
+ unsigned int buflen = 0;
unsigned int minbs, maxbs;
unsigned long r, rand_max;
struct io_completion_data *icd,
const enum fio_ddir idx, unsigned int bytes)
{
- unsigned long uninitialized_var(lusec);
+ unsigned long lusec = 0;
if (!td->o.disable_clat || !td->o.disable_bw)
lusec = utime_since(&io_u->issue_time, &icd->time);
tusec = utime_since(&io_u->start_time, &icd->time);
add_lat_sample(td, idx, tusec, bytes);
+
+ if (td->o.max_latency && tusec > td->o.max_latency) {
+ if (!td->error)
+ log_err("fio: latency of %lu usec exceeds specified max (%u usec)\n", tusec, td->o.max_latency);
+ td_verror(td, ETIMEDOUT, "max latency exceeded");
+ icd->error = ETIMEDOUT;
+ }
}
if (!td->o.disable_clat) {
static void io_completed(struct thread_data *td, struct io_u *io_u,
struct io_completion_data *icd)
{
- /*
- * Older gcc's are too dumb to realize that usec is always used
- * initialized, silence that warning.
- */
- unsigned long uninitialized_var(usec);
struct fio_file *f;
dprint_io_u(io_u, "io complete");
return value;
}
+/*
+ * Duplicate a string, inserting a backslash before every '\' or '"' so the
+ * result is safe to embed in a JSON string value.  Returns NULL for an empty
+ * input or on allocation failure (the caller treats NULL as failure).
+ */
+static char *strdup_escape(const char *str)
+{
+	const char *input = str;
+	char *p, *ret;
+	int escapes;
+
+	if (!strlen(str))
+		return NULL;
+
+	/* Count the characters that need a preceding escape */
+	escapes = 0;
+	while ((input = strpbrk(input, "\\\"")) != NULL) {
+		escapes++;
+		input++;
+	}
+
+	p = ret = malloc(strlen(str) + escapes + 1);
+	if (!ret)
+		return NULL;
+	while (*str) {
+		if (*str == '\\' || *str == '\"')
+			*p++ = '\\';
+		*p++ = *str++;
+	}
+	*p = '\0';
+
+	return ret;
+}
+
+/*
+ * Valid JSON strings must escape '"' and '\' with a preceding '\'
+ */
static struct json_value *json_create_value_string(const char *str)
{
struct json_value *value = malloc(sizeof(struct json_value));
if (value) {
value->type = JSON_TYPE_STRING;
- value->string = strdup(str);
+ value->string = strdup_escape(str);
if (!value->string) {
free(value);
value = NULL;
n = n->rb_left;
return n;
}
+
+/*
+ * In-order successor: return the node with the smallest key greater than
+ * @node, or NULL if @node is the last node (or not in a tree).
+ */
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (RB_EMPTY_NODE(node))
+		return NULL;
+
+	/*
+	 * If we have a right-hand child, go down and then left as far
+	 * as we can.
+	 */
+	if (node->rb_right) {
+		node = node->rb_right;
+		while (node->rb_left)
+			node=node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * No right-hand children. Everything down and left is smaller than us,
+	 * so any 'next' node must be in the general direction of our parent.
+	 * Go up the tree; any time the ancestor is a right-hand child of its
+	 * parent, keep going up. First time it's a left-hand child of its
+	 * parent, said parent is our 'next' node.
+	 */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_first(struct rb_root *);
+extern struct rb_node *rb_next(const struct rb_node *);
static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
struct rb_node ** rb_link)
--- /dev/null
+#include <math.h>
+#include <string.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include "ieee754.h"
+#include "../log.h"
+#include "zipf.h"
+#include "../minmax.h"
+#include "../hash.h"
+
+#define ZIPF_MAX_GEN 10000000
+
+/* Precompute zetan, the zipf normalization constant: sum of (1/i)^theta. */
+static void zipf_update(struct zipf_state *zs)
+{
+	unsigned long to_gen;
+	unsigned int i;
+
+	/*
+	 * It can become very costly to generate long sequences. Just cap it at
+	 * 10M max, that should be doable in 1-2s on even slow machines.
+	 * Precision will take a slight hit, but nothing major.
+	 */
+	to_gen = min(zs->nranges, ZIPF_MAX_GEN);
+
+	for (i = 0; i < to_gen; i++)
+		zs->zetan += pow(1.0 / (double) (i + 1), zs->theta);
+}
+
+/*
+ * Init shared by zipf and pareto: zero the state, record the range count,
+ * seed the RNG and draw a random offset used to scatter generated values.
+ */
+static void shared_rand_init(struct zipf_state *zs, unsigned long nranges,
+			     unsigned int seed)
+{
+	memset(zs, 0, sizeof(*zs));
+	zs->nranges = nranges;
+
+	init_rand_seed(&zs->rand, seed);
+	zs->rand_off = __rand(&zs->rand);
+}
+
+/* Set up a zipf generator over nranges values with the given theta skew. */
+void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta,
+	       unsigned int seed)
+{
+	shared_rand_init(zs, nranges, seed);
+
+	zs->theta = theta;
+	/* zeta2 = (1/1)^theta + (1/2)^theta, used by zipf_next() */
+	zs->zeta2 = pow(1.0, zs->theta) + pow(0.5, zs->theta);
+
+	zipf_update(zs);
+}
+
+/*
+ * Draw the next zipf-distributed value in [0, nranges).  The raw rank is
+ * hashed and shifted by rand_off so the hottest values are scattered across
+ * the range instead of clustering at the start.
+ */
+unsigned long long zipf_next(struct zipf_state *zs)
+{
+	double alpha, eta, rand_uni, rand_z;
+	unsigned long long n = zs->nranges;
+	unsigned long long val;
+
+	/* NOTE: theta == 1.0 would divide by zero here; callers reject it */
+	alpha = 1.0 / (1.0 - zs->theta);
+	eta = (1.0 - pow(2.0 / n, 1.0 - zs->theta)) / (1.0 - zs->zeta2 / zs->zetan);
+
+	rand_uni = (double) __rand(&zs->rand) / (double) FRAND_MAX;
+	rand_z = rand_uni * zs->zetan;
+
+	if (rand_z < 1.0)
+		val = 1;
+	else if (rand_z < (1.0 + pow(0.5, zs->theta)))
+		val = 2;
+	else
+		val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha));
+
+	return (__hash_u64(val - 1) + zs->rand_off) % zs->nranges;
+}
+
+/*
+ * Set up a pareto generator over nranges values with input h, 0 < h < 1
+ * (h == 0 or h == 1 would make log() below undefined; callers validate).
+ */
+void pareto_init(struct zipf_state *zs, unsigned long nranges, double h,
+		 unsigned int seed)
+{
+	shared_rand_init(zs, nranges, seed);
+	zs->pareto_pow = log(h) / log(1.0 - h);
+}
+
+/*
+ * Draw the next pareto-distributed value in [0, nranges), hashed and offset
+ * like zipf_next() so hot values are scattered across the range.
+ */
+unsigned long long pareto_next(struct zipf_state *zs)
+{
+	double rand = (double) __rand(&zs->rand) / (double) FRAND_MAX;
+	unsigned long long n = zs->nranges - 1;
+
+	return (__hash_u64(n * pow(rand, zs->pareto_pow)) + zs->rand_off) % zs->nranges;
+}
--- /dev/null
+#ifndef FIO_ZIPF_H
+#define FIO_ZIPF_H
+
+#include <inttypes.h>
+#include "rand.h"
+
+/*
+ * Shared state for the zipf and pareto generators; pareto reuses this
+ * struct, with pareto_pow being the only field specific to it.
+ */
+struct zipf_state {
+	uint64_t nranges;	/* number of values to generate over */
+	double theta;		/* zipf skew input */
+	double zeta2;		/* sum of the first two zipf terms */
+	double zetan;		/* zipf normalization constant */
+	double pareto_pow;	/* pareto exponent, derived from input h */
+	struct frand_state rand;
+	uint64_t rand_off;	/* random shift applied to each output */
+};
+
+void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, unsigned int seed);
+unsigned long long zipf_next(struct zipf_state *zs);
+
+void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, unsigned int seed);
+unsigned long long pareto_next(struct zipf_state *zs);
+
+#endif
--- /dev/null
+#ifndef FIO_MIN_MAX_H
+#define FIO_MIN_MAX_H
+
+/*
+ * Simple min/max macros, guarded so any prior definition wins.  Arguments
+ * are evaluated more than once — avoid side effects at call sites.
+ */
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+#endif
}
#endif
+#ifdef FIO_HAVE_LIBNUMA
+/*
+ * Option callback for numa_cpu_nodes: parse the node list string into a
+ * bitmask and flag that a NUMA CPU mask was configured.  Returns 0 on
+ * success, 1 on parse failure.
+ */
+static int str_numa_cpunodes_cb(void *data, char *input)
+{
+	struct thread_data *td = data;
+
+	/* numa_parse_nodestring() parses a character string list
+	 * of nodes into a bit mask. The bit mask is allocated by
+	 * numa_allocate_nodemask(), so it should be freed by
+	 * numa_free_nodemask().
+	 */
+	td->o.numa_cpunodesmask = numa_parse_nodestring(input);
+	if (td->o.numa_cpunodesmask == NULL) {
+		log_err("fio: numa_parse_nodestring failed\n");
+		td_verror(td, 1, "str_numa_cpunodes_cb");
+		return 1;
+	}
+
+	td->o.numa_cpumask_set = 1;
+	return 0;
+}
+
+/*
+ * Option callback for numa_mem_policy: parse "<mode>[:<nodelist>]", enforce
+ * the per-mode nodelist rules (prefer: exactly one node; bind/interleave:
+ * list required/defaulted; local/default: no list), then store the mode and
+ * node mask in the thread options.  Returns 0 on success, 1 on failure.
+ */
+static int str_numa_mpol_cb(void *data, char *input)
+{
+	struct thread_data *td = data;
+	const char * const policy_types[] =
+		{ "default", "prefer", "bind", "interleave", "local" };
+	int i;
+
+	char *nodelist = strchr(input, ':');
+	if (nodelist) {
+		/* NUL-terminate mode */
+		*nodelist++ = '\0';
+	}
+
+	/* policy_types[] is indexed to match the MPOL_* constants */
+	for (i = 0; i <= MPOL_LOCAL; i++) {
+		if (!strcmp(input, policy_types[i])) {
+			td->o.numa_mem_mode = i;
+			break;
+		}
+	}
+	if (i > MPOL_LOCAL) {
+		log_err("fio: memory policy should be: default, prefer, bind, interleave, local\n");
+		goto out;
+	}
+
+	switch (td->o.numa_mem_mode) {
+	case MPOL_PREFERRED:
+		/*
+		 * Insist on a nodelist of one node only
+		 */
+		if (nodelist) {
+			char *rest = nodelist;
+			while (isdigit(*rest))
+				rest++;
+			if (*rest) {
+				log_err("fio: one node only for \'prefer\'\n");
+				goto out;
+			}
+		} else {
+			log_err("fio: one node is needed for \'prefer\'\n");
+			goto out;
+		}
+		break;
+	case MPOL_INTERLEAVE:
+		/*
+		 * Default to online nodes with memory if no nodelist
+		 */
+		if (!nodelist)
+			nodelist = strdup("all"); /* NOTE(review): never freed — confirm acceptable */
+		break;
+	case MPOL_LOCAL:
+	case MPOL_DEFAULT:
+		/*
+		 * Don't allow a nodelist
+		 */
+		if (nodelist) {
+			log_err("fio: NO nodelist for \'local\'\n");
+			goto out;
+		}
+		break;
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist) {
+			log_err("fio: a nodelist is needed for \'bind\'\n");
+			goto out;
+		}
+		break;
+	}
+
+
+	/* numa_parse_nodestring() parses a character string list
+	 * of nodes into a bit mask. The bit mask is allocated by
+	 * numa_allocate_nodemask(), so it should be freed by
+	 * numa_free_nodemask().
+	 */
+	switch (td->o.numa_mem_mode) {
+	case MPOL_PREFERRED:
+		td->o.numa_mem_prefer_node = atoi(nodelist);
+		break;
+	case MPOL_INTERLEAVE:
+	case MPOL_BIND:
+		td->o.numa_memnodesmask = numa_parse_nodestring(nodelist);
+		if (td->o.numa_memnodesmask == NULL) {
+			log_err("fio: numa_parse_nodestring failed\n");
+			td_verror(td, 1, "str_numa_memnodes_cb");
+			return 1;
+		}
+		break;
+	case MPOL_LOCAL:
+	case MPOL_DEFAULT:
+	default:
+		break;
+	}
+
+	td->o.numa_memmask_set = 1;
+	return 0;
+
+out:
+	return 1;
+}
+#endif
+
static int str_fst_cb(void *data, const char *str)
{
struct thread_data *td = data;
}
#endif
+/*
+ * Option callback for random_distribution: parse the optional numeric
+ * postfix (e.g. "zipf:1.2") and store it as the zipf theta or pareto h
+ * input.  Defaults: theta 1.1 for zipf, h 0.2 for pareto.  Returns 0 on
+ * success, 1 on parse or range failure.
+ */
+static int str_random_distribution_cb(void *data, const char *str)
+{
+	struct thread_data *td = data;
+	double val;
+	char *nr;
+
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+		val = 1.1;
+	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
+		val = 0.2;
+	else
+		return 0;
+
+	nr = get_opt_postfix(str);
+	if (nr && !str_to_float(nr, &val)) {
+		log_err("fio: random postfix parsing failed\n");
+		free(nr);
+		return 1;
+	}
+
+	free(nr);
+
+	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) {
+		/*
+		 * theta == 1.0 makes the zipf math divide by zero, so it
+		 * must be rejected explicitly.
+		 */
+		if (val == 1.00) {
+			log_err("fio: zipf theta must be different than 1.0\n");
+			return 1;
+		}
+		td->o.zipf_theta.u.f = val;
+	} else {
+		if (val <= 0.00 || val >= 1.00) {
+			log_err("fio: pareto input out of range (0 < input < 1.0)\n");
+			return 1;
+		}
+		td->o.pareto_h.u.f = val;
+	}
+
+	return 0;
+}
+
/*
* Return next file in the string. Files are separated with ':'. If the ':'
* is escaped with a '\', then that ':' is part of the filename and does not
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_RANDOM,
},
+ {
+ .name = "random_distribution",
+ .type = FIO_OPT_STR,
+ .off1 = td_var_offset(random_distribution),
+ .cb = str_random_distribution_cb,
+ .help = "Random offset distribution generator",
+ .def = "random",
+ .posval = {
+ { .ival = "random",
+ .oval = FIO_RAND_DIST_RANDOM,
+ .help = "Completely random",
+ },
+ { .ival = "zipf",
+ .oval = FIO_RAND_DIST_ZIPF,
+ .help = "Zipf distribution",
+ },
+ { .ival = "pareto",
+ .oval = FIO_RAND_DIST_PARETO,
+ .help = "Pareto distribution",
+ },
+ },
+ },
{
.name = "nrfiles",
.lname = "Number of files",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_RATE,
},
+ {
+ .name = "max_latency",
+ .type = FIO_OPT_INT,
+ .off1 = td_var_offset(max_latency),
+ .help = "Maximum tolerated IO latency (usec)",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_RATE,
+ },
{
.name = "invalidate",
.lname = "Cache invalidate",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_CRED,
},
+#endif
+#ifdef FIO_HAVE_LIBNUMA
+ {
+ .name = "numa_cpu_nodes",
+ .type = FIO_OPT_STR,
+ .cb = str_numa_cpunodes_cb,
+ .help = "NUMA CPU nodes bind",
+ },
+ {
+ .name = "numa_mem_policy",
+ .type = FIO_OPT_STR,
+ .cb = str_numa_mpol_cb,
+ .help = "NUMA memory policy setup",
+ },
#endif
{
.name = "end_fsync",
#include "parse.h"
#include "debug.h"
#include "options.h"
+#include "minmax.h"
static struct fio_option *__fio_options;
/*
* Convert string into a floating number. Return 1 for success and 0 otherwise.
*/
-static int str_to_float(const char *str, double *val)
+int str_to_float(const char *str, double *val)
{
return (1 == sscanf(str, "%lf", val));
}
extern void strip_blank_end(char *);
extern int str_to_decimal(const char *, long long *, int, void *);
extern int check_str_bytes(const char *p, long long *val, void *data);
+extern int str_to_float(const char *str, double *val);
/*
* Handlers for the options
#define td_var(start, offset) ((void *) start + (offset))
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
static inline int parse_is_percent(unsigned long long val)
{
return val <= -1ULL && val >= (-1ULL - 100ULL);
pdu->buf_len = le32_to_cpu(pdu->buf_len);
pdu->client_type = le32_to_cpu(pdu->client_type);
+ stat_number = 0;
+
if (parse_jobs_ini(buf, 1, 0, pdu->client_type)) {
fio_net_send_quit(server_fd);
return -1;
}
spdu.jobs = cpu_to_le32(thread_number);
+ spdu.stat_outputs = cpu_to_le32(stat_number);
fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
return 0;
}
dprint(FD_NET, "server: %d: %s\n", i, argv[i]);
}
+ stat_number = 0;
+
if (parse_cmd_line(clp->lines, argv, clp->client_type)) {
fio_net_send_quit(server_fd);
free(argv);
free(argv);
spdu.jobs = cpu_to_le32(thread_number);
+ spdu.stat_outputs = cpu_to_le32(stat_number);
fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
return 0;
}
};
enum {
- FIO_SERVER_VER = 18,
+ FIO_SERVER_VER = 19,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
struct cmd_start_pdu {
uint32_t jobs;
+ uint32_t stat_outputs;
};
struct cmd_end_pdu {
struct fio_mutex *lock; /* protects this pool */
void *map; /* map of blocks */
unsigned int *bitmap; /* blocks free/busy map */
- unsigned int free_blocks; /* free blocks */
- unsigned int nr_blocks; /* total blocks */
- unsigned int next_non_full;
- unsigned int mmap_size;
+ size_t free_blocks; /* free blocks */
+ size_t nr_blocks; /* total blocks */
+ size_t next_non_full;
+ size_t mmap_size;
};
struct block_hdr {
- unsigned int size;
+ size_t size;
#ifdef SMALLOC_REDZONE
unsigned int prered;
#endif
return (ptr >= pool->map) && (ptr < pool->map + pool_size);
}
-static inline unsigned int size_to_blocks(unsigned int size)
+static inline size_t size_to_blocks(size_t size)
{
return (size + SMALLOC_BPB - 1) / SMALLOC_BPB;
}
static int blocks_iter(struct pool *pool, unsigned int pool_idx,
- unsigned int idx, unsigned int nr_blocks,
+ unsigned int idx, size_t nr_blocks,
int (*func)(unsigned int *map, unsigned int mask))
{
}
static int blocks_free(struct pool *pool, unsigned int pool_idx,
- unsigned int idx, unsigned int nr_blocks)
+ unsigned int idx, size_t nr_blocks)
{
return blocks_iter(pool, pool_idx, idx, nr_blocks, mask_cmp);
}
static void set_blocks(struct pool *pool, unsigned int pool_idx,
- unsigned int idx, unsigned int nr_blocks)
+ unsigned int idx, size_t nr_blocks)
{
blocks_iter(pool, pool_idx, idx, nr_blocks, mask_set);
}
static void clear_blocks(struct pool *pool, unsigned int pool_idx,
- unsigned int idx, unsigned int nr_blocks)
+ unsigned int idx, size_t nr_blocks)
{
blocks_iter(pool, pool_idx, idx, nr_blocks, mask_clear);
}
sfree_pool(pool, ptr);
}
-static void *__smalloc_pool(struct pool *pool, unsigned int size)
+static void *__smalloc_pool(struct pool *pool, size_t size)
{
- unsigned int nr_blocks;
+ size_t nr_blocks;
unsigned int i;
unsigned int offset;
unsigned int last_idx;
return ret;
}
-static void *smalloc_pool(struct pool *pool, unsigned int size)
+static void *smalloc_pool(struct pool *pool, size_t size)
{
- unsigned int alloc_size = size + sizeof(struct block_hdr);
+ size_t alloc_size = size + sizeof(struct block_hdr);
void *ptr;
/*
return ptr;
}
-void *smalloc(unsigned int size)
+void *smalloc(size_t size)
{
unsigned int i;
+ if (size != (unsigned int) size)
+ return NULL;
+
global_write_lock();
i = last_pool;
#ifndef FIO_SMALLOC_H
#define FIO_SMALLOC_H
-extern void *smalloc(unsigned int);
+extern void *smalloc(size_t);
extern void sfree(void *);
extern char *smalloc_strdup(const char *);
extern void sinit(void);
{
struct thread_stat *ts = &td->ts;
+#ifdef RUSAGE_THREAD
+ getrusage(RUSAGE_THREAD, &td->ru_end);
+#else
getrusage(RUSAGE_SELF, &td->ru_end);
+#endif
ts->usr_time += mtime_since(&td->ru_start.ru_utime,
&td->ru_end.ru_utime);
--- /dev/null
+/*
+ * Generate/analyze pareto/zipf distributions to better understand
+ * what an access pattern would look like.
+ *
+ * For instance, the following would generate a zipf distribution
+ * with theta 1.2, using 100,000 values and split the reporting into
+ * 20 buckets:
+ *
+ * t/genzipf zipf 1.2 100000 20
+ *
+ * Only the distribution type (zipf or pareto) and spread input need
+ * to be given, if not given defaults are used.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../lib/zipf.h"
+#include "../flist.h"
+#include "../hash.h"
+#include "../rbtree.h"
+
+#define DEF_NR 1000000
+#define DEF_NR_OUTPUT 23
+
+/* One distinct generated offset, with how many times it was produced. */
+struct node {
+	struct flist_head list;		/* hash-chain linkage */
+	unsigned long long val;		/* the offset value */
+	unsigned long hits;		/* times generated */
+};
+
+static struct flist_head *hash;
+static unsigned long hash_bits = 24;
+static unsigned long hash_size = 1 << 24;
+
+enum {
+ TYPE_NONE = 0,
+ TYPE_ZIPF,
+ TYPE_PARETO,
+};
+static const char *dist_types[] = { "None", "Zipf", "Pareto" };
+
+static int dist_type = TYPE_ZIPF;
+static unsigned long gb_size = 500;
+static unsigned long block_size = 4096;
+static unsigned long output_nranges = DEF_NR_OUTPUT;
+static double percentage;
+static double dist_val;
+
+#define DEF_ZIPF_VAL 1.2
+#define DEF_PARETO_VAL 0.3
+
+/* Look up val in the hash; return its node, or NULL if not yet seen. */
+static struct node *hash_lookup(unsigned long long val)
+{
+	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+	struct flist_head *entry;
+	struct node *n;
+
+	flist_for_each(entry, l) {
+		n = flist_entry(entry, struct node, list);
+		if (n->val == val)
+			return n;
+	}
+
+	return NULL;
+}
+
+/* Record first sighting of val: init n with one hit and chain it in. */
+static struct node *hash_insert(struct node *n, unsigned long long val)
+{
+	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+
+	n->val = val;
+	n->hits = 1;
+	flist_add_tail(&n->list, l);
+	return n;
+}
+
+/*
+ * Parse genzipf command line options.  Returns 0 on success, 1 on error.
+ * Distribution input validation only applies to a user-supplied -i value;
+ * otherwise the per-type default is used.
+ */
+static int parse_options(int argc, char *argv[])
+{
+	const char *optstring = "t:g:i:o:b:p:";
+	int c, dist_val_set = 0;
+
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch (c) {
+		case 'p':
+			percentage = atof(optarg);
+			break;
+		case 'b':
+			block_size = strtoul(optarg, NULL, 10);
+			break;
+		case 't':
+			if (!strncmp(optarg, "zipf", 4))
+				dist_type = TYPE_ZIPF;
+			else if (!strncmp(optarg, "pareto", 6))
+				dist_type = TYPE_PARETO;
+			else {
+				printf("wrong dist type: %s\n", optarg);
+				return 1;
+			}
+			break;
+		case 'g':
+			gb_size = strtoul(optarg, NULL, 10);
+			break;
+		case 'i':
+			dist_val = atof(optarg);
+			dist_val_set = 1;
+			break;
+		case 'o':
+			output_nranges = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			printf("bad option %c\n", c);
+			return 1;
+		}
+	}
+
+	if (dist_type == TYPE_PARETO) {
+		/*
+		 * Reject 0.00 as well: pareto_init() computes log(h), which
+		 * is undefined for h == 0.  Only validate an explicit value;
+		 * an unset dist_val (0.0) gets the default below.
+		 */
+		if (dist_val_set && (dist_val >= 1.00 || dist_val <= 0.00)) {
+			printf("pareto input must be > 0.00 and < 1.00\n");
+			return 1;
+		}
+		if (!dist_val_set)
+			dist_val = DEF_PARETO_VAL;
+	} else if (dist_type == TYPE_ZIPF) {
+		/* theta == 1.0 divides by zero in the zipf math */
+		if (dist_val_set && dist_val == 1.0) {
+			printf("zipf input must be different than 1.0\n");
+			return 1;
+		}
+		if (!dist_val_set)
+			dist_val = DEF_ZIPF_VAL;
+	}
+
+	return 0;
+}
+
+/* Accumulated hit percentage and range count for one report bucket. */
+struct output_sum {
+	double output;
+	unsigned int nranges;
+};
+
+/*
+ * qsort comparator: order nodes by descending hit count.  Use explicit
+ * comparisons rather than subtraction — hits are unsigned long, so
+ * n2->hits - n1->hits would wrap for n1 > n2 and truncate when narrowed
+ * to int, yielding an invalid ordering for large counts.
+ */
+static int node_cmp(const void *p1, const void *p2)
+{
+	const struct node *n1 = p1;
+	const struct node *n2 = p2;
+
+	return (n2->hits > n1->hits) - (n2->hits < n1->hits);
+}
+
+/*
+ * Generate nranges samples from the chosen distribution, count hits per
+ * distinct offset, then print a bucketed report of how concentrated the
+ * access pattern is.  Returns 0 on success, 1 on error.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned long offset;
+	unsigned long i, j, k, nr_vals, cur_vals, interval, total_vals, nnodes;
+	unsigned long long nranges;
+	struct output_sum *output_sums;
+	struct node *nodes;
+	double perc, perc_i;
+	struct zipf_state zs;
+
+	if (parse_options(argc, argv))
+		return 1;
+
+	printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
+
+	/* Number of block-sized ranges in the simulated device */
+	nranges = gb_size * 1024 * 1024 * 1024ULL;
+	nranges /= block_size;
+
+	if (dist_type == TYPE_ZIPF)
+		zipf_init(&zs, nranges, dist_val, 1);
+	else
+		pareto_init(&zs, nranges, dist_val, 1);
+
+	/* Size the hash table to the largest power-of-2 <= nranges */
+	hash_bits = 0;
+	hash_size = nranges;
+	while ((hash_size >>= 1) != 0)
+		hash_bits++;
+
+	hash_size = 1 << hash_bits;
+
+	/* Check allocations: both are dereferenced unconditionally below */
+	hash = malloc(hash_size * sizeof(struct flist_head));
+	nodes = malloc(nranges * sizeof(struct node));
+	if (!hash || !nodes) {
+		fprintf(stderr, "genzipf: out of memory\n");
+		free(hash);
+		free(nodes);
+		return 1;
+	}
+	for (i = 0; i < hash_size; i++)
+		INIT_FLIST_HEAD(&hash[i]);
+
+	/* Draw nranges samples, counting hits per distinct offset */
+	for (nr_vals = i = j = 0; i < nranges; i++) {
+		struct node *n;
+
+		if (dist_type == TYPE_ZIPF)
+			offset = zipf_next(&zs);
+		else
+			offset = pareto_next(&zs);
+
+		n = hash_lookup(offset);
+		if (n)
+			n->hits++;
+		else {
+			hash_insert(&nodes[j], offset);
+			j++;
+		}
+
+		nr_vals++;
+	}
+
+	/* Sort distinct offsets by descending hit count */
+	qsort(nodes, j, sizeof(struct node), node_cmp);
+	nnodes = j;
+	nr_vals = nnodes;
+
+	interval = (nr_vals + output_nranges - 1) / output_nranges;
+
+	output_sums = malloc(output_nranges * sizeof(struct output_sum));
+	if (!output_sums) {
+		fprintf(stderr, "genzipf: out of memory\n");
+		free(hash);
+		free(nodes);
+		return 1;
+	}
+	for (i = 0; i < output_nranges; i++) {
+		output_sums[i].output = 0.0;
+		output_sums[i].nranges = 1;
+	}
+
+	total_vals = i = j = cur_vals = 0;
+
+	for (k = 0; k < nnodes; k++) {
+		struct output_sum *os = &output_sums[j];
+		struct node *node = &nodes[k];
+
+		if (i >= interval) {
+			os->output = (double) (cur_vals + 1) / (double) nranges;
+			os->output *= 100.0;
+			j++;
+			cur_vals = node->hits;
+			interval += (nr_vals + output_nranges - 1) / output_nranges;
+		} else {
+			cur_vals += node->hits;
+			os->nranges += node->hits;
+		}
+
+		i++;
+		total_vals += node->hits;
+
+		if (percentage) {
+			unsigned long blocks = percentage * nranges / 100;
+
+			if (total_vals >= blocks) {
+				/*
+				 * Compute in double to avoid the integer
+				 * truncation/overflow of i * block_size
+				 */
+				double cs = (double) i * block_size / (1024.0 * 1024.0);
+				char p = 'M';
+
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'G';
+				}
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'T';
+				}
+
+				printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
+				percentage = 0.0;
+			}
+		}
+	}
+
+	perc_i = 100.0 / (double) output_nranges;
+	perc = 0.0;
+
+	printf("\n Rows Hits No Hits Size\n");
+	printf("--------------------------------------------------------\n");
+	for (i = 0; i < j; i++) {
+		struct output_sum *os = &output_sums[i];
+		double gb = (double) os->nranges * block_size / 1024.0;
+		char p = 'K';
+
+		if (gb > 1024.0) {
+			p = 'M';
+			gb /= 1024.0;
+		}
+		if (gb > 1024.0) {
+			p = 'G';
+			gb /= 1024.0;
+		}
+
+		perc += perc_i;
+		printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n", i ? "|->" : "Top", perc, os->output, os->nranges, gb, p);
+	}
+
+	free(output_sums);
+	free(nodes);	/* was leaked previously */
+	free(hash);
+	return 0;
+}
return fwrite(buffer, len, 1, stderr);
}
+
+/*
+ * Format a message into a local buffer and write it to stdout.  Returns
+ * the fwrite() result (1 on success, 0 on failure).
+ */
+int log_info(const char *format, ...)
+{
+	char buffer[1024];
+	va_list args;
+	size_t len;
+
+	va_start(args, format);
+	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	va_end(args);
+
+	/*
+	 * vsnprintf() returns the length the full output would have had,
+	 * which can exceed the buffer (and a negative error return becomes
+	 * huge as size_t); clamp so fwrite() never reads past the buffer.
+	 */
+	if (len > sizeof(buffer) - 1)
+		len = sizeof(buffer) - 1;
+
+	return fwrite(buffer, len, 1, stdout);
+}
#include "../flist.h"
FILE *f_err;
+struct timeval *fio_tv = NULL;
+unsigned int fio_debug = 0;
#define MAGIC1 0xa9b1c8d2
#define MAGIC2 0xf0a1e9b3
scleanup();
return 0;
}
+
+/*
+ * Intentionally empty debug-print stub; NOTE(review): presumably satisfies
+ * a link-time dependency in this build — confirm against callers.
+ */
+void __dprint(int type, const char *str, ...)
+{
+}
unsigned int bs_unaligned;
unsigned int fsync_on_close;
+ unsigned int random_distribution;
+ fio_fp64_t zipf_theta;
+ fio_fp64_t pareto_h;
+
unsigned int hugepage_size;
unsigned int rw_min_bs;
unsigned int thinktime;
enum fio_memtype mem_type;
unsigned int mem_align;
+ unsigned max_latency;
+
unsigned int stonewall;
unsigned int new_group;
unsigned int numjobs;
unsigned int cpumask_set;
os_cpu_mask_t verify_cpumask;
unsigned int verify_cpumask_set;
+#ifdef FIO_HAVE_LIBNUMA
+ struct bitmask *numa_cpunodesmask;
+ unsigned int numa_cpumask_set;
+ unsigned short numa_mem_mode;
+ unsigned int numa_mem_prefer_node;
+ struct bitmask *numa_memnodesmask;
+ unsigned int numa_memmask_set;
+#endif
unsigned int iolog;
unsigned int rwmixcycle;
unsigned int rwmix[2];
uint32_t bs_unaligned;
uint32_t fsync_on_close;
+ uint32_t random_distribution;
+ fio_fp64_t zipf_theta;
+ fio_fp64_t pareto_h;
+
uint32_t hugepage_size;
uint32_t rw_min_bs;
uint32_t thinktime;
uint32_t mem_type;
uint32_t mem_align;
+ uint32_t max_latency;
+
uint32_t stonewall;
uint32_t new_group;
uint32_t numjobs;
static struct timeval genesis;
static unsigned long ns_granularity;
-unsigned long long utime_since(struct timeval *s, struct timeval *e)
-{
- long sec, usec;
- unsigned long long ret;
-
- sec = e->tv_sec - s->tv_sec;
- usec = e->tv_usec - s->tv_usec;
- if (sec > 0 && usec < 0) {
- sec--;
- usec += 1000000;
- }
-
- /*
- * time warp bug on some kernels?
- */
- if (sec < 0 || (sec == 0 && usec < 0))
- return 0;
-
- ret = sec * 1000000ULL + usec;
-
- return ret;
-}
-
-unsigned long long utime_since_now(struct timeval *s)
-{
- struct timeval t;
-
- fio_gettime(&t, NULL);
- return utime_since(s, &t);
-}
-
-unsigned long mtime_since(struct timeval *s, struct timeval *e)
-{
- long sec, usec, ret;
-
- sec = e->tv_sec - s->tv_sec;
- usec = e->tv_usec - s->tv_usec;
- if (sec > 0 && usec < 0) {
- sec--;
- usec += 1000000;
- }
-
- if (sec < 0 || (sec == 0 && usec < 0))
- return 0;
-
- sec *= 1000UL;
- usec /= 1000UL;
- ret = sec + usec;
-
- return ret;
-}
-
-unsigned long mtime_since_now(struct timeval *s)
-{
- struct timeval t;
- void *p = __builtin_return_address(0);
-
- fio_gettime(&t, p);
- return mtime_since(s, &t);
-}
-
-unsigned long time_since_now(struct timeval *s)
-{
- return mtime_since_now(s) / 1000;
-}
-
/*
* busy looping version for the last few usec
*/