Merge branch 'master' of https://github.com/davidzengxhsh/fio
author Jens Axboe <axboe@fb.com>
Fri, 20 May 2016 17:22:37 +0000 (11:22 -0600)
committer Jens Axboe <axboe@fb.com>
Fri, 20 May 2016 17:22:37 +0000 (11:22 -0600)
74 files changed:
.gitignore
.travis.yml [new file with mode: 0644]
FIO-VERSION-GEN
HOWTO
Makefile
backend.c
cconv.c
client.c
client.h
configure
diskutil.c
diskutil.h
engines/pmemblk.c [new file with mode: 0644]
engines/rbd.c
engines/rdma.c
engines/sync.c
examples/jesd219.fio [new file with mode: 0644]
examples/pmemblk.fio [new file with mode: 0644]
examples/rand-zones.fio [new file with mode: 0644]
file.h
filesetup.c
fio.1
fio.c
fio.h
fio_time.h
flist.h
gfio.c
hash.h
helper_thread.c [new file with mode: 0644]
helper_thread.h [new file with mode: 0644]
init.c
io_ddir.h
io_u.c
ioengine.h
ioengines.c
iolog.c
iolog.h
lib/gauss.c
lib/gauss.h
lib/rand.c
lib/rand.h
lib/zipf.c
lib/zipf.h
libfio.c
memory.c
mutex.c
options.c
os/os-linux.h
os/os-mac.h
os/os-windows.h
os/os.h
os/windows/install.wxs
os/windows/posix.c
oslib/getopt_long.c
oslib/libmtd.h
oslib/libmtd_common.h
server.c
server.h
stat.c
stat.h
t/arch.c [new file with mode: 0644]
t/dedupe.c
t/gen-rand.c [new file with mode: 0644]
t/lfsr-test.c
t/memlock.c [new file with mode: 0644]
t/read-to-pipe-async.c [new file with mode: 0644]
t/stest.c
t/verify-state.c
thread_options.h
time.c
tools/fiologparser.py [new file with mode: 0755]
verify-state.h
verify.c
workqueue.c

diff --git a/.gitignore b/.gitignore
index c9d90fb..bd9b032 100644 (file)
@@ -9,3 +9,4 @@
 /fio
 y.tab.*
 lex.yy.c
+*.un~
diff --git a/.travis.yml b/.travis.yml
new file mode 100644 (file)
index 0000000..9bef750
--- /dev/null
@@ -0,0 +1,7 @@
+language: c
+compiler:
+  - clang
+  - gcc
+before_install:
+  - sudo apt-get -qq update
+  - sudo apt-get install -y libaio-dev libnuma-dev
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index 3253034..fcdbd98 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.3
+DEF_VER=fio-2.9
 
 LF='
 '
diff --git a/HOWTO b/HOWTO
index 7c24c1b..9ed2c5f 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -10,6 +10,9 @@ Table of contents
 7. Terse output
 8. Trace file format
 9. CPU idleness profiling
+10. Verification and triggers
+11. Log File Formats
+
 
 1.0 Overview and history
 ------------------------
@@ -670,10 +673,23 @@ file_service_type=str  Defines how fio decides which file from a job to
                                the next. Multiple files can still be
                                open depending on 'openfiles'.
 
-               The string can have a number appended, indicating how
-               often to switch to a new file. So if option random:4 is
-               given, fio will switch to a new random file after 4 ios
-               have been issued.
+                       zipf    Use a zipfian distribution to decide what file
+                               to access.
+
+                       pareto  Use a pareto distribution to decide what file
+                               to access.
+
+                       gauss   Use a gaussian (normal) distribution to decide
+                               what file to access.
+
+               For random, roundrobin, and sequential, a postfix can be
+               appended to tell fio how many I/Os to issue before switching
+               to a new file. For example, specifying
+               'file_service_type=random:8' would cause fio to issue 8 I/Os
+               before selecting a new file at random. For the non-uniform
+               distributions, a floating point postfix can be given to
+               influence how the distribution is skewed. See
+               'random_distribution' for a description of how that would work.
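[Illustrative aside, not part of the patch: the non-uniform file selection described above could be exercised with a job along these lines; the theta of 1.2, the file count, and the size are arbitrary choices.]

	[zipf-file-select]
	rw=randread
	nrfiles=16
	size=256m
	file_service_type=zipf:1.2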
 
 ioengine=str   Defines how the job issues io to the file. The following
                types are defined:
@@ -795,6 +811,9 @@ ioengine=str        Defines how the job issues io to the file. The following
                                overwriting. The writetrim mode works well
                                for this constraint.
 
+                       pmemblk Read and write through the NVML libpmemblk
+                               interface.
+
                        external Prefix to specify loading an external
                                IO engine object file. Append the engine
                                filename, eg ioengine=external:/tmp/foo.o
@@ -962,6 +981,8 @@ random_distribution=str:float       By default, fio will use a completely uniform
                random          Uniform random distribution
                zipf            Zipf distribution
                pareto          Pareto distribution
+               gauss           Normal (Gaussian) distribution
+               zoned           Zoned random distribution
 
                When using a zipf or pareto distribution, an input value
                is also needed to define the access pattern. For zipf, this
@@ -970,7 +991,28 @@ random_distribution=str:float      By default, fio will use a completely uniform
                what the given input values will yield in terms of hit rates.
                If you wanted to use zipf with a theta of 1.2, you would use
                random_distribution=zipf:1.2 as the option. If a non-uniform
-               model is used, fio will disable use of the random map.
+               model is used, fio will disable use of the random map. For
+               the gauss distribution, a normal deviation is supplied as
+               a value between 0 and 100.
+
+               For a zoned distribution, fio supports specifying percentages
+               of IO access that should fall within what range of the file or
+               device. For example, given a criteria of:
+
+                       60% of accesses should be to the first 10%
+                       30% of accesses should be to the next 20%
+                       8% of accesses should be to the next 30%
+                       2% of accesses should be to the next 40%
+
+               we can define that through zoning of the random accesses. For
+               the above example, the user would do:
+
+                       random_distribution=zoned:60/10:30/20:8/30:2/40
+
+               similarly to how bssplit works for setting ranges and
+               percentages of block sizes. Like bssplit, it's possible to
+               specify separate zones for reads, writes, and trims. If just
+               one set is given, it'll apply to all of them.
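[Illustrative aside, not part of the patch: the zoned example above drops straight into a job file. A minimal sketch, with the job name and size chosen arbitrarily:]

	[zoned-randread]
	rw=randread
	size=2g
	random_distribution=zoned:60/10:30/20:8/30:2/40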
 
 percentage_random=int  For a random workload, set how big a percentage should
                be random. This defaults to 100%, in which case the workload
@@ -1013,7 +1055,9 @@ random_generator=str      Fio supports the following engines for generating
                typically good enough. LFSR only works with single
                block sizes, not with workloads that use multiple block
                sizes. If used with such a workload, fio may read or write
-               some blocks multiple times.
+               some blocks multiple times. The default value is tausworthe,
+               unless the required space exceeds 2^32 blocks. If it does,
+               then tausworthe64 is selected automatically.
 
 nice=int       Run the job with the given nice value. See man nice(2).
 
@@ -1235,10 +1279,14 @@ exitall_on_error        When one job finishes in error, terminate the rest. The
                default is to wait for each job to finish.
 
 bwavgtime=int  Average the calculated bandwidth over the given time. Value
-               is specified in milliseconds.
+               is specified in milliseconds. If the job also does bandwidth
+               logging through 'write_bw_log', then the minimum of this option
+               and 'log_avg_msec' will be used.  Default: 500ms.
 
 iopsavgtime=int        Average the calculated IOPS over the given time. Value
-               is specified in milliseconds.
+               is specified in milliseconds. If the job also does IOPS logging
+               through 'write_iops_log', then the minimum of this option and
+               'log_avg_msec' will be used.  Default: 500ms.
 
 create_serialize=bool  If true, serialize the file creating for the jobs.
                        This may be handy to avoid interleaving of data
@@ -1538,7 +1586,7 @@ write_bw_log=str If given, write a bandwidth log of the jobs in this job
                filename. For this option, the suffix is _bw.x.log, where
                x is the index of the job (1..N, where N is the number of
                jobs). If 'per_job_logs' is false, then the filename will not
-               include the job index.
+               include the job index. See 'Log File Formats'.
 
 write_lat_log=str Same as write_bw_log, except that this option stores io
                submission, completion, and total latencies instead. If no
@@ -1552,8 +1600,8 @@ write_lat_log=str Same as write_bw_log, except that this option stores io
                and foo_lat.x.log, where x is the index of the job (1..N,
                and foo_lat.x.log, where x is the index of the job (1..N,
                where N is the number of jobs). This helps fio_generate_plots
                find the logs automatically. If 'per_job_logs' is false, then
-               the filename will not include the job index.
-
+               the filename will not include the job index. See 'Log File
+               Formats'.
 
 write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is
                given with this option, the default filename of
@@ -1561,14 +1609,19 @@ write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is
                (1..N, where N is the number of jobs). Even if the filename
                is given, fio will still append the type of log. If
                'per_job_logs' is false, then the filename will not include
-               the job index.
+               the job index. See 'Log File Formats'.
 
 log_avg_msec=int By default, fio will log an entry in the iops, latency,
                or bw log for every IO that completes. When writing to the
                disk log, that can quickly grow to a very large size. Setting
                this option makes fio average each log entry over the
                specified period of time, reducing the resolution of the log.
-               Defaults to 0.
+               See log_max_value as well. Defaults to 0, logging all entries.
+
+log_max_value=bool     If log_avg_msec is set, fio logs the average over that
+               window. If you instead want to log the maximum value, set this
+               option to 1. Defaults to 0, meaning that averaged values are
+               logged.
 
 log_offset=int If this is set, the iolog options will include the byte
                offset for the IO entry as well as the other data values.
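[Illustrative aside, not part of the patch: a hedged sketch of the averaged-logging options documented above; the log name prefix and the 1000ms window are arbitrary.]

	[bw-logging]
	rw=write
	size=1g
	write_bw_log=example
	log_avg_msec=1000
	log_max_value=1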
@@ -1777,6 +1830,9 @@ that defines them is selected.
                enabled when polling for a minimum of 0 events (eg when
                iodepth_batch_complete=0).
 
+[psyncv2] hipri                Set RWF_HIPRI on IO, indicating to the kernel that
+                       it's of higher priority than normal.
+
 [cpu] cpuload=int Attempt to use the specified percentage of CPU cycles.
 
 [cpu] cpuchunks=int Split the load into cycles of the given time. In
@@ -1852,6 +1908,15 @@ be the starting port number since fio will use a range of ports.
                1         : allocate space immediately inside defragment event,
                            and free right after event
 
+[rbd] clustername=str  Specifies the name of the Ceph cluster.
+[rbd] rbdname=str      Specifies the name of the RBD.
+[rbd] pool=str         Specifies the name of the Ceph pool containing the RBD.
+[rbd] clientname=str   Specifies the username (without the 'client.' prefix)
+                       used to access the Ceph cluster. If the clustername is
+                       specified, the clientname shall be the full type.id
+                       string. If no type. prefix is given, fio will add
+                       'client.' by default.
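[Illustrative aside, not part of the patch: an rbd job using the cluster/client options above; the cluster, client, pool, and image names are placeholders.]

	[rbd-test]
	ioengine=rbd
	clustername=ceph
	clientname=admin
	pool=rbd
	rbdname=fio_test
	rw=randwrite
	bs=4k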
+
 [mtd] skip_bad=bool    Skip operations against known bad blocks.
 
 [libhdfs] hdfsdirectory        libhdfs will create chunk in this HDFS directory
@@ -1952,7 +2017,9 @@ runt=             The runtime of that thread
 cpu=           CPU usage. User and system time, along with the number
                of context switches this thread went through, usage of
                system and user time, and finally the number of major
-               and minor page faults.
+               and minor page faults. The CPU utilization numbers are
+               averages for the jobs in that reporting group, while the
+               context and fault counters are summed.
 IO depths=     The distribution of io depths over the job life time. The
                numbers are divided into powers of 2, so for example the
                16= entries includes depths up to that value but higher
@@ -2213,10 +2280,43 @@ localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reb
 For this case, fio would wait for the server to send us the write state,
 then execute 'ipmi-reboot server' when that happened.
 
-10.1 Loading verify state
+10.2 Loading verify state
 -------------------------
 To load stored write state, the read verification job file must contain
 the verify_state_load option. If that is set, fio will load the previously
 stored state. For a local fio run this is done by loading the files directly,
 and on a client/server run, the server backend will ask the client to send
 the files over and load them from there.
+
+
+11.0 Log File Formats
+---------------------
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+time (msec), value, data direction, offset
+
+Time for the log entry is always in milliseconds. The value logged depends
+on the type of log; it will be one of the following:
+
+       Latency log             Value is latency in usecs
+       Bandwidth log           Value is in KB/sec
+       IOPS log                Value is IOPS
+
+Data direction is one of the following:
+
+       0                       IO is a READ
+       1                       IO is a WRITE
+       2                       IO is a TRIM
+
+The offset is the offset, in bytes, from the start of the file, for that
+particular IO. The logging of the offset can be toggled with 'log_offset'.
+
+If windowed logging is enabled through 'log_avg_msec', then fio doesn't log
+individual IOs. Instead, it logs the average values over the specified
+period of time. Since 'data direction' and 'offset' are per-IO values,
+they aren't applicable if windowed logging is enabled. If windowed logging
+is enabled and 'log_max_value' is set, then fio logs maximum values in
+that window instead of averages.
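[Illustrative aside, not part of the patch: a hypothetical bandwidth log excerpt in the format described above, with 'log_offset' enabled. All numbers are invented; the columns are time in msec, bandwidth in KB/sec, data direction (0=read, 1=write), and byte offset.]

	1000, 49152, 0, 65536
	2000, 51200, 0, 131072
	3000, 24576, 1, 196608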
+
index ee90899..108e6ee 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -20,13 +20,13 @@ all:
 include config-host.mak
 endif
 
-DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG
+DEBUGFLAGS = -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG
 CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS)
 OPTFLAGS= -g -ffast-math
 CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
 LIBS   += -lm $(EXTLIBS)
 PROGS  = fio
-SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio)
+SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py)
 
 ifndef CONFIG_FIO_NO_OPT
   CFLAGS += -O3
@@ -45,7 +45,7 @@ SOURCE :=     $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
                server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
                gettime-thread.c helpers.c json.c idletime.c td_error.c \
                profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
-               workqueue.c rate-submit.c optgroup.c
+               workqueue.c rate-submit.c optgroup.c helper_thread.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
@@ -124,6 +124,9 @@ ifdef CONFIG_MTD
   SOURCE += oslib/libmtd.c
   SOURCE += oslib/libmtd_legacy.c
 endif
+ifdef CONFIG_PMEMBLK
+  SOURCE += engines/pmemblk.c
+endif
 
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
@@ -191,7 +194,7 @@ endif
 -include $(OBJS:.o=.d)
 
 T_SMALLOC_OBJS = t/stest.o
-T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o t/debug.o
+T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o t/debug.o t/arch.o
 T_SMALLOC_PROGS = t/stest
 
 T_IEEE_OBJS = t/ieee754.o
@@ -208,9 +211,14 @@ T_AXMAP_OBJS += lib/lfsr.o lib/axmap.o
 T_AXMAP_PROGS = t/axmap
 
 T_LFSR_TEST_OBJS = t/lfsr-test.o
-T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o t/log.o t/debug.o
+T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o t/log.o t/debug.o t/arch.o
 T_LFSR_TEST_PROGS = t/lfsr-test
 
+T_GEN_RAND_OBJS = t/gen-rand.o
+T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \
+                       oslib/strcasestr.o
+T_GEN_RAND_PROGS = t/gen-rand
+
 ifeq ($(CONFIG_TARGET_OS), Linux)
 T_BTRACE_FIO_OBJS = t/btrace2fio.o
 T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o oslib/linux-dev-lookup.o
@@ -219,27 +227,35 @@ endif
 
 T_DEDUPE_OBJS = t/dedupe.o
 T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \
-               lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o \
+               lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o t/arch.o \
                crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/fnv.o
 T_DEDUPE_PROGS = t/fio-dedupe
 
 T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o t/debug.o
 T_VS_PROGS = t/fio-verify-state
 
+T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
+T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
+
+T_MEMLOCK_OBJS = t/memlock.o
+T_MEMLOCK_PROGS = t/memlock
+
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
 T_OBJS += $(T_ZIPF_OBJS)
 T_OBJS += $(T_AXMAP_OBJS)
 T_OBJS += $(T_LFSR_TEST_OBJS)
+T_OBJS += $(T_GEN_RAND_OBJS)
 T_OBJS += $(T_BTRACE_FIO_OBJS)
 T_OBJS += $(T_DEDUPE_OBJS)
 T_OBJS += $(T_VS_OBJS)
+T_OBJS += $(T_PIPE_ASYNC_OBJS)
+T_OBJS += $(T_MEMLOCK_OBJS)
 
 ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
     T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o
     T_SMALLOC_OBJS += os/windows/posix.o lib/hweight.o
     T_LFSR_TEST_OBJS += os/windows/posix.o lib/hweight.o
-    T_ZIPF_OBJS += oslib/strcasestr.o
 endif
 
 T_TEST_PROGS = $(T_SMALLOC_PROGS)
@@ -247,6 +263,7 @@ T_TEST_PROGS += $(T_IEEE_PROGS)
 T_PROGS += $(T_ZIPF_PROGS)
 T_TEST_PROGS += $(T_AXMAP_PROGS)
 T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
+T_TEST_PROGS += $(T_GEN_RAND_PROGS)
 T_PROGS += $(T_BTRACE_FIO_PROGS)
 T_PROGS += $(T_DEDUPE_PROGS)
 T_PROGS += $(T_VS_PROGS)
@@ -366,6 +383,12 @@ cairo_text_helpers.o: cairo_text_helpers.c cairo_text_helpers.h
 printing.o: printing.c printing.h
        $(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
+t/read-to-pipe-async: $(T_PIPE_ASYNC_OBJS)
+       $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS)
+
+t/memlock: $(T_MEMLOCK_OBJS)
+       $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS)
+
 t/stest: $(T_SMALLOC_OBJS)
        $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS)
 
@@ -387,6 +410,9 @@ t/axmap: $(T_AXMAP_OBJS)
 t/lfsr-test: $(T_LFSR_TEST_OBJS)
        $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS)
 
+t/gen-rand: $(T_GEN_RAND_OBJS)
+       $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_GEN_RAND_OBJS) $(LIBS)
+
 ifeq ($(CONFIG_TARGET_OS), Linux)
 t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
        $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
@@ -415,6 +441,8 @@ doc: tools/plot/fio2gnuplot.1
        @man -t tools/fio_generate_plots.1 | ps2pdf - fio_generate_plots.pdf
        @man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf
 
+test:
+
 install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE
        $(INSTALL) -m 755 -d $(DESTDIR)$(bindir)
        $(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir)
index bd94078..6d50360 100644 (file)
--- a/backend.c
+++ b/backend.c
 #include "workqueue.h"
 #include "lib/mountcheck.h"
 #include "rate-submit.h"
-
-static pthread_t helper_thread;
-static pthread_mutex_t helper_lock;
-pthread_cond_t helper_cond;
-int helper_do_stat = 0;
+#include "helper_thread.h"
 
 static struct fio_mutex *startup_mutex;
 static struct flist_head *cgroup_list;
@@ -79,7 +75,6 @@ unsigned int stat_number = 0;
 int shm_id = 0;
 int temp_stall_ts;
 unsigned long done_secs = 0;
-volatile int helper_exit = 0;
 
 #define PAGE_ALIGN(buf)        \
        (char *) (((uintptr_t) (buf) + page_mask) & ~page_mask)
@@ -309,6 +304,8 @@ requeue:
                put_io_u(td, io_u);
                return true;
        } else if (ret == FIO_Q_QUEUED) {
+               if (td_io_commit(td))
+                       return true;
                if (io_u_queued_complete(td, 1) < 0)
                        return true;
        } else if (ret == FIO_Q_COMPLETED) {
@@ -444,6 +441,12 @@ static int wait_for_completions(struct thread_data *td, struct timeval *time)
        int min_evts = 0;
        int ret;
 
+       if (td->flags & TD_F_REGROW_LOGS) {
+               ret = io_u_quiesce(td);
+               regrow_logs(td);
+               return ret;
+       }
+
        /*
         * if the queue is full, we MUST reap at least 1 event
         */
@@ -520,6 +523,14 @@ sync_done:
                        if (*ret < 0)
                                break;
                }
+
+               /*
+                * when doing I/O (not when verifying),
+                * check for any errors that are to be ignored
+                */
+               if (!from_verify)
+                       break;
+
                return 0;
        case FIO_Q_QUEUED:
                /*
@@ -871,7 +882,14 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
                if (flow_threshold_exceeded(td))
                        continue;
 
-               if (!td->o.time_based && bytes_issued >= total_bytes)
+               /*
+                * Break if we exceeded the bytes. The exception is time
+                * based runs, but we still need to break out of the loop
+                * for those to run verification, if enabled.
+                */
+               if (bytes_issued >= total_bytes &&
+                   (!td->o.time_based ||
+                    (td->o.time_based && td->o.verify != VERIFY_NONE)))
                        break;
 
                io_u = get_io_u(td);
@@ -1053,6 +1071,41 @@ reap:
                bytes_done[i] = td->bytes_done[i] - bytes_done[i];
 }
 
+static void free_file_completion_logging(struct thread_data *td)
+{
+       struct fio_file *f;
+       unsigned int i;
+
+       for_each_file(td, f, i) {
+               if (!f->last_write_comp)
+                       break;
+               sfree(f->last_write_comp);
+       }
+}
+
+static int init_file_completion_logging(struct thread_data *td,
+                                       unsigned int depth)
+{
+       struct fio_file *f;
+       unsigned int i;
+
+       if (td->o.verify == VERIFY_NONE || !td->o.verify_state_save)
+               return 0;
+
+       for_each_file(td, f, i) {
+               f->last_write_comp = scalloc(depth, sizeof(uint64_t));
+               if (!f->last_write_comp)
+                       goto cleanup;
+       }
+
+       return 0;
+
+cleanup:
+       free_file_completion_logging(td);
+       log_err("fio: failed to alloc write comp data\n");
+       return 1;
+}
+
 static void cleanup_io_u(struct thread_data *td)
 {
        struct io_u *io_u;
@@ -1071,8 +1124,7 @@ static void cleanup_io_u(struct thread_data *td)
        io_u_qexit(&td->io_u_freelist);
        io_u_qexit(&td->io_u_all);
 
-       if (td->last_write_comp)
-               sfree(td->last_write_comp);
+       free_file_completion_logging(td);
 }
 
 static int init_io_u(struct thread_data *td)
@@ -1189,13 +1241,8 @@ static int init_io_u(struct thread_data *td)
                p += max_bs;
        }
 
-       if (td->o.verify != VERIFY_NONE) {
-               td->last_write_comp = scalloc(max_units, sizeof(uint64_t));
-               if (!td->last_write_comp) {
-                       log_err("fio: failed to alloc write comp data\n");
-                       return 1;
-               }
-       }
+       if (init_file_completion_logging(td, max_units))
+               return 1;
 
        return 0;
 }
@@ -1293,7 +1340,7 @@ static bool keep_running(struct thread_data *td)
                if (diff < td_max_bs(td))
                        return false;
 
-               if (fio_files_done(td))
+               if (fio_files_done(td) && !td->o.io_limit)
                        return false;
 
                return true;
@@ -1430,6 +1477,14 @@ static void *thread_main(void *data)
                goto err;
        }
 
+       /*
+        * Do this early, we don't want the compress threads to be limited
+        * to the same CPUs as the IO workers. So do this before we set
+        * any potential CPU affinity
+        */
+       if (iolog_compress_init(td, sk_out))
+               goto err;
+
        /*
         * If we have a gettimeofday() thread, make sure we exclude that
         * thread from this job
@@ -1564,9 +1619,6 @@ static void *thread_main(void *data)
                        goto err;
        }
 
-       if (iolog_compress_init(td, sk_out))
-               goto err;
-
        fio_verify_init(td);
 
        if (rate_submit_init(td, sk_out))
@@ -1664,6 +1716,8 @@ static void *thread_main(void *data)
                        break;
        }
 
+       td_set_runstate(td, TD_FINISHING);
+
        update_rusage_stat(td);
        td->ts.total_run_time = mtime_since_now(&td->epoch);
        td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
@@ -1676,7 +1730,7 @@ static void *thread_main(void *data)
 
        fio_unpin_memory(td);
 
-       fio_writeout_logs(td);
+       td_writeout_logs(td, true);
 
        iolog_compress_exit(td);
        rate_submit_exit(td);
@@ -1701,6 +1755,15 @@ err:
        cgroup_shutdown(td, &cgroup_mnt);
        verify_free_state(td);
 
+       if (td->zone_state_index) {
+               int i;
+
+               for (i = 0; i < DDIR_RWDIR_CNT; i++)
+                       free(td->zone_state_index[i]);
+               free(td->zone_state_index);
+               td->zone_state_index = NULL;
+       }
+
        if (fio_option_is_set(o, cpumask)) {
                ret = fio_cpuset_exit(&o->cpumask);
                if (ret)
@@ -1763,8 +1826,9 @@ static int fork_main(struct sk_out *sk_out, int shmid, int offset)
 
 static void dump_td_info(struct thread_data *td)
 {
-       log_err("fio: job '%s' hasn't exited in %lu seconds, it appears to "
-               "be stuck. Doing forceful exit of this job.\n", td->o.name,
+       log_err("fio: job '%s' (state=%d) hasn't exited in %lu seconds, it "
+               "appears to be stuck. Doing forceful exit of this job.\n",
+                       td->o.name, td->runstate,
                        (unsigned long) time_since_now(&td->terminate_time));
 }
 
@@ -1850,6 +1914,7 @@ static void reap_threads(unsigned int *nr_running, unsigned int *t_rate,
                 * move on.
                 */
                if (td->terminate &&
+                   td->runstate < TD_FSYNCING &&
                    time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) {
                        dump_td_info(td);
                        td_set_runstate(td, TD_REAPED);
@@ -1938,12 +2003,11 @@ static int fio_verify_load_state(struct thread_data *td)
 
        if (is_backend) {
                void *data;
-               int ver;
 
                ret = fio_server_get_verify_state(td->o.name,
-                                       td->thread_number - 1, &data, &ver);
+                                       td->thread_number - 1, &data);
                if (!ret)
-                       verify_convert_assign_state(td, data, ver);
+                       verify_assign_state(td, data);
        } else
                ret = verify_load_state(td, "local");
 
@@ -2265,82 +2329,10 @@ reap:
        update_io_ticks();
 }
 
-static void wait_for_helper_thread_exit(void)
-{
-       void *ret;
-
-       helper_exit = 1;
-       pthread_cond_signal(&helper_cond);
-       pthread_join(helper_thread, &ret);
-}
-
 static void free_disk_util(void)
 {
        disk_util_prune_entries();
-
-       pthread_cond_destroy(&helper_cond);
-}
-
-static void *helper_thread_main(void *data)
-{
-       struct sk_out *sk_out = data;
-       int ret = 0;
-
-       sk_out_assign(sk_out);
-
-       fio_mutex_up(startup_mutex);
-
-       while (!ret) {
-               uint64_t sec = DISK_UTIL_MSEC / 1000;
-               uint64_t nsec = (DISK_UTIL_MSEC % 1000) * 1000000;
-               struct timespec ts;
-               struct timeval tv;
-
-               gettimeofday(&tv, NULL);
-               ts.tv_sec = tv.tv_sec + sec;
-               ts.tv_nsec = (tv.tv_usec * 1000) + nsec;
-
-               if (ts.tv_nsec >= 1000000000ULL) {
-                       ts.tv_nsec -= 1000000000ULL;
-                       ts.tv_sec++;
-               }
-
-               pthread_cond_timedwait(&helper_cond, &helper_lock, &ts);
-
-               ret = update_io_ticks();
-
-               if (helper_do_stat) {
-                       helper_do_stat = 0;
-                       __show_running_run_stats();
-               }
-
-               if (!is_backend)
-                       print_thread_status();
-       }
-
-       sk_out_drop();
-       return NULL;
-}
-
-static int create_helper_thread(struct sk_out *sk_out)
-{
-       int ret;
-
-       setup_disk_util();
-
-       pthread_cond_init(&helper_cond, NULL);
-       pthread_mutex_init(&helper_lock, NULL);
-
-       ret = pthread_create(&helper_thread, NULL, helper_thread_main, sk_out);
-       if (ret) {
-               log_err("Can't create helper thread: %s\n", strerror(ret));
-               return 1;
-       }
-
-       dprint(FD_MUTEX, "wait on startup_mutex\n");
-       fio_mutex_down(startup_mutex);
-       dprint(FD_MUTEX, "done waiting on startup_mutex\n");
-       return 0;
+       helper_thread_destroy();
 }
 
 int fio_backend(struct sk_out *sk_out)
@@ -2373,14 +2365,14 @@ int fio_backend(struct sk_out *sk_out)
 
        set_genesis_time();
        stat_init();
-       create_helper_thread(sk_out);
+       helper_thread_create(startup_mutex, sk_out);
 
        cgroup_list = smalloc(sizeof(*cgroup_list));
        INIT_FLIST_HEAD(cgroup_list);
 
        run_threads(sk_out);
 
-       wait_for_helper_thread_exit();
+       helper_thread_exit();
 
        if (!fio_abort) {
                __show_run_stats();
diff --git a/cconv.c b/cconv.c
index 6d8d0b3..0c3a36c 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -23,6 +23,8 @@ static void __string_to_net(uint8_t *dst, const char *src, size_t dst_size)
 
 static void free_thread_options_to_cpu(struct thread_options *o)
 {
+       int i;
+
        free(o->description);
        free(o->name);
        free(o->wait_for);
@@ -43,6 +45,11 @@ static void free_thread_options_to_cpu(struct thread_options *o)
        free(o->ioscheduler);
        free(o->profile);
        free(o->cgroup);
+
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               free(o->bssplit[i]);
+               free(o->zone_split[i]);
+       }
 }
 
 void convert_thread_options_to_cpu(struct thread_options *o,
@@ -111,6 +118,16 @@ void convert_thread_options_to_cpu(struct thread_options *o,
                        }
                }
 
+               o->zone_split_nr[i] = le32_to_cpu(top->zone_split_nr[i]);
+
+               if (o->zone_split_nr[i]) {
+                       o->zone_split[i] = malloc(o->zone_split_nr[i] * sizeof(struct zone_split));
+                       for (j = 0; j < o->zone_split_nr[i]; j++) {
+                               o->zone_split[i][j].access_perc = top->zone_split[i][j].access_perc;
+                               o->zone_split[i][j].size_perc = top->zone_split[i][j].size_perc;
+                       }
+               }
+
                o->rwmix[i] = le32_to_cpu(top->rwmix[i]);
                o->rate[i] = le32_to_cpu(top->rate[i]);
                o->ratemin[i] = le32_to_cpu(top->ratemin[i]);
@@ -160,6 +177,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
        o->rand_seed = le64_to_cpu(top->rand_seed);
        o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
+       o->log_max = le32_to_cpu(top->log_max);
        o->log_offset = le32_to_cpu(top->log_offset);
        o->log_gz = le32_to_cpu(top->log_gz);
        o->log_gz_store = le32_to_cpu(top->log_gz_store);
@@ -348,6 +366,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
        top->rand_seed = __cpu_to_le64(o->rand_seed);
        top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
+       top->log_max = cpu_to_le32(o->log_max);
        top->log_offset = cpu_to_le32(o->log_offset);
        top->log_gz = cpu_to_le32(o->log_gz);
        top->log_gz_store = cpu_to_le32(o->log_gz_store);
@@ -451,6 +470,21 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
                        }
                }
 
+               top->zone_split_nr[i] = cpu_to_le32(o->zone_split_nr[i]);
+
+               if (o->zone_split_nr[i]) {
+                       unsigned int zone_split_nr = o->zone_split_nr[i];
+
+                       if (zone_split_nr > ZONESPLIT_MAX) {
+                               log_err("fio: ZONESPLIT_MAX is too small\n");
+                               zone_split_nr = ZONESPLIT_MAX;
+                       }
+                       for (j = 0; j < zone_split_nr; j++) {
+                               top->zone_split[i][j].access_perc = o->zone_split[i][j].access_perc;
+                               top->zone_split[i][j].size_perc = o->zone_split[i][j].size_perc;
+                       }
+               }
+
                top->rwmix[i] = cpu_to_le32(o->rwmix[i]);
                top->rate[i] = cpu_to_le32(o->rate[i]);
                top->ratemin[i] = cpu_to_le32(o->ratemin[i]);
index 6c2a1ef..d502a4b 100644 (file)
--- a/client.c
+++ b/client.c
@@ -33,6 +33,8 @@ static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd);
 static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd);
 static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd);
 
+static void convert_text(struct fio_net_cmd *cmd);
+
 struct client_ops fio_client_ops = {
        .text           = handle_text,
        .disk_util      = handle_du,
@@ -215,12 +217,32 @@ static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
        return 1;
 }
 
+static void fio_drain_client_text(struct fio_client *client)
+{
+       do {
+               struct fio_net_cmd *cmd;
+
+               cmd = fio_net_recv_cmd(client->fd, false);
+               if (!cmd)
+                       break;
+
+               if (cmd->opcode == FIO_NET_CMD_TEXT) {
+                       convert_text(cmd);
+                       client->ops->text(client, cmd);
+               }
+
+               free(cmd);
+       } while (1);
+}
+
 static void remove_client(struct fio_client *client)
 {
        assert(client->refs);
 
        dprint(FD_NET, "client: removed <%s>\n", client->hostname);
 
+       fio_drain_client_text(client);
+
        if (!flist_empty(&client->list))
                flist_del_init(&client->list);
 
@@ -325,7 +347,7 @@ err:
        return NULL;
 }
 
-int fio_client_add_ini_file(void *cookie, const char *ini_file, int remote)
+int fio_client_add_ini_file(void *cookie, const char *ini_file, bool remote)
 {
        struct fio_client *client = cookie;
        struct client_file *cf;
@@ -767,7 +789,7 @@ static int __fio_client_send_local_ini(struct fio_client *client,
 }
 
 int fio_client_send_ini(struct fio_client *client, const char *filename,
-                       int remote)
+                       bool remote)
 {
        int ret;
 
@@ -794,6 +816,8 @@ int fio_clients_send_ini(const char *filename)
        struct flist_head *entry, *tmp;
 
        flist_for_each_safe(entry, tmp, &client_list) {
+               bool failed = false;
+
                client = flist_entry(entry, struct fio_client, list);
 
                if (client->nr_files) {
@@ -805,12 +829,13 @@ int fio_clients_send_ini(const char *filename)
                                cf = &client->files[i];
 
                                if (fio_client_send_cf(client, cf)) {
+                                       failed = true;
                                        remove_client(client);
                                        break;
                                }
                        }
                }
-               if (client->sent_job)
+               if (client->sent_job || failed)
                        continue;
                if (!filename || fio_client_send_ini(client, filename, 0))
                        remove_client(client);
@@ -1438,7 +1463,6 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
                return NULL;
 #endif
                ret = convert_iolog_gz(cmd, pdu);
-               printf("compressed iolog, %p\n", ret);
                if (!ret) {
                        log_err("fio: failed decompressing log\n");
                        return NULL;
@@ -1526,7 +1550,7 @@ int fio_handle_client(struct fio_client *client)
 
        dprint(FD_NET, "client: handle %s\n", client->hostname);
 
-       cmd = fio_net_recv_cmd(client->fd);
+       cmd = fio_net_recv_cmd(client->fd, true);
        if (!cmd)
                return 0;
 
index 7fe09d1..ddacf78 100644 (file)
--- a/client.h
+++ b/client.h
@@ -22,7 +22,7 @@ enum {
 
 struct client_file {
        char *file;
-       int remote;
+       bool remote;
 };
 
 struct fio_client {
@@ -124,12 +124,12 @@ extern int fio_clients_connect(void);
 extern int fio_start_client(struct fio_client *);
 extern int fio_start_all_clients(void);
 extern int fio_clients_send_ini(const char *);
-extern int fio_client_send_ini(struct fio_client *, const char *, int);
+extern int fio_client_send_ini(struct fio_client *, const char *, bool);
 extern int fio_handle_clients(struct client_ops *);
 extern int fio_client_add(struct client_ops *, const char *, void **);
 extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int);
 extern void fio_client_add_cmd_option(void *, const char *);
-extern int fio_client_add_ini_file(void *, const char *, int);
+extern int fio_client_add_ini_file(void *, const char *, bool);
 extern int fio_client_terminate(struct fio_client *);
 extern void fio_clients_terminate(void);
 extern struct fio_client *fio_get_client(struct fio_client *);
index cbd4d30..5f6bca3 100755 (executable)
--- a/configure
+++ b/configure
@@ -135,6 +135,7 @@ show_help="no"
 exit_val=0
 gfio_check="no"
 libhdfs="no"
+pmemblk="no"
 disable_lex=""
 prefix=/usr/local
 
@@ -169,6 +170,8 @@ for opt do
   ;;
   --enable-libhdfs) libhdfs="yes"
   ;;
+  --enable-pmemblk) pmemblk="yes"
+  ;;
   --disable-lex) disable_lex="yes"
   ;;
   --enable-lex) disable_lex="no"
@@ -199,6 +202,7 @@ if test "$show_help" = "yes" ; then
   echo "--disable-numa         Disable libnuma even if found"
   echo "--disable-gfapi        Disable gfapi"
   echo "--enable-libhdfs       Enable hdfs support"
+  echo "--enable-pmemblk       Enable NVML libpmemblk support"
   echo "--disable-lex          Disable use of lex/yacc for math"
   echo "--enable-lex           Enable use of lex/yacc for math"
   echo "--disable-shm          Disable SHM support"
@@ -1240,6 +1244,22 @@ if compile_prog "" "" "pwritev"; then
 fi
 echo "pwritev/preadv                $pwritev"
 
+##########################################
+# Check whether we have pwritev2/preadv2
+pwritev2="no"
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/uio.h>
+int main(int argc, char **argv)
+{
+  return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0);
+}
+EOF
+if compile_prog "" "" "pwritev2"; then
+  pwritev2="yes"
+fi
+echo "pwritev2/preadv2              $pwritev2"
+
 ##########################################
 # Check whether we have the required functions for ipv6
 ipv6="no"
@@ -1463,6 +1483,10 @@ if compile_prog "" "" "mtd"; then
 fi
 echo "MTD                           $mtd"
 
+##########################################
+# Report whether pmemblk engine is enabled
+echo "NVML libpmemblk engine        $pmemblk"
+
 # Check if we have lex/yacc available
 yacc="no"
 yacc_is_bison="no"
@@ -1742,6 +1766,9 @@ fi
 if test "$pwritev" = "yes" ; then
   output_sym "CONFIG_PWRITEV"
 fi
+if test "$pwritev2" = "yes" ; then
+  output_sym "CONFIG_PWRITEV2"
+fi
 if test "$ipv6" = "yes" ; then
   output_sym "CONFIG_IPV6"
 fi
@@ -1776,6 +1803,9 @@ if test "$libhdfs" = "yes" ; then
 if test "$mtd" = "yes" ; then
   output_sym "CONFIG_MTD"
 fi
+if test "$pmemblk" = "yes" ; then
+  output_sym "CONFIG_PMEMBLK"
+fi
 if test "$arith" = "yes" ; then
   output_sym "CONFIG_ARITHMETIC"
   if test "$yacc_is_bison" = "yes" ; then
diff --git a/diskutil.c b/diskutil.c
index c3181b5..8031d5d 100644 (file)
@@ -11,6 +11,7 @@
 #include "fio.h"
 #include "smalloc.h"
 #include "diskutil.h"
+#include "helper_thread.h"
 
 static int last_majdev, last_mindev;
 static struct disk_util *last_du;
@@ -121,7 +122,7 @@ int update_io_ticks(void)
 
        fio_mutex_down(disk_util_mutex);
 
-       if (!helper_exit) {
+       if (!helper_should_exit()) {
                flist_for_each(entry, &disk_list) {
                        du = flist_entry(entry, struct disk_util, list);
                        update_io_tick_disk(du);
@@ -695,6 +696,7 @@ void show_disk_util(int terse, struct json_object *parent,
 {
        struct flist_head *entry;
        struct disk_util *du;
+       bool do_json;
 
        if (!disk_util_mutex)
                return;
@@ -706,15 +708,17 @@ void show_disk_util(int terse, struct json_object *parent,
                return;
        }
 
-       if (output_format & FIO_OUTPUT_JSON)
-               assert(parent);
+       if ((output_format & FIO_OUTPUT_JSON) && parent)
+               do_json = true;
+       else
+               do_json = false;
 
-       if (!terse && !(output_format & FIO_OUTPUT_JSON))
+       if (!terse && !do_json)
                log_buf(out, "\nDisk stats (read/write):\n");
 
-       if (output_format & FIO_OUTPUT_JSON)
+       if (do_json)
                json_object_add_disk_utils(parent, &disk_list);
-       if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
+       else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
                flist_for_each(entry, &disk_list) {
                        du = flist_entry(entry, struct disk_util, list);
 
diff --git a/diskutil.h b/diskutil.h
index 25d0beb..ff8a5b0 100644 (file)
@@ -4,8 +4,7 @@
 #define FIO_DU_NAME_SZ         64
 
 #include "lib/output_buffer.h"
-
-extern volatile int helper_exit;
+#include "helper_thread.h"
 
 struct disk_util_stats {
        uint64_t ios[2];
@@ -129,7 +128,7 @@ static inline void print_disk_util(struct disk_util_stat *du,
 
 static inline int update_io_ticks(void)
 {
-       return helper_exit;
+       return helper_should_exit();
 }
 #endif
 
diff --git a/engines/pmemblk.c b/engines/pmemblk.c
new file mode 100644 (file)
index 0000000..6d19864
--- /dev/null
@@ -0,0 +1,504 @@
+/*
+ * pmemblk: IO engine that uses NVML libpmemblk to read and write data
+ *
+ * Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation..
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ */
+
+/*
+ * pmemblk engine
+ *
+ * IO engine that uses libpmemblk to read and write data
+ *
+ * To use:
+ *   ioengine=pmemblk
+ *
+ * Other relevant settings:
+ *   iodepth=1
+ *   direct=1
+ *   thread=1   REQUIRED
+ *   unlink=1
+ *   filename=/pmem0/fiotestfile,BSIZE,FSIZEMB
+ *
+ *   thread must be set to 1 for pmemblk as multiple processes cannot
+ *     open the same block pool file.
+ *
+ *   iodepth should be set to 1 as pmemblk is always synchronous.
+ *   Use numjobs to scale up.
+ *
+ *   direct=1 is implied as pmemblk is always direct.
+ *
+ *   Can set unlink to 1 to remove the block pool file after testing.
+ *
+ *   When specifying the filename, if the block pool file does not already
+ *   exist, then the pmemblk engine can create the pool file if you specify
+ *   the block and file sizes.  BSIZE is the block size in bytes.
+ *   FSIZEMB is the pool file size in MB.
+ *
+ *   See examples/pmemblk.fio for more.
+ *
+ * libpmemblk.so
+ *   By default, the pmemblk engine will let the system find the libpmemblk.so
+ *   that it uses.  You can use an alternative libpmemblk by setting the
+ *   FIO_PMEMBLK_LIB environment variable to the full path to the desired
+ *   libpmemblk.so.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <dlfcn.h>
+#include <string.h>
+
+#include "../fio.h"
+
+/*
+ * libpmemblk
+ */
+struct PMEMblkpool_s;
+typedef struct PMEMblkpool_s PMEMblkpool;
+
+static PMEMblkpool *(*pmemblk_create) (const char *, size_t, size_t, mode_t);
+static PMEMblkpool *(*pmemblk_open) (const char *, size_t);
+static void (*pmemblk_close) (PMEMblkpool *);
+static size_t(*pmemblk_nblock) (PMEMblkpool *);
+static size_t(*pmemblk_bsize) (PMEMblkpool *);
+static int (*pmemblk_read) (PMEMblkpool *, void *, off_t);
+static int (*pmemblk_write) (PMEMblkpool *, const void *, off_t);
+
+int load_libpmemblk(const char *path)
+{
+       void *dl;
+
+       if (!path)
+               path = "libpmemblk.so";
+
+       dl = dlopen(path, RTLD_NOW | RTLD_NODELETE);
+       if (!dl)
+               goto errorout;
+
+       pmemblk_create = dlsym(dl, "pmemblk_create");
+       if (!pmemblk_create)
+               goto errorout;
+       pmemblk_open = dlsym(dl, "pmemblk_open");
+       if (!pmemblk_open)
+               goto errorout;
+       pmemblk_close = dlsym(dl, "pmemblk_close");
+       if (!pmemblk_close)
+               goto errorout;
+       pmemblk_nblock = dlsym(dl, "pmemblk_nblock");
+       if (!pmemblk_nblock)
+               goto errorout;
+       pmemblk_bsize = dlsym(dl, "pmemblk_bsize");
+       if (!pmemblk_bsize)
+               goto errorout;
+       pmemblk_read = dlsym(dl, "pmemblk_read");
+       if (!pmemblk_read)
+               goto errorout;
+       pmemblk_write = dlsym(dl, "pmemblk_write");
+       if (!pmemblk_write)
+               goto errorout;
+
+       return 0;
+
+errorout:
+       log_err("fio: unable to load libpmemblk: %s\n", dlerror());
+       if (dl)
+               dlclose(dl);
+
+       return -1;
+}
+
+typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
+
+struct fio_pmemblk_file {
+       fio_pmemblk_file_t pmb_next;
+       char *pmb_filename;
+       uint64_t pmb_refcnt;
+       PMEMblkpool *pmb_pool;
+       size_t pmb_bsize;
+       size_t pmb_nblocks;
+};
+#define FIOFILEPMBSET(_f, _v)  do {                 \
+       (_f)->engine_data = (uint64_t)(uintptr_t)(_v);  \
+} while(0)
+#define FIOFILEPMBGET(_f)  ((fio_pmemblk_file_t)((_f)->engine_data))
+
+static fio_pmemblk_file_t Cache;
+
+static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER;
+
+#define PMB_CREATE   (0x0001)  /* should create file */
+
+fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename)
+{
+       fio_pmemblk_file_t i;
+
+       for (i = Cache; i != NULL; i = i->pmb_next)
+               if (!strcmp(filename, i->pmb_filename))
+                       return i;
+
+       return NULL;
+}
+
+static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb)
+{
+       pmb->pmb_next = Cache;
+       Cache = pmb;
+}
+
+static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb)
+{
+       fio_pmemblk_file_t i;
+
+       if (pmb == Cache) {
+               Cache = Cache->pmb_next;
+               pmb->pmb_next = NULL;
+               return;
+       }
+
+       for (i = Cache; i != NULL; i = i->pmb_next)
+               if (pmb == i->pmb_next) {
+                       i->pmb_next = i->pmb_next->pmb_next;
+                       pmb->pmb_next = NULL;
+                       return;
+               }
+}
+
+/*
+ * to control block size and gross file size at the libpmemblk
+ * level, we allow the block size and file size to be appended
+ * to the file name:
+ *
+ *   path[,bsize,fsizemb]
+ *
+ * note that we do not use the fio option "filesize" to dictate
+ * the file size because we can only give libpmemblk the gross
+ * file size, which is different from the net or usable file
+ * size (which is probably what fio wants).
+ *
+ * the final path without the parameters is returned in ppath.
+ * the block size and file size are returned in pbsize and fsize.
+ *
+ * note that the user should specify the file size in MiB, but
+ * we return bytes from here.
+ */
+static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize,
+                          uint64_t *pfsize)
+{
+       char *path;
+       char *s;
+       uint64_t bsize;
+       uint64_t fsizemb;
+
+       path = strdup(pathspec);
+       if (!path) {
+               *ppath = NULL;
+               return;
+       }
+
+       /* extract sizes, if given */
+       s = strrchr(path, ',');
+       if (s && (fsizemb = strtoull(s + 1, NULL, 10))) {
+               *s = 0;
+               s = strrchr(path, ',');
+               if (s && (bsize = strtoull(s + 1, NULL, 10))) {
+                       *s = 0;
+                       *ppath = path;
+                       *pbsize = bsize;
+                       *pfsize = fsizemb << 20;
+                       return;
+               }
+       }
+
+       /* size specs not found */
+       strcpy(path, pathspec);
+       *ppath = path;
+       *pbsize = 0;
+       *pfsize = 0;
+}
+
+static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags)
+{
+       fio_pmemblk_file_t pmb;
+       char *path = NULL;
+       uint64_t bsize = 0;
+       uint64_t fsize = 0;
+
+       pmb_parse_path(pathspec, &path, &bsize, &fsize);
+       if (!path)
+               return NULL;
+
+       pthread_mutex_lock(&CacheLock);
+
+       pmb = fio_pmemblk_cache_lookup(path);
+       if (!pmb) {
+               /* load libpmemblk if needed */
+               if (!pmemblk_open)
+                       if (load_libpmemblk(getenv("FIO_PMEMBLK_LIB")))
+                               goto error;
+
+               pmb = malloc(sizeof(*pmb));
+               if (!pmb)
+                       goto error;
+
+               /* try opening existing first, create it if needed */
+               pmb->pmb_pool = pmemblk_open(path, bsize);
+               if (!pmb->pmb_pool && (errno == ENOENT) &&
+                   (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) {
+                       pmb->pmb_pool =
+                           pmemblk_create(path, bsize, fsize, 0644);
+               }
+               if (!pmb->pmb_pool) {
+                       log_err
+                           ("fio: unable to open pmemblk pool file (errno %d)\n",
+                            errno);
+                       goto error;
+               }
+
+               pmb->pmb_filename = path;
+               pmb->pmb_next = NULL;
+               pmb->pmb_refcnt = 0;
+               pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool);
+               pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool);
+
+               fio_pmemblk_cache_insert(pmb);
+       }
+
+       pmb->pmb_refcnt += 1;
+
+       pthread_mutex_unlock(&CacheLock);
+
+       return pmb;
+
+error:
+       if (pmb) {
+               if (pmb->pmb_pool)
+                       pmemblk_close(pmb->pmb_pool);
+               pmb->pmb_pool = NULL;
+               pmb->pmb_filename = NULL;
+               free(pmb);
+       }
+       if (path)
+               free(path);
+
+       pthread_mutex_unlock(&CacheLock);
+       return NULL;
+}
+
+static void pmb_close(fio_pmemblk_file_t pmb, const bool keep)
+{
+       pthread_mutex_lock(&CacheLock);
+
+       pmb->pmb_refcnt--;
+
+       if (!keep && !pmb->pmb_refcnt) {
+               pmemblk_close(pmb->pmb_pool);
+               pmb->pmb_pool = NULL;
+               free(pmb->pmb_filename);
+               pmb->pmb_filename = NULL;
+               fio_pmemblk_cache_remove(pmb);
+               free(pmb);
+       }
+
+       pthread_mutex_unlock(&CacheLock);
+}
+
+static int pmb_get_flags(struct thread_data *td, uint64_t *pflags)
+{
+       static int thread_warned = 0;
+       static int odirect_warned = 0;
+
+       uint64_t flags = 0;
+
+       if (!td->o.use_thread) {
+               if (!thread_warned) {
+                       thread_warned = 1;
+                       log_err("fio: must set thread=1 for pmemblk engine\n");
+               }
+               return 1;
+       }
+
+       if (!td->o.odirect && !odirect_warned) {
+               odirect_warned = 1;
+               log_info("fio: direct == 0, but pmemblk is always direct\n");
+       }
+
+       if (td->o.allow_create)
+               flags |= PMB_CREATE;
+
+       (*pflags) = flags;
+       return 0;
+}
+
+static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f)
+{
+       uint64_t flags = 0;
+       fio_pmemblk_file_t pmb;
+
+       if (pmb_get_flags(td, &flags))
+               return 1;
+
+       pmb = pmb_open(f->file_name, flags);
+       if (!pmb)
+               return 1;
+
+       FIOFILEPMBSET(f, pmb);
+       return 0;
+}
+
+static int fio_pmemblk_close_file(struct thread_data fio_unused *td,
+                                 struct fio_file *f)
+{
+       fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+
+       if (pmb)
+               pmb_close(pmb, false);
+
+       FIOFILEPMBSET(f, NULL);
+       return 0;
+}
+
+static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+       uint64_t flags = 0;
+       fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+
+       if (fio_file_size_known(f))
+               return 0;
+
+       if (!pmb) {
+               if (pmb_get_flags(td, &flags))
+                       return 1;
+               pmb = pmb_open(f->file_name, flags);
+               if (!pmb)
+                       return 1;
+       }
+
+       f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks;
+
+       fio_file_set_size_known(f);
+
+       if (!FIOFILEPMBGET(f))
+               pmb_close(pmb, true);
+
+       return 0;
+}
+
+static int fio_pmemblk_queue(struct thread_data *td, struct io_u *io_u)
+{
+       struct fio_file *f = io_u->file;
+       fio_pmemblk_file_t pmb = FIOFILEPMBGET(f);
+
+       unsigned long long off;
+       unsigned long len;
+       void *buf;
+       int (*blkop) (PMEMblkpool *, void *, off_t) = (void *)pmemblk_write;
+
+       fio_ro_check(td, io_u);
+
+       switch (io_u->ddir) {
+       case DDIR_READ:
+               blkop = pmemblk_read;
+               /* fall through */
+       case DDIR_WRITE:
+               off = io_u->offset;
+               len = io_u->xfer_buflen;
+
+               io_u->error = EINVAL;
+               if (off % pmb->pmb_bsize)
+                       break;
+               if (len % pmb->pmb_bsize)
+                       break;
+               if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks)
+                       break;
+
+               io_u->error = 0;
+               buf = io_u->xfer_buf;
+               off /= pmb->pmb_bsize;
+               len /= pmb->pmb_bsize;
+               while (0 < len) {
+                       if (0 != blkop(pmb->pmb_pool, buf, off)) {
+                               io_u->error = errno;
+                               break;
+                       }
+                       buf += pmb->pmb_bsize;
+                       off++;
+                       len--;
+               }
+               off *= pmb->pmb_bsize;
+               len *= pmb->pmb_bsize;
+               io_u->resid = io_u->xfer_buflen - (off - io_u->offset);
+               break;
+       case DDIR_SYNC:
+       case DDIR_DATASYNC:
+       case DDIR_SYNC_FILE_RANGE:
+               /* we're always sync'd */
+               io_u->error = 0;
+               break;
+       default:
+               io_u->error = EINVAL;
+               break;
+       }
+
+       return FIO_Q_COMPLETED;
+}
+
+static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+       char *path = NULL;
+       uint64_t bsize = 0;
+       uint64_t fsize = 0;
+
+       /*
+        * we need our own unlink in case the user has specified
+        * the block and file sizes in the path name.  we parse
+        * the file_name to determine the file name we actually used.
+        */
+
+       pmb_parse_path(f->file_name, &path, &bsize, &fsize);
+       if (!path)
+               return 1;
+
+       unlink(path);
+       free(path);
+       return 0;
+}
+
+struct ioengine_ops ioengine = {
+       .name = "pmemblk",
+       .version = FIO_IOOPS_VERSION,
+       .queue = fio_pmemblk_queue,
+       .open_file = fio_pmemblk_open_file,
+       .close_file = fio_pmemblk_close_file,
+       .get_file_size = fio_pmemblk_get_file_size,
+       .unlink_file = fio_pmemblk_unlink_file,
+       .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+static void fio_init fio_pmemblk_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_pmemblk_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
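
As a rough usage sketch of the new engine (the full examples/pmemblk.fio added further down covers the details): pmemblk requires thread=1, is always effectively direct, and takes the block size and pool size as comma-separated suffixes on the file name. The path below is hypothetical:

    [pmemblk-sketch]
    ioengine=pmemblk
    thread=1
    direct=1
    bs=4k
    rw=randwrite
    # filename is path,block_size_bytes,pool_size_MiB
    filename=/pmem0/fio-test,4096,1024
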
index 8252d27..87ed360 100644 (file)
@@ -27,6 +27,7 @@ struct rbd_data {
 
 struct rbd_options {
        void *pad;
+       char *cluster_name;
        char *rbd_name;
        char *pool_name;
        char *client_name;
@@ -34,6 +35,15 @@ struct rbd_options {
 };
 
 static struct fio_option options[] = {
+        {
+               .name           = "clustername",
+               .lname          = "ceph cluster name",
+               .type           = FIO_OPT_STR_STORE,
+               .help           = "Cluster name for ceph",
+               .off1           = offsetof(struct rbd_options, cluster_name),
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_RBD,
+        },
        {
                .name           = "rbdname",
                .lname          = "rbd engine rbdname",
@@ -112,7 +122,25 @@ static int _fio_rbd_connect(struct thread_data *td)
        struct rbd_options *o = td->eo;
        int r;
 
-       r = rados_create(&rbd->cluster, o->client_name);
+       if (o->cluster_name) {
+               char *client_name = NULL; 
+
+               /*
+                * If we specify the cluster name, rados_create2()
+                * will not assume 'client.'; the name is treated as
+                * a full 'type.id' string
+                */
+               if (!index(o->client_name, '.')) {
+                       client_name = calloc(1, strlen("client.") +
+                                               strlen(o->client_name) + 1);
+                       strcat(client_name, "client.");
+                       o->client_name = strcat(client_name, o->client_name);
+               }
+               r = rados_create2(&rbd->cluster, o->cluster_name,
+                                       o->client_name, 0);
+       } else
+               r = rados_create(&rbd->cluster, o->client_name);
+       
        if (r < 0) {
                log_err("rados_create failed.\n");
                goto failed_early;
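
A minimal sketch of a job using the new clustername option; the cluster, client, pool, and image names below are assumptions, not defaults:

    [rbd-cluster]
    ioengine=rbd
    clustername=ceph
    clientname=admin
    pool=rbd
    rbdname=fio_test
    rw=randwrite
    bs=4k
    iodepth=32
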
index 87ba465..7fbfad9 100644 (file)
@@ -415,7 +415,7 @@ static int fio_rdmaio_setup_qp(struct thread_data *td)
                rd->pd = ibv_alloc_pd(rd->cm_id->verbs);
 
        if (rd->pd == NULL) {
-               log_err("fio: ibv_alloc_pd fail\n");
+               log_err("fio: ibv_alloc_pd fail: %m\n");
                return 1;
        }
 
@@ -424,7 +424,7 @@ static int fio_rdmaio_setup_qp(struct thread_data *td)
        else
                rd->channel = ibv_create_comp_channel(rd->cm_id->verbs);
        if (rd->channel == NULL) {
-               log_err("fio: ibv_create_comp_channel fail\n");
+               log_err("fio: ibv_create_comp_channel fail: %m\n");
                goto err1;
        }
 
@@ -438,12 +438,12 @@ static int fio_rdmaio_setup_qp(struct thread_data *td)
                rd->cq = ibv_create_cq(rd->cm_id->verbs,
                                       qp_depth, rd, rd->channel, 0);
        if (rd->cq == NULL) {
-               log_err("fio: ibv_create_cq failed\n");
+               log_err("fio: ibv_create_cq failed: %m\n");
                goto err2;
        }
 
        if (ibv_req_notify_cq(rd->cq, 0) != 0) {
-               log_err("fio: ibv_create_cq failed\n");
+               log_err("fio: ibv_req_notify_cq failed: %m\n");
                goto err3;
        }
 
@@ -459,13 +459,13 @@ static int fio_rdmaio_setup_qp(struct thread_data *td)
 
        if (rd->is_client == 0) {
                if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) {
-                       log_err("fio: rdma_create_qp failed\n");
+                       log_err("fio: rdma_create_qp failed: %m\n");
                        goto err3;
                }
                rd->qp = rd->child_cm_id->qp;
        } else {
                if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) {
-                       log_err("fio: rdma_create_qp failed\n");
+                       log_err("fio: rdma_create_qp failed: %m\n");
                        goto err3;
                }
                rd->qp = rd->cm_id->qp;
@@ -490,14 +490,14 @@ static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td)
        rd->recv_mr = ibv_reg_mr(rd->pd, &rd->recv_buf, sizeof(rd->recv_buf),
                                 IBV_ACCESS_LOCAL_WRITE);
        if (rd->recv_mr == NULL) {
-               log_err("fio: recv_buf reg_mr failed\n");
+               log_err("fio: recv_buf reg_mr failed: %m\n");
                return 1;
        }
 
        rd->send_mr = ibv_reg_mr(rd->pd, &rd->send_buf, sizeof(rd->send_buf),
                                 0);
        if (rd->send_mr == NULL) {
-               log_err("fio: send_buf reg_mr failed\n");
+               log_err("fio: send_buf reg_mr failed: %m\n");
                ibv_dereg_mr(rd->recv_mr);
                return 1;
        }
@@ -731,7 +731,7 @@ static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
                }
 
                if (ibv_post_send(rd->qp, &r_io_u_d->sq_wr, &bad_wr) != 0) {
-                       log_err("fio: ibv_post_send fail\n");
+                       log_err("fio: ibv_post_send fail: %m\n");
                        return -1;
                }
 
@@ -759,7 +759,7 @@ static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us,
                        r_io_u_d = io_us[i]->engine_data;
                        if (ibv_post_recv(rd->qp, &r_io_u_d->rq_wr, &bad_wr) !=
                            0) {
-                               log_err("fio: ibv_post_recv fail\n");
+                               log_err("fio: ibv_post_recv fail: %m\n");
                                return 1;
                        }
                }
@@ -767,7 +767,7 @@ static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us,
                   || (rd->rdma_protocol == FIO_RDMA_MEM_WRITE)) {
                /* re-post the rq_wr */
                if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
-                       log_err("fio: ibv_post_recv fail\n");
+                       log_err("fio: ibv_post_recv fail: %m\n");
                        return 1;
                }
 
@@ -866,7 +866,7 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
        conn_param.retry_count = 10;
 
        if (rdma_connect(rd->cm_id, &conn_param) != 0) {
-               log_err("fio: rdma_connect fail\n");
+               log_err("fio: rdma_connect fail: %m\n");
                return 1;
        }
 
@@ -881,7 +881,7 @@ static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
        rd->send_buf.nr = htonl(td->o.iodepth);
 
        if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-               log_err("fio: ibv_post_send fail");
+               log_err("fio: ibv_post_send fail: %m");
                return 1;
        }
 
@@ -918,7 +918,7 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)
        conn_param.initiator_depth = 1;
 
        if (rdma_accept(rd->child_cm_id, &conn_param) != 0) {
-               log_err("fio: rdma_accept\n");
+               log_err("fio: rdma_accept: %m\n");
                return 1;
        }
 
@@ -932,7 +932,7 @@ static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)
        ret = rdma_poll_wait(td, IBV_WC_RECV) < 0;
 
        if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-               log_err("fio: ibv_post_send fail");
+               log_err("fio: ibv_post_send fail: %m");
                return 1;
        }
 
@@ -965,7 +965,7 @@ static int fio_rdmaio_close_file(struct thread_data *td, struct fio_file *f)
                                     || (rd->rdma_protocol ==
                                         FIO_RDMA_MEM_READ))) {
                if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-                       log_err("fio: ibv_post_send fail");
+                       log_err("fio: ibv_post_send fail: %m");
                        return 1;
                }
 
@@ -1084,12 +1084,12 @@ static int fio_rdmaio_setup_listen(struct thread_data *td, short port)
 
        /* rdma_listen */
        if (rdma_bind_addr(rd->cm_id, (struct sockaddr *)&rd->addr) != 0) {
-               log_err("fio: rdma_bind_addr fail\n");
+               log_err("fio: rdma_bind_addr fail: %m\n");
                return 1;
        }
 
        if (rdma_listen(rd->cm_id, 3) != 0) {
-               log_err("fio: rdma_listen fail\n");
+               log_err("fio: rdma_listen fail: %m\n");
                return 1;
        }
 
@@ -1110,7 +1110,7 @@ static int fio_rdmaio_setup_listen(struct thread_data *td, short port)
 
        /* post recv buf */
        if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
-               log_err("fio: ibv_post_recv fail\n");
+               log_err("fio: ibv_post_recv fail: %m\n");
                return 1;
        }
 
@@ -1238,13 +1238,13 @@ static int fio_rdmaio_init(struct thread_data *td)
 
        rd->cm_channel = rdma_create_event_channel();
        if (!rd->cm_channel) {
-               log_err("fio: rdma_create_event_channel fail\n");
+               log_err("fio: rdma_create_event_channel fail: %m\n");
                return 1;
        }
 
        ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP);
        if (ret) {
-               log_err("fio: rdma_create_id fail\n");
+               log_err("fio: rdma_create_id fail: %m\n");
                return 1;
        }
 
@@ -1295,7 +1295,7 @@ static int fio_rdmaio_init(struct thread_data *td)
                                      IBV_ACCESS_REMOTE_READ |
                                      IBV_ACCESS_REMOTE_WRITE);
                if (io_u->mr == NULL) {
-                       log_err("fio: ibv_reg_mr io_u failed\n");
+                       log_err("fio: ibv_reg_mr io_u failed: %m\n");
                        return 1;
                }
 
index f5801fe..260ef66 100644 (file)
@@ -13,6 +13,7 @@
 #include <assert.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
 /*
  * Sync engine uses engine_data to store last offset
@@ -31,6 +32,28 @@ struct syncio_data {
        enum fio_ddir last_ddir;
 };
 
+#ifdef CONFIG_PWRITEV2
+struct psyncv2_options {
+       void *pad;
+       unsigned int hipri;
+};
+
+static struct fio_option options[] = {
+       {
+               .name   = "hipri",
+               .lname  = "RWF_HIPRI",
+               .type   = FIO_OPT_STR_SET,
+               .off1   = offsetof(struct psyncv2_options, hipri),
+               .help   = "Set RWF_HIPRI for pwritev2/preadv2",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_INVALID,
+       },
+       {
+               .name   = NULL,
+       },
+};
+#endif
+
 static int fio_syncio_prep(struct thread_data *td, struct io_u *io_u)
 {
        struct fio_file *f = io_u->file;
@@ -98,6 +121,38 @@ static int fio_pvsyncio_queue(struct thread_data *td, struct io_u *io_u)
 }
 #endif
 
+#ifdef CONFIG_PWRITEV2
+static int fio_pvsyncio2_queue(struct thread_data *td, struct io_u *io_u)
+{
+       struct syncio_data *sd = td->io_ops->data;
+       struct psyncv2_options *o = td->eo;
+       struct iovec *iov = &sd->iovecs[0];
+       struct fio_file *f = io_u->file;
+       int ret, flags = 0;
+
+       fio_ro_check(td, io_u);
+
+       if (o->hipri)
+               flags |= RWF_HIPRI;
+
+       iov->iov_base = io_u->xfer_buf;
+       iov->iov_len = io_u->xfer_buflen;
+
+       if (io_u->ddir == DDIR_READ)
+               ret = preadv2(f->fd, iov, 1, io_u->offset, flags);
+       else if (io_u->ddir == DDIR_WRITE)
+               ret = pwritev2(f->fd, iov, 1, io_u->offset, flags);
+       else if (io_u->ddir == DDIR_TRIM) {
+               do_io_u_trim(td, io_u);
+               return FIO_Q_COMPLETED;
+       } else
+               ret = do_io_u_sync(td, io_u);
+
+       return fio_io_end(td, io_u, ret);
+}
+#endif
+
+
 static int fio_psyncio_queue(struct thread_data *td, struct io_u *io_u)
 {
        struct fio_file *f = io_u->file;
@@ -374,6 +429,22 @@ static struct ioengine_ops ioengine_pvrw = {
 };
 #endif
 
+#ifdef CONFIG_PWRITEV2
+static struct ioengine_ops ioengine_pvrw2 = {
+       .name           = "pvsync2",
+       .version        = FIO_IOOPS_VERSION,
+       .init           = fio_vsyncio_init,
+       .cleanup        = fio_vsyncio_cleanup,
+       .queue          = fio_pvsyncio2_queue,
+       .open_file      = generic_open_file,
+       .close_file     = generic_close_file,
+       .get_file_size  = generic_get_file_size,
+       .flags          = FIO_SYNCIO,
+       .options        = options,
+       .option_struct_size     = sizeof(struct psyncv2_options),
+};
+#endif
+
 static void fio_init fio_syncio_register(void)
 {
        register_ioengine(&ioengine_rw);
@@ -382,6 +453,9 @@ static void fio_init fio_syncio_register(void)
 #ifdef CONFIG_PWRITEV
        register_ioengine(&ioengine_pvrw);
 #endif
+#ifdef CONFIG_PWRITEV2
+       register_ioengine(&ioengine_pvrw2);
+#endif
 }
 
 static void fio_exit fio_syncio_unregister(void)
@@ -392,4 +466,7 @@ static void fio_exit fio_syncio_unregister(void)
 #ifdef CONFIG_PWRITEV
        unregister_ioengine(&ioengine_pvrw);
 #endif
+#ifdef CONFIG_PWRITEV2
+       unregister_ioengine(&ioengine_pvrw2);
+#endif
 }
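
A minimal sketch of a job driving the new pvsync2 engine with the hipri flag; it needs a kernel and libc with preadv2()/pwritev2() support (CONFIG_PWRITEV2 set by configure), and the device path is hypothetical:

    [pvsync2-hipri]
    ioengine=pvsync2
    hipri
    direct=1
    rw=randread
    bs=4k
    runtime=30
    time_based
    filename=/dev/nvme0n1
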
diff --git a/examples/jesd219.fio b/examples/jesd219.fio
new file mode 100644 (file)
index 0000000..ab2c40e
--- /dev/null
@@ -0,0 +1,19 @@
+# Sample implementation of the JESD219 workload for SSD endurance
+# testing. It uses a specific distribution of block sizes and
+# read/write mix, as well as a specific distribution of where on
+# the device the IO accesses will land. Based on posting from
+# Jeff Furlong <jeff.furlong@hgst.com>
+[JESD219]
+ioengine=libaio
+direct=1
+rw=randrw
+norandommap
+randrepeat=0
+rwmixread=40
+rwmixwrite=60
+iodepth=256
+numjobs=4
+bssplit=512/4:1024/1:1536/1:2048/1:2560/1:3072/1:3584/1:4k/67:8k/10:16k/7:32k/3:64k/3
+random_distribution=zoned:50/5:30/15:20/80
+filename=/dev/nvme0n1
+group_reporting=1
diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio
new file mode 100644 (file)
index 0000000..2d5ecfc
--- /dev/null
@@ -0,0 +1,71 @@
+[global]
+bs=1m
+ioengine=pmemblk
+norandommap
+time_based=1
+runtime=30
+group_reporting
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+cpus_allowed_policy=split
+
+# For the pmemblk engine:
+#
+#   IOs always complete immediately
+#   IOs are always direct
+#   Must use threads
+#
+iodepth=1
+direct=1
+thread=1
+numjobs=16
+#
+# Unlink can be used to remove the files when done, but if you are
+# using serial runs with stonewall, and you want the files to be created
+# only once and unlinked only at the very end, then put the unlink=1
+# in the last group.  This is the method demonstrated here.
+#
+# Note that if you have a read-only group and if the files will be
+# newly created, then all of the data will read back as zero and the
+# read will be optimized, yielding performance that is different from
+# that of reading non-zero blocks (or unoptimized zero blocks).
+#
+unlink=0
+#
+# The pmemblk engine does IO to files in a DAX-mounted filesystem.
+# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
+# and then mounted with the '-o dax' option.  Note that the engine
+# accesses the underlying NVDIMM directly, bypassing the kernel block
+# layer, so the usual filesystem/disk performance monitoring tools such
+# as iostat will not provide useful data.
+#
+# Here we specify a test file on each of two NVDIMMs.  The first
+# number after the file name is the block size in bytes (4096 bytes
+# in this example).  The second number is the size of the file to
+# create in MiB (1 GiB in this example); note that the actual usable
+# space available to fio will be less than this as libpmemblk requires
+# some space for metadata.
+#
+# Currently, the minimum block size is 512 bytes and the minimum file
+# size is about 17 MiB (these are libpmemblk requirements).
+#
+# While both files in this example have the same block size and file
+# size, this is not required.
+#
+filename=/pmem0/fio-test,4096,1024
+filename=/pmem1/fio-test,4096,1024
+
+[pmemblk-write]
+rw=randwrite
+stonewall
+
+[pmemblk-read]
+rw=randread
+stonewall
+#
+# We're done, so unlink the file:
+#
+unlink=1
+
diff --git a/examples/rand-zones.fio b/examples/rand-zones.fio
new file mode 100644 (file)
index 0000000..da13fa3
--- /dev/null
@@ -0,0 +1,18 @@
+# Sample job file demonstrating how to use zoned random distributions
+# to have skewed random accesses. This example has 50% of the accesses
+# to the first 5% of the file (50/5), 30% to the next 15% (30/15), and
+# finally 20% of the IO will end up in the remaining 80%.
+[zones]
+size=2g
+direct=1
+bs=4k
+rw=randread
+norandommap
+random_distribution=zoned:50/5:30/15:20/80
+
+# The above applies to all of reads/writes/trims. If we wanted to do
+# something differently for writes, let's say 50% for the first 10%
+# and 50% for the remaining 90%, we could do it by adding a new section
+# after a comma.
+
+# random_distribution=zoned:50/5:30/15:20/,50/10:50/90
diff --git a/file.h b/file.h
index a631766..0cf622f 100644 (file)
--- a/file.h
+++ b/file.h
@@ -39,13 +39,20 @@ enum file_lock_mode {
 };
 
 /*
- * roundrobin available files, or choose one at random, or do each one
- * serially.
+ * How fio chooses which file to service next: uniformly random, one of the
+ * skewed random variants (zipf, pareto, gauss), sequential, or round robin.
  */
 enum {
-       FIO_FSERVICE_RANDOM     = 1,
-       FIO_FSERVICE_RR         = 2,
-       FIO_FSERVICE_SEQ        = 3,
+       FIO_FSERVICE_RANDOM             = 1,
+       FIO_FSERVICE_RR                 = 2,
+       FIO_FSERVICE_SEQ                = 3,
+       __FIO_FSERVICE_NONUNIFORM       = 0x100,
+       FIO_FSERVICE_ZIPF               = __FIO_FSERVICE_NONUNIFORM | 4,
+       FIO_FSERVICE_PARETO             = __FIO_FSERVICE_NONUNIFORM | 5,
+       FIO_FSERVICE_GAUSS              = __FIO_FSERVICE_NONUNIFORM | 6,
+
+       FIO_FSERVICE_SHIFT              = 10,
 };
 
 /*
@@ -97,6 +104,13 @@ struct fio_file {
        uint64_t first_write;
        uint64_t last_write;
 
+       /*
+        * Tracks the last iodepth number of completed writes, if data
+        * verification is enabled
+        */
+       uint64_t *last_write_comp;
+       unsigned int last_write_idx;
+
        /*
         * For use by the io engine
         */
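
A minimal sketch of the non-uniform file selection these new FIO_FSERVICE_* values enable; per the fio.1 additions in this change, a floating point postfix sets the skew (a hypothetical zipf theta of 1.2 over 16 files in an assumed directory):

    [zipf-file-select]
    directory=/tmp/fio-files
    nrfiles=16
    file_service_type=zipf:1.2
    rw=randread
    bs=4k
    size=256m
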
index a821632..f721c36 100644 (file)
@@ -761,12 +761,16 @@ static unsigned long long get_fs_free_counts(struct thread_data *td)
 uint64_t get_start_offset(struct thread_data *td, struct fio_file *f)
 {
        struct thread_options *o = &td->o;
+       uint64_t offset;
 
        if (o->file_append && f->filetype == FIO_TYPE_FILE)
                return f->real_file_size;
 
-       return td->o.start_offset +
-               td->subjob_number * td->o.offset_increment;
+       offset = td->o.start_offset + td->subjob_number * td->o.offset_increment;
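+       /* align the start offset down to a multiple of the largest block size */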
+       if (offset % td_max_bs(td))
+               offset -= (offset % td_max_bs(td));
+
+       return offset;
 }
 
 /*
@@ -809,6 +813,7 @@ int setup_files(struct thread_data *td)
         */
        total_size = 0;
        for_each_file(td, f, i) {
+               f->fileno = i;
                if (f->real_file_size == -1ULL)
                        total_size = -1ULL;
                else
@@ -889,8 +894,10 @@ int setup_files(struct thread_data *td)
                if (f->io_size == -1ULL)
                        total_size = -1ULL;
                else {
-                        if (o->size_percent)
-                                f->io_size = (f->io_size * o->size_percent) / 100;
+                        if (o->size_percent) {
+                               f->io_size = (f->io_size * o->size_percent) / 100;
+                               f->io_size -= (f->io_size % td_min_bs(td));
+                       }
                        total_size += f->io_size;
                }
 
diff --git a/fio.1 b/fio.1
index 7bdfea3..5e4cd4f 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -566,10 +566,24 @@ Round robin over opened files (default).
 .TP
 .B sequential
 Do each file in the set sequentially.
+.TP
+.B zipf
+Use a zipfian distribution to decide what file to access.
+.TP
+.B pareto
+Use a pareto distribution to decide what file to access.
+.TP
+.B gauss
+Use a gaussian (normal) distribution to decide what file to access.
 .RE
 .P
-The number of I/Os to issue before switching to a new file can be specified by
-appending `:\fIint\fR' to the service type.
+For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be
+appended to tell fio how many I/Os to issue before switching to a new file.
+For example, specifying \fBfile_service_type=random:8\fR would cause fio to
+issue \fI8\fR I/Os before selecting a new file at random. For the non-uniform
+distributions, a floating point postfix can be given to influence how the
+distribution is skewed. See \fBrandom_distribution\fR for a description of how
+that would work.
 .RE
 .TP
 .BI ioengine \fR=\fPstr
@@ -591,6 +605,9 @@ coalescing adjacent IOs into a single submission.
 .B pvsync
 Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
 .TP
+.B pvsync2
+Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
+.TP
 .B libaio
 Linux native asynchronous I/O. This ioengine defines engine specific options.
 .TP
@@ -697,6 +714,9 @@ treated as erases. Depending on the underlying device type, the I/O may have
 to go in a certain pattern, e.g., on NAND, writing sequentially to erase blocks
 and discarding before overwriting. The writetrim mode works well for this
 constraint.
+.TP
+.B pmemblk
+Read and write through the NVML libpmemblk interface.
 .RE
 .P
 .RE
@@ -871,15 +891,51 @@ Zipf distribution
 .B pareto
 Pareto distribution
 .TP
+.B gauss
+Normal (gaussian) distribution
+.TP
+.B zoned
+Zoned random distribution
 .RE
-.P
-When using a zipf or pareto distribution, an input value is also needed to
-define the access pattern. For zipf, this is the zipf theta. For pareto,
-it's the pareto power. Fio includes a test program, genzipf, that can be
-used visualize what the given input values will yield in terms of hit rates.
-If you wanted to use zipf with a theta of 1.2, you would use
+When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
+needed to define the access pattern. For \fBzipf\fR, this is the zipf theta.
+For \fBpareto\fR, it's the pareto power. Fio includes a test program, genzipf,
+that can be used to visualize what the given input values will yield in terms of
+hit rates. If you wanted to use \fBzipf\fR with a theta of 1.2, you would use
 random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
-fio will disable use of the random map.
+fio will disable use of the random map. For the \fBgauss\fR distribution, a
+normal deviation is supplied as a value between 0 and 100.
+.P
+.RS
+For a \fBzoned\fR distribution, fio supports specifying what percentage of IO
+accesses should fall within a given range of the file or device. For example,
+given criteria of:
+.P
+.RS
+60% of accesses should be to the first 10%
+.RE
+.RS
+30% of accesses should be to the next 20%
+.RE
+.RS
+8% of accesses should be to the next 30%
+.RE
+.RS
+2% of accesses should be to the next 40%
+.RE
+.P
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.P
+.RS
+.B random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+similarly to how \fBbssplit\fR works for setting ranges and percentages of block
+sizes. Like \fBbssplit\fR, it's possible to specify separate zones for reads,
+writes, and trims. If just one set is given, it'll apply to all of them.
+.RE
 .TP
 .BI percentage_random \fR=\fPint
 For a random workload, set how big a percentage should be random. This defaults
@@ -920,7 +976,9 @@ guarantees that we never generate the same offset twice, and it's also less
 computationally expensive. It's not a true random generator, however, though
 for IO purposes it's typically good enough. LFSR only works with single block
 sizes, not with workloads that use multiple block sizes. If used with such a
-workload, fio may read or write some blocks multiple times.
+workload, fio may read or write some blocks multiple times. The default
+value is tausworthe, unless the required space exceeds 2^32 blocks. If it does,
+then tausworthe64 is selected automatically.
 .TP
 .BI nice \fR=\fPint
 Run job with given nice value.  See \fBnice\fR\|(2).
@@ -1139,12 +1197,14 @@ Terminate all jobs if one job finishes in error.  Default: wait for each job
 to finish.
 .TP
 .BI bwavgtime \fR=\fPint
-Average bandwidth calculations over the given time in milliseconds.  Default:
-500ms.
+Average bandwidth calculations over the given time in milliseconds. If the job
+also does bandwidth logging through \fBwrite_bw_log\fR, then the minimum of
+this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
 .TP
 .BI iopsavgtime \fR=\fPint
-Average IOPS calculations over the given time in milliseconds.  Default:
-500ms.
+Average IOPS calculations over the given time in milliseconds. If the job
+also does IOPS logging through \fBwrite_iops_log\fR, then the minimum of
+this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
 .TP
 .BI create_serialize \fR=\fPbool
 If true, serialize file creation for the jobs.  Default: true.
@@ -1400,7 +1460,8 @@ fio_generate_plots script uses gnuplot to turn these text files into nice
 graphs. See \fBwrite_lat_log\fR for behaviour of given filename. For this
 option, the postfix is _bw.x.log, where x is the index of the job (1..N,
 where N is the number of jobs). If \fBper_job_logs\fR is false, then the
-filename will not include the job index.
+filename will not include the job index. See the \fBLOG FILE FORMATS\fR
+section.
 .TP
 .BI write_lat_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes I/O completion latencies.  If no
@@ -1408,21 +1469,27 @@ filename is given with this option, the default filename of
 "jobname_type.x.log" is used, where x is the index of the job (1..N, where
 N is the number of jobs). Even if the filename is given, fio will still
 append the type of log. If \fBper_job_logs\fR is false, then the filename will
-not include the job index.
+not include the job index. See the \fBLOG FILE FORMATS\fR section.
 .TP
 .BI write_iops_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this
 option, the default filename of "jobname_type.x.log" is used, where x is the
 index of the job (1..N, where N is the number of jobs). Even if the filename
 is given, fio will still append the type of log. If \fBper_job_logs\fR is false,
-then the filename will not include the job index.
+then the filename will not include the job index. See the \fBLOG FILE FORMATS\fR
+section.
 .TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
 IO that completes. When writing to the disk log, that can quickly grow to a
 very large size. Setting this option makes fio average each log entry
-over the specified period of time, reducing the resolution of the log.
-Defaults to 0.
+over the specified period of time, reducing the resolution of the log. See
+\fBlog_max_value\fR as well.  Defaults to 0, logging all entries.
+.TP
+.BI log_max_value \fR=\fPbool
+If \fBlog_avg_msec\fR is set, fio logs the average over that window. If you
+instead want to log the maximum value, set this option to 1.  Defaults to
+0, meaning that averaged values are logged.
 .TP
 .BI log_offset \fR=\fPbool
 If this is set, the iolog options will include the byte offset for the IO
@@ -1640,6 +1707,10 @@ from user-space to reap events. The reaping mode is only
 enabled when polling for a minimum of 0 events (eg when
 iodepth_batch_complete=0).
 .TP
+.BI (pvsync2)hipri
+Set RWF_HIPRI on IO, indicating to the kernel that it's of
+higher priority than normal.
+.TP
 .BI (net,netsplice)hostname \fR=\fPstr
 The host name or IP address to use for TCP or UDP based IO.
 If the job is a TCP listener or UDP reader, the hostname is not
@@ -1720,6 +1791,9 @@ Preallocate donor's file on init
 .BI 1:
 allocate space immediately inside defragment event, and free right after event
 .RE
+.TP 
+.BI (rbd)clustername \fR=\fPstr
+Specifies the name of the ceph cluster.
 .TP
 .BI (rbd)rbdname \fR=\fPstr
 Specifies the name of the RBD.
@@ -1728,7 +1802,9 @@ Specifies the name of the RBD.
 Specifies the name of the Ceph pool containing the RBD.
 .TP
 .BI (rbd)clientname \fR=\fPstr
-Specifies the username (without the 'client.' prefix) used to access the Ceph cluster.
+Specifies the username (without the 'client.' prefix) used to access the Ceph
+cluster. If the clustername is specified, the clientname shall be the full
+type.id string. If no type. prefix is given, fio will add 'client.' by default.
 .TP
 .BI (mtd)skipbad \fR=\fPbool
 Skip operations against known bad blocks.
@@ -1821,7 +1897,9 @@ and standard deviation.
 .TP
 .B cpu
 CPU usage statistics. Includes user and system time, number of context switches
-this thread went through and number of major and minor page faults.
+this thread went through and number of major and minor page faults. The CPU
+utilization numbers are averages for the jobs in that reporting group, while
+the context and fault counters are summed.
 .TP
 .B IO depths
 Distribution of I/O depths.  Each depth includes everything less than (or equal)
@@ -1981,6 +2059,246 @@ Error Info (dependent on continue_on_error, default off):
 .P
 .B text description (if provided in config - appears on newline)
 .RE
+.SH TRACE FILE FORMAT
+There are two trace file formats that you can encounter. The older (v1) format
+is unsupported since version 1.20-rc3 (March 2008). It is still described
+below in case you get an old trace and want to understand it.
+
+In any case the trace is a simple text file with a single action per line.
+
+.P
+.B Trace file format v1
+.RS
+Each line represents a single io action in the following format:
+
+rw, offset, length
+
+where rw=0/1 for read/write, and the offset and length entries are in bytes.
+
+This format is not supported in Fio versions >= 1.20-rc3.
+
+.RE
+.P
+.B Trace file format v2
+.RS
+The second version of the trace file format was added in Fio version 1.17.
+It allows access to more than one file per trace and has a bigger set of
+possible file actions.
+
+The first line of the trace file has to be:
+
+\fBfio version 2 iolog\fR
+
+Following this can be lines in two different formats, which are described below.
+The file management format:
+
+\fBfilename action\fR
+
+The filename is given as an absolute path. The action can be one of these:
+
+.P
+.PD 0
+.RS
+.TP
+.B add
+Add the given filename to the trace
+.TP
+.B open
+Open the file with the given filename. The filename has to have been previously
+added with the \fBadd\fR action.
+.TP
+.B close
+Close the file with the given filename. The file must have previously been
+opened.
+.RE
+.PD
+.P
+
+The file io action format:
+
+\fBfilename action offset length\fR
+
+The filename is given as an absolute path, and has to have been added and opened
+before it can be used with this format. The offset and length are given in
+bytes. The action can be one of these:
+
+.P
+.PD 0
+.RS
+.TP
+.B wait
+Wait for 'offset' microseconds. Everything below 100 is discarded.  The time is
+relative to the previous wait statement.
+.TP
+.B read
+Read \fBlength\fR bytes beginning from \fBoffset\fR
+.TP
+.B write
+Write \fBlength\fR bytes beginning from \fBoffset\fR
+.TP
+.B sync
+fsync() the file
+.TP
+.B datasync
+fdatasync() the file
+.TP
+.B trim
+trim the given file from the given \fBoffset\fR for \fBlength\fR bytes
+.RE
+.PD
+.P
+
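
A short sample of a v2 trace using the actions above; the device path is an assumption and offsets/lengths are in bytes:

    fio version 2 iolog
    /dev/sdb add
    /dev/sdb open
    /dev/sdb read 0 4096
    /dev/sdb write 4096 4096
    /dev/sdb close
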
+.SH CPU IDLENESS PROFILING
+In some cases, we want to understand CPU overhead in a test. For example,
+we test patches for the specific goodness of whether they reduce CPU usage.
+fio implements a balloon approach to create a thread per CPU that runs at
+idle priority, meaning that it only runs when nobody else needs the cpu.
+By measuring the amount of work completed by the thread, idleness of each
+CPU can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. The
+mean and standard deviation of the time to complete a unit of work are reported
+in the "unit work" section. Options can be chosen to report detailed percpu
+idleness or overall system idleness by aggregating percpu stats.
+
+.SH VERIFICATION AND TRIGGERS
+Fio is usually run in one of two ways, when data verification is done. The
+first is a normal write job of some sort with verify enabled. When the
+write phase has completed, fio switches to reads and verifies everything
+it wrote. The second model is running just the write phase, and then later
+on running the same job (but with reads instead of writes) to repeat the
+same IO patterns and verify the contents. Both of these methods depend
+on the write phase being completed, as fio otherwise has no idea how much
+data was written.
+
+With verification triggers, fio supports dumping the current write state
+to local files. Then a subsequent read verify workload can load this state
+and know exactly where to stop. This is useful for testing cases where
+power is cut to a server in a managed fashion, for instance.
+
+A verification trigger consists of two things:
+
+.RS
+Storing the write state of each job
+.LP
+Executing a trigger command
+.RE
+
+The write state is relatively small, on the order of hundreds of bytes
+to single kilobytes. It contains information on the number of completions
+done, the last X completions, etc.
+
+A trigger is invoked either through creation (\fBtouch\fR) of a specified
+file in the system, or through a timeout setting. If fio is run with
+\fB\-\-trigger\-file=/tmp/trigger-file\fR, then it will continually check for
+the existence of /tmp/trigger-file. When it sees this file, it will
+fire off the trigger (thus saving state, and executing the trigger
+command).
+
+For client/server runs, there's both a local and remote trigger. If
+fio is running as a server backend, it will send the job states back
+to the client for safe storage, then execute the remote trigger, if
+specified. If a local trigger is specified, the server will still send
+back the write state, but the client will then execute the trigger.
+
+.RE
+.P
+.B Verification trigger example
+.RS
+
+Let's say we want to run a powercut test on the remote machine 'server'.
+Our write workload is in write-test.fio. We want to cut power to 'server'
+at some point during the run, and we'll run this test from the safety
+of our local machine, 'localbox'. On the server, we'll start the fio
+backend normally:
+
+server# \fBfio \-\-server\fR
+
+and on the client, we'll fire off the workload:
+
+localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq-trigger""\fR
+
+We set \fB/tmp/my-trigger\fR as the trigger file, and we tell fio to execute
+
+\fBecho b > /proc/sysrq-trigger\fR
+
+on the server once it has received the trigger and sent us the write
+state. This will work, but it's not \fIreally\fR cutting power to the server,
+it's merely abruptly rebooting it. If we have a remote way of cutting
+power to the server through IPMI or similar, we could do that through
+a local trigger command instead. Let's assume we have a script that does
+IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
+then have run fio with a local trigger instead:
+
+localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi-reboot server"\fR
+
+For this case, fio would wait for the server to send us the write state,
+then execute 'ipmi-reboot server' when that happened.
+
+.RE
+.P
+.B Loading verify state
+.RS
+To load stored write state, the read verification job file must contain
+the verify_state_load option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send
+the files over and load them from there.
+
+.RE
+
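
As a sketch of the load side described above, a read-back job can add verify_state_load on top of the original verify settings; the verify type and filename here are assumptions:

    [verify-load]
    filename=/dev/sdc
    rw=read
    bs=4k
    verify=crc32c
    verify_state_load=1
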
+.SH LOG FILE FORMATS
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+.B time (msec), value, data direction, offset
+
+Time for the log entry is always in milliseconds. The value logged depends
+on the type of log; it will be one of the following:
+
+.P
+.PD 0
+.TP
+.B Latency log
+Value is latency in usecs
+.TP
+.B Bandwidth log
+Value is in KB/sec
+.TP
+.B IOPS log
+Value is in IOPS
+.PD
+.P
+
+Data direction is one of the following:
+
+.P
+.PD 0
+.TP
+.B 0
+IO is a READ
+.TP
+.B 1
+IO is a WRITE
+.TP
+.B 2
+IO is a TRIM
+.PD
+.P
+
+The \fIoffset\fR is the offset, in bytes, from the start of the file, for that
+particular IO. The logging of the offset can be toggled with \fBlog_offset\fR.
+
+If windowed logging is enabled through \fBlog_avg_msec\fR, then fio doesn't log
+individual IOs. Instead it logs the average values over the specified
+period of time. Since \fIdata direction\fR and \fIoffset\fR are per-IO values,
+they aren't applicable if windowed logging is enabled. If windowed logging
+is enabled and \fBlog_max_value\fR is set, then fio logs maximum values in
+that window instead of averages.
+
+.RE
+
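
A sketch of a job producing windowed bandwidth logs as described above, keeping the per-window maximum rather than the average; the log filename prefix is arbitrary:

    [bw-logging]
    rw=randread
    bs=4k
    size=1g
    write_bw_log=bwlog
    log_avg_msec=1000
    log_max_value=1
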
 .SH CLIENT / SERVER
 Normally you would run fio as a stand-alone application on the machine
 where the IO workload should be generated. However, it is also possible to
@@ -1998,34 +2316,34 @@ for TCP/IP v4, 'ip6' for TCP/IP v6, or 'sock' for a local unix domain
 socket. 'hostname' is either a hostname or IP address, and 'port' is the port to
 listen to (only valid for TCP/IP, not a local socket). Some examples:
 
-1) fio \-\-server
+1) \fBfio \-\-server\fR
 
    Start a fio server, listening on all interfaces on the default port (8765).
 
-2) fio \-\-server=ip:hostname,4444
+2) \fBfio \-\-server=ip:hostname,4444\fR
 
    Start a fio server, listening on IP belonging to hostname and on port 4444.
 
-3) fio \-\-server=ip6:::1,4444
+3) \fBfio \-\-server=ip6:::1,4444\fR
 
    Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
 
-4) fio \-\-server=,4444
+4) \fBfio \-\-server=,4444\fR
 
    Start a fio server, listening on all interfaces on port 4444.
 
-5) fio \-\-server=1.2.3.4
+5) \fBfio \-\-server=1.2.3.4\fR
 
    Start a fio server, listening on IP 1.2.3.4 on the default port.
 
-6) fio \-\-server=sock:/tmp/fio.sock
+6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
 
    Start a fio server, listening on the local socket /tmp/fio.sock.
 
 When a server is running, you can connect to it from a client. The client
 is run with:
 
-fio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>
+\fBfio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>\fR
 
 where \-\-local-args are arguments that are local to the client where it is
 running, 'server' is the connect string, and \-\-remote-args and <job file(s)>
@@ -2033,12 +2351,12 @@ are sent to the server. The 'server' string follows the same format as it
 does on the server side, to allow IP/hostname/socket and port strings.
 You can connect to multiple clients as well, to do that you could run:
 
-fio \-\-client=server2 \-\-client=server2 <job file(s)>
+\fBfio \-\-client=server2 \-\-client=server2 <job file(s)>\fR
 
 If the job file is located on the fio server, then you can tell the server
 to load a local file as well. This is done by using \-\-remote-config:
 
-fio \-\-client=server \-\-remote-config /path/to/file.fio
+\fBfio \-\-client=server \-\-remote-config /path/to/file.fio\fR
 
 Then fio will open this local (to the server) job file instead
 of being passed one from the client.
@@ -2053,7 +2371,7 @@ host2.your.dns.domain
 
 The fio command would then be:
 
-fio \-\-client=host.list <job file>
+\fBfio \-\-client=host.list <job file>\fR
 
 In this mode, you cannot input server-specific parameters or job files, and all
 servers receive the same job file.
diff --git a/fio.c b/fio.c
index bd3e260..69014dd 100644 (file)
--- a/fio.c
+++ b/fio.c
@@ -39,9 +39,12 @@ int main(int argc, char *argv[], char *envp[])
 #error "No available clock source!"
 #endif
 
-       if (parse_options(argc, argv))
+       if (fio_server_create_sk_key())
                goto done;
 
+       if (parse_options(argc, argv))
+               goto done_key;
+
        /*
         * line buffer stdout to avoid output lines from multiple
         * threads getting mixed
@@ -54,11 +57,13 @@ int main(int argc, char *argv[], char *envp[])
                set_genesis_time();
 
                if (fio_start_all_clients())
-                       goto done;
+                       goto done_key;
                ret = fio_handle_clients(&fio_client_ops);
        } else
                ret = fio_backend(NULL);
 
+done_key:
+       fio_server_destroy_sk_key();
 done:
        deinitialize_fio();
        return ret;
diff --git a/fio.h b/fio.h
index b71a486..7e6311c 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -79,6 +79,7 @@ enum {
        TD_F_NEED_LOCK          = 1U << 11,
        TD_F_CHILD              = 1U << 12,
        TD_F_NO_PROGRESS        = 1U << 13,
+       TD_F_REGROW_LOGS        = 1U << 14,
 };
 
 enum {
@@ -96,6 +97,7 @@ enum {
        FIO_RAND_START_DELAY,
        FIO_DEDUPE_OFF,
        FIO_RAND_POISSON_OFF,
+       FIO_RAND_ZONE_OFF,
        FIO_RAND_NR_OFFS,
 };
 
@@ -115,6 +117,11 @@ struct sk_out;
 void sk_out_assign(struct sk_out *);
 void sk_out_drop(void);
 
+struct zone_split_index {
+       uint8_t size_perc;
+       uint8_t size_perc_prev;
+};
+
 /*
  * This describes a single thread/process executing a fio job.
  */
@@ -148,13 +155,6 @@ struct thread_data {
        uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
        struct timeval iops_sample_time;
 
-       /*
-        * Tracks the last iodepth number of completed writes, if data
-        * verification is enabled
-        */
-       uint64_t *last_write_comp;
-       unsigned int last_write_idx;
-
        volatile int update_rusage;
        struct fio_mutex *rusage_sem;
        struct rusage ru_start;
@@ -171,6 +171,15 @@ struct thread_data {
                unsigned int next_file;
                struct frand_state next_file_state;
        };
+       union {
+               struct zipf_state next_file_zipf;
+               struct gauss_state next_file_gauss;
+       };
+       union {
+               double zipf_theta;
+               double pareto_h;
+               double gauss_dev;
+       };
        int error;
        int sig;
        int done;
@@ -200,6 +209,9 @@ struct thread_data {
        struct frand_state buf_state;
        struct frand_state buf_state_prev;
        struct frand_state dedupe_state;
+       struct frand_state zone_state;
+
+       struct zone_split_index **zone_state_index;
 
        unsigned int verify_batch;
        unsigned int trim_batch;
@@ -443,8 +455,6 @@ extern int nr_clients;
 extern int log_syslog;
 extern int status_interval;
 extern const char fio_version_string[];
-extern int helper_do_stat;
-extern pthread_cond_t helper_cond;
 extern char *trigger_file;
 extern char *trigger_cmd;
 extern char *trigger_remote_cmd;
@@ -712,6 +722,7 @@ enum {
        FIO_RAND_DIST_ZIPF,
        FIO_RAND_DIST_PARETO,
        FIO_RAND_DIST_GAUSS,
+       FIO_RAND_DIST_ZONED,
 };
 
 #define FIO_DEF_ZIPF           1.1
index 79f324a..cb271c2 100644 (file)
@@ -17,5 +17,6 @@ extern void set_genesis_time(void);
 extern int ramp_time_over(struct thread_data *);
 extern int in_ramp_time(struct thread_data *);
 extern void fio_time_init(void);
+extern void timeval_add_msec(struct timeval *, unsigned int);
 
 #endif
diff --git a/flist.h b/flist.h
index d453e79..b4fe6e6 100644 (file)
--- a/flist.h
+++ b/flist.h
@@ -177,6 +177,9 @@ static inline void flist_splice_init(struct flist_head *list,
 #define flist_first_entry(ptr, type, member) \
        flist_entry((ptr)->next, type, member)
 
+#define flist_last_entry(ptr, type, member) \
+       flist_entry((ptr)->prev, type, member)
+
 /**
  * flist_for_each      -       iterate over a list
  * @pos:       the &struct flist_head to use as a loop counter.
diff --git a/gfio.c b/gfio.c
index 42d536e..e3bcbdf 100644 (file)
--- a/gfio.c
+++ b/gfio.c
@@ -449,7 +449,7 @@ static int send_job_file(struct gui_entry *ge)
                free(gco);
        }
 
-       ret = fio_client_send_ini(gc->client, ge->job_file, 0);
+       ret = fio_client_send_ini(gc->client, ge->job_file, false);
        if (!ret)
                return 0;
 
diff --git a/hash.h b/hash.h
index 02b0614..1d7608b 100644 (file)
--- a/hash.h
+++ b/hash.h
 #error Define GOLDEN_RATIO_PRIME for your wordsize.
 #endif
 
-#define GR_PRIME_64    0x9e37fffffffc0001ULL
+/*
+ * The above primes are actively bad for hashing, since they are
+ * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
+ * real problems. Besides, the "prime" part is pointless for the
+ * multiplicative hash.
+ *
+ * Although a random odd number will do, it turns out that the golden
+ * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
+ * properties.
+ *
+ * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
+ * (See Knuth vol 3, section 6.4, exercise 9.)
+ */
+#define GOLDEN_RATIO_32 0x61C88647
+#define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
 static inline unsigned long __hash_long(unsigned long val)
 {
        unsigned long hash = val;
 
 #if BITS_PER_LONG == 64
+       hash *= GOLDEN_RATIO_64;
+#else
        /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
        unsigned long n = hash;
        n <<= 18;
@@ -49,9 +65,6 @@ static inline unsigned long __hash_long(unsigned long val)
        hash += n;
        n <<= 2;
        hash += n;
-#else
-       /* On some cpus multiply is faster, on others gcc will do shifts */
-       hash *= GOLDEN_RATIO_PRIME;
 #endif
 
        return hash;
@@ -65,7 +78,7 @@ static inline unsigned long hash_long(unsigned long val, unsigned int bits)
 
 static inline uint64_t __hash_u64(uint64_t val)
 {
-       return val * GR_PRIME_64;
+       return val * GOLDEN_RATIO_64;
 }
        
 static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
@@ -77,7 +90,7 @@ static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
  * Bob Jenkins jhash
  */
 
-#define JHASH_INITVAL  GOLDEN_RATIO_PRIME
+#define JHASH_INITVAL  GOLDEN_RATIO_32
 
 static inline uint32_t rol32(uint32_t word, uint32_t shift)
 {
diff --git a/helper_thread.c b/helper_thread.c
new file mode 100644 (file)
index 0000000..1befabf
--- /dev/null
@@ -0,0 +1,167 @@
+#include "fio.h"
+#include "smalloc.h"
+#include "helper_thread.h"
+
+static struct helper_data {
+       volatile int exit;
+       volatile int reset;
+       volatile int do_stat;
+       struct sk_out *sk_out;
+       pthread_t thread;
+       pthread_mutex_t lock;
+       pthread_cond_t cond;
+       struct fio_mutex *startup_mutex;
+} *helper_data;
+
+void helper_thread_destroy(void)
+{
+       pthread_cond_destroy(&helper_data->cond);
+       pthread_mutex_destroy(&helper_data->lock);
+       sfree(helper_data);
+}
+
+void helper_reset(void)
+{
+       if (!helper_data)
+               return;
+
+       pthread_mutex_lock(&helper_data->lock);
+
+       if (!helper_data->reset) {
+               helper_data->reset = 1;
+               pthread_cond_signal(&helper_data->cond);
+       }
+
+       pthread_mutex_unlock(&helper_data->lock);
+}
+
+void helper_do_stat(void)
+{
+       if (!helper_data)
+               return;
+
+       pthread_mutex_lock(&helper_data->lock);
+       helper_data->do_stat = 1;
+       pthread_cond_signal(&helper_data->cond);
+       pthread_mutex_unlock(&helper_data->lock);
+}
+
+bool helper_should_exit(void)
+{
+       if (!helper_data)
+               return true;
+
+       return helper_data->exit;
+}
+
+void helper_thread_exit(void)
+{
+       void *ret;
+
+       pthread_mutex_lock(&helper_data->lock);
+       helper_data->exit = 1;
+       pthread_cond_signal(&helper_data->cond);
+       pthread_mutex_unlock(&helper_data->lock);
+
+       pthread_join(helper_data->thread, &ret);
+}
+
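+/*
+ * Main loop of the background helper: wakes roughly every DISK_UTIL_MSEC to
+ * update disk utilization via update_io_ticks(), asks calc_log_samples() when
+ * the next averaged log sample is due, prints thread status, and emits run
+ * stats when helper_do_stat() has signaled a request.
+ */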
+static void *helper_thread_main(void *data)
+{
+       struct helper_data *hd = data;
+       unsigned int msec_to_next_event, next_log;
+       struct timeval tv, last_du;
+       int ret = 0;
+
+       sk_out_assign(hd->sk_out);
+
+       gettimeofday(&tv, NULL);
+       memcpy(&last_du, &tv, sizeof(tv));
+
+       fio_mutex_up(hd->startup_mutex);
+
+       msec_to_next_event = DISK_UTIL_MSEC;
+       while (!ret && !hd->exit) {
+               struct timespec ts;
+               struct timeval now;
+               uint64_t since_du;
+
+               timeval_add_msec(&tv, msec_to_next_event);
+               ts.tv_sec = tv.tv_sec;
+               ts.tv_nsec = tv.tv_usec * 1000;
+
+               pthread_mutex_lock(&hd->lock);
+               pthread_cond_timedwait(&hd->cond, &hd->lock, &ts);
+
+               gettimeofday(&now, NULL);
+
+               if (hd->reset) {
+                       memcpy(&tv, &now, sizeof(tv));
+                       memcpy(&last_du, &now, sizeof(last_du));
+                       hd->reset = 0;
+               }
+
+               pthread_mutex_unlock(&hd->lock);
+
+               since_du = mtime_since(&last_du, &now);
+               if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) {
+                       ret = update_io_ticks();
+                       timeval_add_msec(&last_du, DISK_UTIL_MSEC);
+                       msec_to_next_event = DISK_UTIL_MSEC;
+                       if (since_du >= DISK_UTIL_MSEC)
+                               msec_to_next_event -= (since_du - DISK_UTIL_MSEC);
+               } else {
+                       if (since_du >= DISK_UTIL_MSEC)
+                               msec_to_next_event = DISK_UTIL_MSEC - (DISK_UTIL_MSEC - since_du);
+                       else
+                               msec_to_next_event = DISK_UTIL_MSEC;
+               }
+
+               if (hd->do_stat) {
+                       hd->do_stat = 0;
+                       __show_running_run_stats();
+               }
+
+               next_log = calc_log_samples();
+               if (!next_log)
+                       next_log = DISK_UTIL_MSEC;
+
+               msec_to_next_event = min(next_log, msec_to_next_event);
+
+               if (!is_backend)
+                       print_thread_status();
+       }
+
+       fio_writeout_logs(false);
+
+       sk_out_drop();
+       return NULL;
+}
+
+int helper_thread_create(struct fio_mutex *startup_mutex, struct sk_out *sk_out)
+{
+       struct helper_data *hd;
+       int ret;
+
+       hd = smalloc(sizeof(*hd));
+
+       setup_disk_util();
+
+       hd->sk_out = sk_out;
+       pthread_cond_init(&hd->cond, NULL);
+       pthread_mutex_init(&hd->lock, NULL);
+       hd->startup_mutex = startup_mutex;
+
+       ret = pthread_create(&hd->thread, NULL, helper_thread_main, hd);
+       if (ret) {
+               log_err("Can't create helper thread: %s\n", strerror(ret));
+               return 1;
+       }
+
+       helper_data = hd;
+
+       dprint(FD_MUTEX, "wait on startup_mutex\n");
+       fio_mutex_down(startup_mutex);
+       dprint(FD_MUTEX, "done waiting on startup_mutex\n");
+       return 0;
+}
diff --git a/helper_thread.h b/helper_thread.h
new file mode 100644 (file)
index 0000000..78933b1
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef FIO_HELPER_THREAD_H
+#define FIO_HELPER_THREAD_H
+
+extern void helper_reset(void);
+extern void helper_do_stat(void);
+extern bool helper_should_exit(void);
+extern void helper_thread_destroy(void);
+extern void helper_thread_exit(void);
+extern int helper_thread_create(struct fio_mutex *, struct sk_out *);
+
+#endif
diff --git a/init.c b/init.c
index 77cf9f2..7166ea7 100644 (file)
--- a/init.c
+++ b/init.c
@@ -493,7 +493,7 @@ static struct thread_data *get_new_job(int global, struct thread_data *parent,
        if (jobname)
                td->o.name = strdup(jobname);
 
-       if (!parent->o.group_reporting)
+       if (!parent->o.group_reporting || parent == &def_thread)
                stat_number++;
 
        set_cmd_options(td);
@@ -919,19 +919,42 @@ static int exists_and_not_file(const char *filename)
        return 1;
 }
 
-static void td_fill_rand_seeds_internal(struct thread_data *td, int use64)
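+/*
+ * Seed the skewed (zipf/pareto/gauss) distribution used to pick the next
+ * file when file_service_type is non-uniform; the range is nr_files shifted
+ * left by FIO_FSERVICE_SHIFT for finer resolution.
+ */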
+static void init_rand_file_service(struct thread_data *td)
 {
+       unsigned long nranges = td->o.nr_files << FIO_FSERVICE_SHIFT;
+       const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF];
+
+       if (td->o.file_service_type == FIO_FSERVICE_ZIPF) {
+               zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed);
+               zipf_disable_hash(&td->next_file_zipf);
+       } else if (td->o.file_service_type == FIO_FSERVICE_PARETO) {
+               pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed);
+               zipf_disable_hash(&td->next_file_zipf);
+       } else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) {
+               gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed);
+               gauss_disable_hash(&td->next_file_gauss);
+       }
+}
+
+static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
+{
+       int i;
+
        init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF], use64);
        init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF], use64);
-       init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], use64);
+       init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false);
 
        if (td->o.file_service_type == FIO_FSERVICE_RANDOM)
                init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF], use64);
+       else if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)
+               init_rand_file_service(td);
 
        init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64);
        init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64);
        init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64);
        init_rand_seed(&td->poisson_state, td->rand_seeds[FIO_RAND_POISSON_OFF], 0);
+       init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
+       init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
 
        if (!td_random(td))
                return;
@@ -940,14 +963,17 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, int use64)
                td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number;
 
        init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64);
-       init_rand_seed(&td->seq_rand_state[DDIR_READ], td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], use64);
-       init_rand_seed(&td->seq_rand_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF], use64);
-       init_rand_seed(&td->seq_rand_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF], use64);
+
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               struct frand_state *s = &td->seq_rand_state[i];
+
+               init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false);
+       }
 }
 
 void td_fill_rand_seeds(struct thread_data *td)
 {
-       int use64;
+       bool use64;
 
        if (td->o.allrand_repeatable) {
                unsigned int i;
@@ -966,8 +992,6 @@ void td_fill_rand_seeds(struct thread_data *td)
 
        init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64);
        frand_copy(&td->buf_state_prev, &td->buf_state);
-
-       init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], use64);
 }
 
 /*
@@ -1074,7 +1098,7 @@ static int setup_random_seeds(struct thread_data *td)
                seed *= 0x9e370001UL;
 
        for (i = 0; i < FIO_RAND_NR_OFFS; i++) {
-               td->rand_seeds[i] = seed;
+               td->rand_seeds[i] = seed * td->thread_number + i;
                seed *= 0x9e370001UL;
        }
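A standalone model of the per-thread seed fill the hunk above switches to: the base seed is still scrambled with the 0x9e370001UL multiplier, but each slot is additionally offset by the thread number and slot index, so jobs that share one base seed no longer end up with identical seed tables. Everything except that pattern (the array size, the driver, the output) is illustrative, not fio code.

#include <stdio.h>
#include <stdint.h>

#define NR_OFFS 16      /* stand-in for FIO_RAND_NR_OFFS */

/* Fill a per-thread seed table: scramble the base seed per slot and
 * perturb it by thread number + slot index so threads diverge. */
static void fill_seeds(uint64_t seed, unsigned int thread_number,
                       uint64_t *seeds)
{
        int i;

        for (i = 0; i < NR_OFFS; i++) {
                seeds[i] = seed * thread_number + i;
                seed *= 0x9e370001UL;
        }
}

int main(void)
{
        uint64_t t1[NR_OFFS], t2[NR_OFFS];

        fill_seeds(0x1234, 1, t1);
        fill_seeds(0x1234, 2, t2);
        printf("slot 0: thread 1 -> %#llx, thread 2 -> %#llx\n",
               (unsigned long long) t1[0], (unsigned long long) t2[0]);
        return 0;
}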
 
@@ -1411,6 +1435,11 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                };
                const char *suf;
 
+               if (fio_option_is_set(o, bw_avg_time))
+                       p.avg_msec = min(o->log_avg_msec, o->bw_avg_time);
+               else
+                       o->bw_avg_time = p.avg_msec;
+
                if (p.log_gz_store)
                        suf = "log.fz";
                else
@@ -1431,6 +1460,11 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                };
                const char *suf;
 
+               if (fio_option_is_set(o, iops_avg_time))
+                       p.avg_msec = min(o->log_avg_msec, o->iops_avg_time);
+               else
+                       o->iops_avg_time = p.avg_msec;
+
                if (p.log_gz_store)
                        suf = "log.fz";
                else
@@ -1773,15 +1807,48 @@ int __parse_jobs_ini(struct thread_data *td,
                        strip_blank_end(p);
 
                        if (!strncmp(p, "include", strlen("include"))) {
-                               char *filename = p + strlen("include") + 1;
+                               char *filename = p + strlen("include") + 1,
+                                       *ts, *full_fn = NULL;
+
+                               /*
+                                * Allow for the include filename
+                                * specification to be relative.
+                                */
+                               if (access(filename, F_OK) &&
+                                   (ts = strrchr(file, '/'))) {
+                                       int len = ts - file +
+                                               strlen(filename) + 2;
+
+                                       if (!(full_fn = calloc(1, len))) {
+                                               ret = ENOMEM;
+                                               break;
+                                       }
+
+                                       strncpy(full_fn,
+                                               file, (ts - file) + 1);
+                                       strncpy(full_fn + (ts - file) + 1,
+                                               filename, strlen(filename));
+                                       full_fn[len - 1] = 0;
+                                       filename = full_fn;
+                               }
+
+                               ret = __parse_jobs_ini(td, filename, is_buf,
+                                                      stonewall_flag, type, 1,
+                                                      name, &opts,
+                                                      &alloc_opts, &num_opts);
 
-                               if ((ret = __parse_jobs_ini(td, filename,
-                                               is_buf, stonewall_flag, type, 1,
-                                               name, &opts, &alloc_opts, &num_opts))) {
-                                       log_err("Error %d while parsing include file %s\n",
+                               if (ret) {
+                                       log_err("Error %d while parsing "
+                                               "include file %s\n",
                                                ret, filename);
-                                       break;
                                }
+
+                               if (full_fn)
+                                       free(full_fn);
+
+                               if (ret)
+                                       break;
+
                                continue;
                        }
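The relative-include handling above prefixes the directory of the including job file when the included name is not directly accessible. A small standalone model of that string assembly; the helper name and the main() driver are illustrative only, while the length and copy arithmetic mirror the hunk.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "dir-of-parent/inc": copy everything up to and including the
 * last '/' of the parent path, then append the included filename. */
static char *resolve_include(const char *parent, const char *inc)
{
        const char *ts = strrchr(parent, '/');
        char *full;
        int len;

        if (!ts)
                return strdup(inc);     /* parent has no directory part */

        len = (ts - parent) + strlen(inc) + 2;
        full = calloc(1, len);
        if (!full)
                return NULL;

        memcpy(full, parent, (ts - parent) + 1);        /* "dir/" */
        strcpy(full + (ts - parent) + 1, inc);          /* append "inc" */
        return full;
}

int main(void)
{
        char *p = resolve_include("jobs/nvme/base.fio", "common.fio");

        printf("%s\n", p ? p : "(alloc failed)");       /* jobs/nvme/common.fio */
        free(p);
        return 0;
}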
 
@@ -1860,6 +1927,7 @@ static int fill_def_thread(void)
 
 static void show_debug_categories(void)
 {
+#ifdef FIO_INC_DEBUG
        struct debug_level *dl = &debug_levels[0];
        int curlen, first = 1;
 
@@ -1885,6 +1953,7 @@ static void show_debug_categories(void)
                first = 0;
        }
        printf("\n");
+#endif
 }
 
 static void usage(const char *name)
@@ -2512,14 +2581,14 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
                                    !strncmp(argv[optind], "-", 1))
                                        break;
 
-                               if (fio_client_add_ini_file(cur_client, argv[optind], 0))
+                               if (fio_client_add_ini_file(cur_client, argv[optind], false))
                                        break;
                                optind++;
                        }
                        break;
                case 'R':
                        did_arg = 1;
-                       if (fio_client_add_ini_file(cur_client, optarg, 1)) {
+                       if (fio_client_add_ini_file(cur_client, optarg, true)) {
                                do_exit++;
                                exit_val = 1;
                        }
index e5eff68..763e826 100644 (file)
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -16,8 +16,9 @@ enum fio_ddir {
 
 static inline const char *io_ddir_name(enum fio_ddir ddir)
 {
-       const char *name[] = { "read", "write", "trim", "sync", "datasync",
-                               "sync_file_range", "write", };
+       static const char *name[] = { "read", "write", "trim", "sync",
+                                       "datasync", "sync_file_range",
+                                       "wait", };
 
        if (ddir < DDIR_LAST)
                return name[ddir];
diff --git a/io_u.c b/io_u.c
index 9628d5e..c0790b2 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -86,24 +86,19 @@ struct rand_off {
 };
 
 static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
-                                 enum fio_ddir ddir, uint64_t *b)
+                                 enum fio_ddir ddir, uint64_t *b,
+                                 uint64_t lastb)
 {
        uint64_t r;
 
        if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE ||
            td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) {
-               uint64_t frand_max, lastb;
 
-               lastb = last_block(td, f, ddir);
-               if (!lastb)
-                       return 1;
-
-               frand_max = rand_max(&td->random_state);
                r = __rand(&td->random_state);
 
                dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r);
 
-               *b = lastb * (r / ((uint64_t) frand_max + 1.0));
+               *b = lastb * (r / (rand_max(&td->random_state) + 1.0));
        } else {
                uint64_t off = 0;
 
@@ -161,6 +156,70 @@ static int __get_next_rand_offset_gauss(struct thread_data *td,
        return 0;
 }
 
+static int __get_next_rand_offset_zoned(struct thread_data *td,
+                                       struct fio_file *f, enum fio_ddir ddir,
+                                       uint64_t *b)
+{
+       unsigned int v, send, stotal;
+       uint64_t offset, lastb;
+       static int warned;
+       struct zone_split_index *zsi;
+
+       lastb = last_block(td, f, ddir);
+       if (!lastb)
+               return 1;
+
+       if (!td->o.zone_split_nr[ddir]) {
+bail:
+               return __get_next_rand_offset(td, f, ddir, b, lastb);
+       }
+
+       /*
+        * Generate a value, v, between 1 and 100, both inclusive
+        */
+       v = rand32_between(&td->zone_state, 1, 100);
+
+       zsi = &td->zone_state_index[ddir][v - 1];
+       stotal = zsi->size_perc_prev;
+       send = zsi->size_perc;
+
+       /*
+        * Should never happen
+        */
+       if (send == -1U) {
+               if (!warned) {
+                       log_err("fio: bug in zoned generation\n");
+                       warned = 1;
+               }
+               goto bail;
+       }
+
+       /*
+        * 'send' is some percentage below or equal to 100 that
+        * marks the end of the current IO range. 'stotal' marks
+        * the start, in percent.
+        */
+       if (stotal)
+               offset = stotal * lastb / 100ULL;
+       else
+               offset = 0;
+
+       lastb = lastb * (send - stotal) / 100ULL;
+
+       /*
+        * Generate an index within the rescaled lastb, i.e. the (send - stotal)% span
+        */
+       if (__get_next_rand_offset(td, f, ddir, b, lastb) == 1)
+               return 1;
+
+       /*
+        * Add our start offset, if any
+        */
+       if (offset)
+               *b += offset;
+
+       return 0;
+}
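A worked example of the zoned math above, assuming zone_state_index[ddir][v - 1] has been populated so that each access-percentage bucket records its target size range: take lastb = 1,000,000 blocks and a spec that sends 80% of accesses to the first 20% of the blocks. Buckets v = 1..80 then carry size_perc_prev = 0 / size_perc = 20, and buckets v = 81..100 carry 20 / 100. Drawing v = 90 gives stotal = 20 and send = 100, so offset = 20 * 1,000,000 / 100 = 200,000 and lastb is rescaled to 1,000,000 * (100 - 20) / 100 = 800,000; the nested __get_next_rand_offset() call then picks a block in [0, 800,000) and the final index lands in [200,000, 1,000,000), i.e. the colder 80% of the device, which is hit only 20% of the time.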
 
 static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b)
 {
@@ -173,14 +232,22 @@ static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b)
 static int get_off_from_method(struct thread_data *td, struct fio_file *f,
                               enum fio_ddir ddir, uint64_t *b)
 {
-       if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
-               return __get_next_rand_offset(td, f, ddir, b);
-       else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+       if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) {
+               uint64_t lastb;
+
+               lastb = last_block(td, f, ddir);
+               if (!lastb)
+                       return 1;
+
+               return __get_next_rand_offset(td, f, ddir, b, lastb);
+       } else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
                return __get_next_rand_offset_zipf(td, f, ddir, b);
        else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
                return __get_next_rand_offset_pareto(td, f, ddir, b);
        else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
                return __get_next_rand_offset_gauss(td, f, ddir, b);
+       else if (td->o.random_distribution == FIO_RAND_DIST_ZONED)
+               return __get_next_rand_offset_zoned(td, f, ddir, b);
 
        log_err("fio: unknown random distribution: %d\n", td->o.random_distribution);
        return 1;
@@ -207,16 +274,12 @@ static inline bool should_sort_io(struct thread_data *td)
 
 static bool should_do_random(struct thread_data *td, enum fio_ddir ddir)
 {
-       uint64_t frand_max;
        unsigned int v;
-       unsigned long r;
 
        if (td->o.perc_rand[ddir] == 100)
                return true;
 
-       frand_max = rand_max(&td->seq_rand_state[ddir]);
-       r = __rand(&td->seq_rand_state[ddir]);
-       v = 1 + (int) (100.0 * (r / (frand_max + 1.0)));
+       v = rand32_between(&td->seq_rand_state[ddir], 1, 100);
 
        return v <= td->o.perc_rand[ddir];
 }
@@ -265,7 +328,8 @@ static int get_next_rand_block(struct thread_data *td, struct fio_file *f,
        if (!get_next_rand_offset(td, f, ddir, b))
                return 0;
 
-       if (td->o.time_based) {
+       if (td->o.time_based ||
+           (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)) {
                fio_file_reset(td, f);
                if (!get_next_rand_offset(td, f, ddir, b))
                        return 0;
@@ -285,8 +349,15 @@ static int get_next_seq_offset(struct thread_data *td, struct fio_file *f,
        assert(ddir_rw(ddir));
 
        if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
-           o->time_based)
-               f->last_pos[ddir] = f->last_pos[ddir] - f->io_size;
+           o->time_based) {
+               struct thread_options *o = &td->o;
+               uint64_t io_size = f->io_size + (f->io_size % o->min_bs[ddir]);
+
+               if (io_size > f->last_pos[ddir])
+                       f->last_pos[ddir] = 0;
+               else
+                       f->last_pos[ddir] = f->last_pos[ddir] - io_size;
+       }
 
        if (f->last_pos[ddir] < f->real_file_size) {
                uint64_t pos;
@@ -301,10 +372,15 @@ static int get_next_seq_offset(struct thread_data *td, struct fio_file *f,
                        /*
                         * If we reach beyond the end of the file
                         * with holed IO, wrap around to the
-                        * beginning again.
+                        * beginning again. If we're doing backwards IO,
+                        * wrap to the end.
                         */
-                       if (pos >= f->real_file_size)
-                               pos = f->file_offset;
+                       if (pos >= f->real_file_size) {
+                               if (o->ddir_seq_add > 0)
+                                       pos = f->file_offset;
+                               else
+                                       pos = f->real_file_size + o->ddir_seq_add;
+                       }
                }
 
                *offset = pos;
@@ -483,7 +559,7 @@ static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u,
 
                                buflen = bsp->bs;
                                perc += bsp->perc;
-                               if ((r <= ((frand_max / 100L) * perc)) &&
+                               if ((r * 100UL <= frand_max * perc) &&
                                    io_u_fits(td, io_u, buflen))
                                        break;
                        }
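The rewritten comparison above fixes a truncation in the old form: with the 32-bit generator, frand_max is 4294967295, so frand_max / 100 truncates to 42949672 and (frand_max / 100L) * perc tops out at 4294967200 even for perc = 100, leaving the largest 95 possible values of r unable to match any split entry. Cross-multiplying as r * 100UL <= frand_max * perc avoids that truncation, so a bucket whose cumulative percentage is 100 now accepts every possible r.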
@@ -529,12 +605,9 @@ static void set_rwmix_bytes(struct thread_data *td)
 
 static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
 {
-       uint64_t frand_max = rand_max(&td->rwmix_state);
        unsigned int v;
-       unsigned long r;
 
-       r = __rand(&td->rwmix_state);
-       v = 1 + (int) (100.0 * (r / (frand_max + 1.0)));
+       v = rand32_between(&td->rwmix_state, 1, 100);
 
        if (v <= td->o.rwmix[DDIR_READ])
                return DDIR_READ;
@@ -998,6 +1071,34 @@ static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
                io_u_mark_lat_msec(td, usec / 1000);
 }
 
+static unsigned int __get_next_fileno_rand(struct thread_data *td)
+{
+       unsigned long fileno;
+
+       if (td->o.file_service_type == FIO_FSERVICE_RANDOM) {
+               uint64_t frand_max = rand_max(&td->next_file_state);
+               unsigned long r;
+
+               r = __rand(&td->next_file_state);
+               return (unsigned int) ((double) td->o.nr_files
+                               * (r / (frand_max + 1.0)));
+       }
+
+       if (td->o.file_service_type == FIO_FSERVICE_ZIPF)
+               fileno = zipf_next(&td->next_file_zipf);
+       else if (td->o.file_service_type == FIO_FSERVICE_PARETO)
+               fileno = pareto_next(&td->next_file_zipf);
+       else if (td->o.file_service_type == FIO_FSERVICE_GAUSS)
+               fileno = gauss_next(&td->next_file_gauss);
+       else {
+               log_err("fio: bad file service type: %d\n", td->o.file_service_type);
+               assert(0);
+               return 0;
+       }
+
+       return fileno >> FIO_FSERVICE_SHIFT;
+}
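A note on the shift between this function and init_rand_file_service() in init.c above: the zipf/pareto/gauss states are initialised over nr_files << FIO_FSERVICE_SHIFT buckets and the drawn value is shifted back down here, so each file is backed by a block of consecutive buckets and the skew of the distribution is computed at sub-file granularity before being folded onto file indices. For illustration only (the shift constant is defined elsewhere in fio.h), if the shift were 10 and a job had 4 files, the distribution would span 4096 buckets and a draw of 1500 would select file 1500 >> 10 = 1.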
+
 /*
  * Get next file to service by choosing one at random
  */
@@ -1005,17 +1106,13 @@ static struct fio_file *get_next_file_rand(struct thread_data *td,
                                           enum fio_file_flags goodf,
                                           enum fio_file_flags badf)
 {
-       uint64_t frand_max = rand_max(&td->next_file_state);
        struct fio_file *f;
        int fno;
 
        do {
                int opened = 0;
-               unsigned long r;
 
-               r = __rand(&td->next_file_state);
-               fno = (unsigned int) ((double) td->o.nr_files
-                               * (r / (frand_max + 1.0)));
+               fno = __get_next_fileno_rand(td);
 
                f = td->files[fno];
                if (fio_file_done(f))
@@ -1168,10 +1265,14 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u)
                put_file_log(td, f);
                td_io_close_file(td, f);
                io_u->file = NULL;
-               fio_file_set_done(f);
-               td->nr_done_files++;
-               dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
+               if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)
+                       fio_file_reset(td, f);
+               else {
+                       fio_file_set_done(f);
+                       td->nr_done_files++;
+                       dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
                                        td->nr_done_files, td->o.nr_files);
+               }
        } while (1);
 
        return 0;
@@ -1600,7 +1701,7 @@ void io_u_log_error(struct thread_data *td, struct io_u *io_u)
 {
        __io_u_log_error(td, io_u);
        if (td->parent)
-               __io_u_log_error(td, io_u);
+               __io_u_log_error(td->parent, io_u);
 }
 
 static inline bool gtod_reduce(struct thread_data *td)
@@ -1643,16 +1744,18 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
                }
        }
 
-       if (!td->o.disable_clat) {
-               add_clat_sample(td, idx, lusec, bytes, io_u->offset);
-               io_u_mark_latency(td, lusec);
-       }
+       if (ddir_rw(idx)) {
+               if (!td->o.disable_clat) {
+                       add_clat_sample(td, idx, lusec, bytes, io_u->offset);
+                       io_u_mark_latency(td, lusec);
+               }
 
-       if (!td->o.disable_bw)
-               add_bw_sample(td, idx, bytes, &icd->time);
+               if (!td->o.disable_bw && per_unit_log(td->bw_log))
+                       add_bw_sample(td, io_u, bytes, lusec);
 
-       if (no_reduce)
-               add_iops_sample(td, idx, bytes, &icd->time);
+               if (no_reduce && per_unit_log(td->iops_log))
+                       add_iops_sample(td, io_u, bytes);
+       }
 
        if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) {
                uint32_t *info = io_u_block_info(td, io_u);
@@ -1668,6 +1771,28 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
        }
 }
 
+static void file_log_write_comp(const struct thread_data *td, struct fio_file *f,
+                               uint64_t offset, unsigned int bytes)
+{
+       int idx;
+
+       if (!f)
+               return;
+
+       if (f->first_write == -1ULL || offset < f->first_write)
+               f->first_write = offset;
+       if (f->last_write == -1ULL || ((offset + bytes) > f->last_write))
+               f->last_write = offset + bytes;
+
+       if (!f->last_write_comp)
+               return;
+
+       idx = f->last_write_idx++;
+       f->last_write_comp[idx] = offset;
+       if (f->last_write_idx == td->o.iodepth)
+               f->last_write_idx = 0;
+}
+
 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
                         struct io_completion_data *icd)
 {
@@ -1718,23 +1843,8 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
                if (!(io_u->flags & IO_U_F_VER_LIST))
                        td->this_io_bytes[ddir] += bytes;
 
-               if (ddir == DDIR_WRITE) {
-                       if (f) {
-                               if (f->first_write == -1ULL ||
-                                   io_u->offset < f->first_write)
-                                       f->first_write = io_u->offset;
-                               if (f->last_write == -1ULL ||
-                                   ((io_u->offset + bytes) > f->last_write))
-                                       f->last_write = io_u->offset + bytes;
-                       }
-                       if (td->last_write_comp) {
-                               int idx = td->last_write_idx++;
-
-                               td->last_write_comp[idx] = io_u->offset;
-                               if (td->last_write_idx == td->o.iodepth)
-                                       td->last_write_idx = 0;
-                       }
-               }
+               if (ddir == DDIR_WRITE)
+                       file_log_write_comp(td, f, io_u->offset, bytes);
 
                if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
                                           td->runstate == TD_VERIFYING))
@@ -1888,9 +1998,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
  */
 static struct frand_state *get_buf_state(struct thread_data *td)
 {
-       uint64_t frand_max;
        unsigned int v;
-       unsigned long r;
 
        if (!td->o.dedupe_percentage)
                return &td->buf_state;
@@ -1899,9 +2007,7 @@ static struct frand_state *get_buf_state(struct thread_data *td)
                return &td->buf_state;
        }
 
-       frand_max = rand_max(&td->dedupe_state);
-       r = __rand(&td->dedupe_state);
-       v = 1 + (int) (100.0 * (r / (frand_max + 1.0)));
+       v = rand32_between(&td->dedupe_state, 1, 100);
 
        if (v <= td->o.dedupe_percentage)
                return &td->buf_state_prev;
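For readers new to the dedupe path: get_buf_state() draws v in 1..100 from the dedicated dedupe_state (now always seeded as 32-bit, per the init.c hunk above) and, when v falls within dedupe_percentage, returns buf_state_prev instead of buf_state. As the frand_copy(&td->buf_state_prev, &td->buf_state) in td_fill_rand_seeds() suggests, that replays a previously used generator state, so the next buffer fill reproduces earlier contents and can be deduplicated against them.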
diff --git a/ioengine.h b/ioengine.h
index 6734c7b..161acf5 100644 (file)
--- a/ioengine.h
+++ b/ioengine.h
@@ -16,7 +16,7 @@
 #include <guasi.h>
 #endif
 
-#define FIO_IOOPS_VERSION      22
+#define FIO_IOOPS_VERSION      23
 
 enum {
        IO_U_F_FREE             = 1 << 0,
@@ -157,6 +157,8 @@ struct ioengine_ops {
        int (*unlink_file)(struct thread_data *, struct fio_file *);
        int (*get_file_size)(struct thread_data *, struct fio_file *);
        void (*terminate)(struct thread_data *);
+       int (*iomem_alloc)(struct thread_data *, size_t);
+       void (*iomem_free)(struct thread_data *);
        int (*io_u_init)(struct thread_data *, struct io_u *);
        void (*io_u_free)(struct thread_data *, struct io_u *);
        int option_struct_size;
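The two new hooks above give an engine control over IO buffer allocation; memory.c further down in this diff calls them in preference to the mem= backends and errors out if the user forced a conflicting mem option. A minimal sketch of the engine side, assuming it is built inside the fio tree; the engine name, the malloc backing and the assumption that the hook populates td->orig_buffer (mirroring what the malloc backend does) are all illustrative.

#include <stdlib.h>
#include "../fio.h"

/*
 * Sketch: satisfy allocate_io_mem()/free_io_mem() with plain malloc so
 * the hook shape is visible. A real engine would hand out device- or
 * library-managed memory here, which is the point of adding the hooks.
 */
static int example_iomem_alloc(struct thread_data *td, size_t total_mem)
{
        td->orig_buffer = malloc(total_mem);
        return td->orig_buffer == NULL;         /* non-zero means failure */
}

static void example_iomem_free(struct thread_data *td)
{
        free(td->orig_buffer);
        td->orig_buffer = NULL;
}

static int example_queue(struct thread_data *td, struct io_u *io_u)
{
        return FIO_Q_COMPLETED;                 /* pretend every IO completes inline */
}

static struct ioengine_ops example_ioengine = {
        .name           = "iomem-example",
        .version        = FIO_IOOPS_VERSION,
        .flags          = FIO_SYNCIO,           /* ->queue() alone passes check_engine_ops() */
        .queue          = example_queue,
        .iomem_alloc    = example_iomem_alloc,
        .iomem_free     = example_iomem_free,
};

static void fio_init register_iomem_example(void)
{
        register_ioengine(&example_ioengine);
}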
diff --git a/ioengines.c b/ioengines.c
index 9c5ac60..e2e7280 100644 (file)
--- a/ioengines.c
+++ b/ioengines.c
 
 static FLIST_HEAD(engine_list);
 
-static int check_engine_ops(struct ioengine_ops *ops)
+static bool check_engine_ops(struct ioengine_ops *ops)
 {
        if (ops->version != FIO_IOOPS_VERSION) {
                log_err("bad ioops version %d (want %d)\n", ops->version,
                                                        FIO_IOOPS_VERSION);
-               return 1;
+               return true;
        }
 
        if (!ops->queue) {
                log_err("%s: no queue handler\n", ops->name);
-               return 1;
+               return true;
        }
 
        /*
         * sync engines only need a ->queue()
         */
        if (ops->flags & FIO_SYNCIO)
-               return 0;
+               return false;
 
-       if (!ops->event) {
-               log_err("%s: no event handler\n", ops->name);
-               return 1;
-       }
-       if (!ops->getevents) {
-               log_err("%s: no getevents handler\n", ops->name);
-               return 1;
-       }
-       if (!ops->queue) {
-               log_err("%s: no queue handler\n", ops->name);
-               return 1;
+       if (!ops->event || !ops->getevents) {
+               log_err("%s: no event/getevents handler\n", ops->name);
+               return true;
        }
 
-       return 0;
+       return false;
 }
 
 void unregister_ioengine(struct ioengine_ops *ops)
@@ -346,10 +338,10 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
        } else if (ret == FIO_Q_QUEUED) {
                int r;
 
-               if (ddir_rw(io_u->ddir)) {
-                       td->io_u_queued++;
+               td->io_u_queued++;
+
+               if (ddir_rw(io_u->ddir))
                        td->ts.total_io_u[io_u->ddir]++;
-               }
 
                if (td->io_u_queued >= td->o.iodepth_batch) {
                        r = td_io_commit(td);
diff --git a/iolog.c b/iolog.c
index feda9ed..3723e0a 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -18,6 +18,9 @@
 #include "verify.h"
 #include "trim.h"
 #include "filelock.h"
+#include "smalloc.h"
+
+static int iolog_flush(struct io_log *log);
 
 static const char iolog_ver2[] = "fio version 2 iolog";
 
@@ -574,18 +577,25 @@ void setup_log(struct io_log **log, struct log_params *p,
 {
        struct io_log *l;
 
-       l = calloc(1, sizeof(*l));
-       l->nr_samples = 0;
-       l->max_samples = 1024;
+       l = scalloc(1, sizeof(*l));
+       INIT_FLIST_HEAD(&l->io_logs);
        l->log_type = p->log_type;
        l->log_offset = p->log_offset;
        l->log_gz = p->log_gz;
        l->log_gz_store = p->log_gz_store;
-       l->log = malloc(l->max_samples * log_entry_sz(l));
        l->avg_msec = p->avg_msec;
        l->filename = strdup(filename);
        l->td = p->td;
 
+       if (l->td && l->td->o.io_submit_mode != IO_MODE_OFFLOAD) {
+               struct io_logs *p;
+
+               p = calloc(1, sizeof(*l->pending));
+               p->max_samples = l->td->o.iodepth;
+               p->log = calloc(p->max_samples, log_entry_sz(l));
+               l->pending = p;
+       }
+
        if (l->log_offset)
                l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
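setup_log() above now threads samples through a list of fixed-size io_logs chunks (declared later in this diff in iolog.h), plus a small iodepth-sized "pending" buffer for the non-offload submit path. A standalone toy of the chunked part only, to show why the old grow-by-realloc sample array disappears: appending never moves samples already recorded and never needs one giant contiguous allocation. The list type, chunk size and field names are stand-ins, not fio's.

#include <stdio.h>
#include <stdlib.h>

#define CHUNK_ENTRIES 1024      /* stand-in for DEF_LOG_ENTRIES */

struct chunk {
        struct chunk *next;
        unsigned int nr, max;
        long *samples;
};

struct toy_log {
        struct chunk *head, *tail;
};

/* Append one sample, allocating a fresh fixed-size chunk when the tail
 * is full -- no realloc/copy of the samples gathered so far. */
static int toy_log_add(struct toy_log *log, long sample)
{
        struct chunk *c = log->tail;

        if (!c || c->nr == c->max) {
                c = calloc(1, sizeof(*c));
                if (!c)
                        return 1;
                c->max = CHUNK_ENTRIES;
                c->samples = calloc(c->max, sizeof(long));
                if (!c->samples) {
                        free(c);
                        return 1;
                }
                if (log->tail)
                        log->tail->next = c;
                else
                        log->head = c;
                log->tail = c;
        }

        c->samples[c->nr++] = sample;
        return 0;
}

int main(void)
{
        struct toy_log log = { NULL, NULL };
        struct chunk *c;
        unsigned int chunks = 0;
        long i;

        for (i = 0; i < 5000; i++)
                toy_log_add(&log, i);

        for (c = log.head; c; c = c->next)
                chunks++;

        printf("%u chunks for 5000 samples\n", chunks); /* 5 */
        return 0;
}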
 
@@ -629,9 +639,23 @@ static void clear_file_buffer(void *buf)
 
 void free_log(struct io_log *log)
 {
-       free(log->log);
+       while (!flist_empty(&log->io_logs)) {
+               struct io_logs *cur_log;
+
+               cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+               flist_del_init(&cur_log->list);
+               free(cur_log->log);
+       }
+
+       if (log->pending) {
+               free(log->pending->log);
+               free(log->pending);
+               log->pending = NULL;
+       }
+
+       free(log->pending);
        free(log->filename);
-       free(log);
+       sfree(log);
 }
 
 void flush_samples(FILE *f, void *samples, uint64_t sample_size)
@@ -672,14 +696,10 @@ void flush_samples(FILE *f, void *samples, uint64_t sample_size)
 
 struct iolog_flush_data {
        struct workqueue_work work;
-       pthread_mutex_t lock;
-       pthread_cond_t cv;
-       int wait;
-       volatile int done;
-       volatile int refs;
        struct io_log *log;
        void *samples;
-       uint64_t nr_samples;
+       uint32_t nr_samples;
+       bool free;
 };
 
 #define GZ_CHUNK       131072
@@ -706,6 +726,7 @@ static int z_stream_init(z_stream *stream, int gz_hdr)
 {
        int wbits = 15;
 
+       memset(stream, 0, sizeof(*stream));
        stream->zalloc = Z_NULL;
        stream->zfree = Z_NULL;
        stream->opaque = Z_NULL;
@@ -740,7 +761,8 @@ static void finish_chunk(z_stream *stream, FILE *f,
 
        ret = inflateEnd(stream);
        if (ret != Z_OK)
-               log_err("fio: failed to end log inflation (%d)\n", ret);
+               log_err("fio: failed to end log inflation seq %d (%d)\n",
+                               iter->seq, ret);
 
        flush_samples(f, iter->buf, iter->buf_used);
        free(iter->buf);
@@ -757,7 +779,7 @@ static size_t inflate_chunk(struct iolog_compress *ic, int gz_hdr, FILE *f,
 {
        size_t ret;
 
-       dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u",
+       dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u\n",
                                (unsigned long) ic->len, ic->seq);
 
        if (ic->seq != iter->seq) {
@@ -804,7 +826,7 @@ static size_t inflate_chunk(struct iolog_compress *ic, int gz_hdr, FILE *f,
 
        ret = (void *) stream->next_in - ic->buf;
 
-       dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) ret);
+       dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) iter->buf_size);
 
        return ret;
 }
@@ -960,7 +982,13 @@ void flush_log(struct io_log *log, int do_append)
 
        inflate_gz_chunks(log, f);
 
-       flush_samples(f, log->log, log->nr_samples * log_entry_sz(log));
+       while (!flist_empty(&log->io_logs)) {
+               struct io_logs *cur_log;
+
+               cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+               flist_del_init(&cur_log->list);
+               flush_samples(f, cur_log->log, cur_log->nr_samples * log_entry_sz(log));
+       }
 
        fclose(f);
        clear_file_buffer(buf);
@@ -969,7 +997,7 @@ void flush_log(struct io_log *log, int do_append)
 static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
 {
        if (td->flags & TD_F_COMPRESS_LOG)
-               iolog_flush(log, 1);
+               iolog_flush(log);
 
        if (trylock) {
                if (fio_trylock_file(log->filename))
@@ -1009,29 +1037,9 @@ size_t log_chunk_sizes(struct io_log *log)
 
 #ifdef CONFIG_ZLIB
 
-static void drop_data_unlock(struct iolog_flush_data *data)
-{
-       int refs;
-
-       refs = --data->refs;
-       pthread_mutex_unlock(&data->lock);
-
-       if (!refs) {
-               free(data);
-               pthread_mutex_destroy(&data->lock);
-               pthread_cond_destroy(&data->cv);
-       }
-}
-
-/*
- * Invoked from our compress helper thread, when logging would have exceeded
- * the specified memory limitation. Compresses the previously stored
- * entries.
- */
-static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
+static int gz_work(struct iolog_flush_data *data)
 {
-       struct iolog_flush_data *data;
-       struct iolog_compress *c;
+       struct iolog_compress *c = NULL;
        struct flist_head list;
        unsigned int seq;
        z_stream stream;
@@ -1040,8 +1048,7 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
 
        INIT_FLIST_HEAD(&list);
 
-       data = container_of(work, struct iolog_flush_data, work);
-
+       memset(&stream, 0, sizeof(stream));
        stream.zalloc = Z_NULL;
        stream.zfree = Z_NULL;
        stream.opaque = Z_NULL;
@@ -1049,7 +1056,7 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
        ret = deflateInit(&stream, Z_DEFAULT_COMPRESSION);
        if (ret != Z_OK) {
                log_err("fio: failed to init gz stream\n");
-               return 0;
+               goto err;
        }
 
        seq = ++data->log->chunk_seq;
@@ -1057,9 +1064,12 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
        stream.next_in = (void *) data->samples;
        stream.avail_in = data->nr_samples * log_entry_sz(data->log);
 
-       dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u\n",
-                               (unsigned long) stream.avail_in, seq);
+       dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u, log=%s\n",
+                               (unsigned long) stream.avail_in, seq,
+                               data->log->filename);
        do {
+               if (c)
+                       dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, c->len);
                c = get_new_chunk(seq);
                stream.avail_out = GZ_CHUNK;
                stream.next_out = c->buf;
@@ -1079,9 +1089,26 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
        stream.avail_out = GZ_CHUNK - c->len;
 
        ret = deflate(&stream, Z_FINISH);
-       if (ret == Z_STREAM_END)
-               c->len = GZ_CHUNK - stream.avail_out;
-       else {
+       if (ret < 0) {
+               /*
+                * Z_BUF_ERROR is special, it just means we need more
+                * output space. We'll handle that below. Treat any other
+                * error as fatal.
+                */
+               if (ret != Z_BUF_ERROR) {
+                       log_err("fio: deflate log (%d)\n", ret);
+                       flist_del(&c->list);
+                       free_chunk(c);
+                       goto err;
+               }
+       }
+
+       total -= c->len;
+       c->len = GZ_CHUNK - stream.avail_out;
+       total += c->len;
+       dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, c->len);
+
+       if (ret != Z_STREAM_END) {
                do {
                        c = get_new_chunk(seq);
                        stream.avail_out = GZ_CHUNK;
@@ -1090,6 +1117,7 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
                        c->len = GZ_CHUNK - stream.avail_out;
                        total += c->len;
                        flist_add_tail(&c->list, &list);
+                       dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, c->len);
                } while (ret != Z_STREAM_END);
        }
 
@@ -1109,13 +1137,7 @@ static int gz_work(struct submit_worker *sw, struct workqueue_work *work)
 
        ret = 0;
 done:
-       if (data->wait) {
-               pthread_mutex_lock(&data->lock);
-               data->done = 1;
-               pthread_cond_signal(&data->cv);
-
-               drop_data_unlock(data);
-       } else
+       if (data->free)
                free(data);
        return ret;
 err:
@@ -1128,6 +1150,16 @@ err:
        goto done;
 }
 
+/*
+ * Invoked from our compress helper thread, when logging would have exceeded
+ * the specified memory limitation. Compresses the previously stored
+ * entries.
+ */
+static int gz_work_async(struct submit_worker *sw, struct workqueue_work *work)
+{
+       return gz_work(container_of(work, struct iolog_flush_data, work));
+}
+
 static int gz_init_worker(struct submit_worker *sw)
 {
        struct thread_data *td = sw->wq->td;
@@ -1144,7 +1176,7 @@ static int gz_init_worker(struct submit_worker *sw)
 }
 
 static struct workqueue_ops log_compress_wq_ops = {
-       .fn             = gz_work,
+       .fn             = gz_work_async,
        .init_worker_fn = gz_init_worker,
        .nice           = 1,
 };
@@ -1170,52 +1202,69 @@ void iolog_compress_exit(struct thread_data *td)
  * Queue work item to compress the existing log entries. We reset the
  * current log to a small size, and reference the existing log in the
  * data that we queue for compression. Once compression has been done,
- * this old log is freed. If called with wait == 1, will not return until
- * the log compression has completed.
+ * this old log is freed. iolog_flush() compresses all of the currently
+ * stored log chunks inline and does not return until that compression
+ * has completed.
  */
-int iolog_flush(struct io_log *log, int wait)
+static int iolog_flush(struct io_log *log)
 {
        struct iolog_flush_data *data;
 
-       io_u_quiesce(log->td);
-
        data = malloc(sizeof(*data));
        if (!data)
                return 1;
 
        data->log = log;
+       data->free = false;
 
-       data->samples = log->log;
-       data->nr_samples = log->nr_samples;
-
-       log->nr_samples = 0;
-       log->max_samples = 128;
-       log->log = malloc(log->max_samples * log_entry_sz(log));
+       while (!flist_empty(&log->io_logs)) {
+               struct io_logs *cur_log;
 
-       data->wait = wait;
-       if (data->wait) {
-               pthread_mutex_init(&data->lock, NULL);
-               pthread_cond_init(&data->cv, NULL);
-               data->done = 0;
-               data->refs = 2;
-       }
+               cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+               flist_del_init(&cur_log->list);
 
-       workqueue_enqueue(&log->td->log_compress_wq, &data->work);
+               data->samples = cur_log->log;
+               data->nr_samples = cur_log->nr_samples;
 
-       if (wait) {
-               pthread_mutex_lock(&data->lock);
-               while (!data->done)
-                       pthread_cond_wait(&data->cv, &data->lock);
+               cur_log->nr_samples = 0;
+               cur_log->max_samples = 0;
+               cur_log->log = NULL;
 
-               drop_data_unlock(data);
+               gz_work(data);
        }
 
+       free(data);
        return 0;
 }
 
+int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
+{
+       struct iolog_flush_data *data;
+
+       data = malloc(sizeof(*data));
+       if (!data)
+               return 1;
+
+       data->log = log;
+
+       data->samples = cur_log->log;
+       data->nr_samples = cur_log->nr_samples;
+       data->free = true;
+
+       cur_log->nr_samples = cur_log->max_samples = 0;
+       cur_log->log = NULL;
+
+       workqueue_enqueue(&log->td->log_compress_wq, &data->work);
+       return 0;
+}
 #else
 
-int iolog_flush(struct io_log *log, int wait)
+static int iolog_flush(struct io_log *log)
+{
+       return 1;
+}
+
+int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
 {
        return 1;
 }
@@ -1231,6 +1280,29 @@ void iolog_compress_exit(struct thread_data *td)
 
 #endif
 
+struct io_logs *iolog_cur_log(struct io_log *log)
+{
+       if (flist_empty(&log->io_logs))
+               return NULL;
+
+       return flist_last_entry(&log->io_logs, struct io_logs, list);
+}
+
+uint64_t iolog_nr_samples(struct io_log *iolog)
+{
+       struct flist_head *entry;
+       uint64_t ret = 0;
+
+       flist_for_each(entry, &iolog->io_logs) {
+               struct io_logs *cur_log;
+
+               cur_log = flist_entry(entry, struct io_logs, list);
+               ret += cur_log->nr_samples;
+       }
+
+       return ret;
+}
+
 static int __write_log(struct thread_data *td, struct io_log *log, int try)
 {
        if (log)
@@ -1239,29 +1311,74 @@ static int __write_log(struct thread_data *td, struct io_log *log, int try)
        return 0;
 }
 
-static int write_iops_log(struct thread_data *td, int try)
+static int write_iops_log(struct thread_data *td, int try, bool unit_log)
 {
-       return __write_log(td, td->iops_log, try);
+       int ret;
+
+       if (per_unit_log(td->iops_log) != unit_log)
+               return 0;
+
+       ret = __write_log(td, td->iops_log, try);
+       if (!ret)
+               td->iops_log = NULL;
+
+       return ret;
 }
 
-static int write_slat_log(struct thread_data *td, int try)
+static int write_slat_log(struct thread_data *td, int try, bool unit_log)
 {
-       return __write_log(td, td->slat_log, try);
+       int ret;
+
+       if (!unit_log)
+               return 0;
+
+       ret = __write_log(td, td->slat_log, try);
+       if (!ret)
+               td->slat_log = NULL;
+
+       return ret;
 }
 
-static int write_clat_log(struct thread_data *td, int try)
+static int write_clat_log(struct thread_data *td, int try, bool unit_log)
 {
-       return __write_log(td, td->clat_log, try);
+       int ret;
+
+       if (!unit_log)
+               return 0;
+
+       ret = __write_log(td, td->clat_log, try);
+       if (!ret)
+               td->clat_log = NULL;
+
+       return ret;
 }
 
-static int write_lat_log(struct thread_data *td, int try)
+static int write_lat_log(struct thread_data *td, int try, bool unit_log)
 {
-       return __write_log(td, td->lat_log, try);
+       int ret;
+
+       if (!unit_log)
+               return 0;
+
+       ret = __write_log(td, td->lat_log, try);
+       if (!ret)
+               td->lat_log = NULL;
+
+       return ret;
 }
 
-static int write_bandw_log(struct thread_data *td, int try)
+static int write_bandw_log(struct thread_data *td, int try, bool unit_log)
 {
-       return __write_log(td, td->bw_log, try);
+       int ret;
+
+       if (per_unit_log(td->bw_log) != unit_log)
+               return 0;
+
+       ret = __write_log(td, td->bw_log, try);
+       if (!ret)
+               td->bw_log = NULL;
+
+       return ret;
 }
 
 enum {
@@ -1276,7 +1393,7 @@ enum {
 
 struct log_type {
        unsigned int mask;
-       int (*fn)(struct thread_data *, int);
+       int (*fn)(struct thread_data *, int, bool);
 };
 
 static struct log_type log_types[] = {
@@ -1302,7 +1419,7 @@ static struct log_type log_types[] = {
        },
 };
 
-void fio_writeout_logs(struct thread_data *td)
+void td_writeout_logs(struct thread_data *td, bool unit_logs)
 {
        unsigned int log_mask = 0;
        unsigned int log_left = ALL_LOG_NR;
@@ -1310,7 +1427,7 @@ void fio_writeout_logs(struct thread_data *td)
 
        old_state = td_bump_runstate(td, TD_FINISHING);
 
-       finalize_logs(td);
+       finalize_logs(td, unit_logs);
 
        while (log_left) {
                int prev_log_left = log_left;
@@ -1320,7 +1437,7 @@ void fio_writeout_logs(struct thread_data *td)
                        int ret;
 
                        if (!(log_mask & lt->mask)) {
-                               ret = lt->fn(td, log_left != 1);
+                               ret = lt->fn(td, log_left != 1, unit_logs);
                                if (!ret) {
                                        log_left--;
                                        log_mask |= lt->mask;
@@ -1334,3 +1451,12 @@ void fio_writeout_logs(struct thread_data *td)
 
        td_restore_runstate(td, old_state);
 }
+
+void fio_writeout_logs(bool unit_logs)
+{
+       struct thread_data *td;
+       int i;
+
+       for_each_td(td, i)
+               td_writeout_logs(td, unit_logs);
+}
diff --git a/iolog.h b/iolog.h
index 297daf5..0da7067 100644 (file)
--- a/iolog.h
+++ b/iolog.h
@@ -41,6 +41,16 @@ enum {
        IO_LOG_TYPE_IOPS,
 };
 
+#define DEF_LOG_ENTRIES                1024
+#define MAX_LOG_ENTRIES                (1024 * DEF_LOG_ENTRIES)
+
+struct io_logs {
+       struct flist_head list;
+       uint64_t nr_samples;
+       uint64_t max_samples;
+       void *log;
+};
+
 /*
  * Dynamically growing data sample log
  */
@@ -48,9 +58,14 @@ struct io_log {
        /*
         * Entries already logged
         */
-       uint64_t nr_samples;
-       uint64_t max_samples;
-       void *log;
+       struct flist_head io_logs;
+       uint32_t cur_log_max;
+
+       /*
+        * When the current log runs out of space, store events here until
+        * we have a chance to regrow
+        */
+       struct io_logs *pending;
 
        unsigned int log_ddir_mask;
 
@@ -63,7 +78,7 @@ struct io_log {
        /*
         * If we fail extending the log, stop collecting more entries.
         */
-       unsigned int disabled;
+       bool disabled;
 
        /*
         * Log offsets
@@ -126,10 +141,15 @@ static inline struct io_sample *__get_sample(void *samples, int log_offset,
        return (struct io_sample *) ((char *) samples + sample_offset);
 }
 
+struct io_logs *iolog_cur_log(struct io_log *);
+uint64_t iolog_nr_samples(struct io_log *);
+void regrow_logs(struct thread_data *);
+
 static inline struct io_sample *get_sample(struct io_log *iolog,
+                                          struct io_logs *cur_log,
                                           uint64_t sample)
 {
-       return __get_sample(iolog->log, iolog->log_offset, sample);
+       return __get_sample(cur_log->log, iolog->log_offset, sample);
 }
 
 enum {
@@ -205,13 +225,19 @@ struct log_params {
        int log_compress;
 };
 
-extern void finalize_logs(struct thread_data *td);
+static inline bool per_unit_log(struct io_log *log)
+{
+       return log && !log->avg_msec;
+}
+
+extern void finalize_logs(struct thread_data *td, bool);
 extern void setup_log(struct io_log **, struct log_params *, const char *);
 extern void flush_log(struct io_log *, int);
 extern void flush_samples(FILE *, void *, uint64_t);
 extern void free_log(struct io_log *);
-extern void fio_writeout_logs(struct thread_data *);
-extern int iolog_flush(struct io_log *, int);
+extern void fio_writeout_logs(bool);
+extern void td_writeout_logs(struct thread_data *, bool);
+extern int iolog_cur_flush(struct io_log *, struct io_logs *);
 
 static inline void init_ipo(struct io_piece *ipo)
 {
diff --git a/lib/gauss.c b/lib/gauss.c
index afd0490..f974490 100644 (file)
--- a/lib/gauss.c
+++ b/lib/gauss.c
@@ -38,7 +38,10 @@ unsigned long long gauss_next(struct gauss_state *gs)
                sum += dev;
        }
 
-       return __hash_u64(sum) % gs->nranges;
+       if (!gs->disable_hash)
+               sum = __hash_u64(sum);
+
+       return sum % gs->nranges;
 }
 
 void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
@@ -54,3 +57,8 @@ void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
                        gs->stddev = nranges / 2;
        }
 }
+
+void gauss_disable_hash(struct gauss_state *gs)
+{
+       gs->disable_hash = true;
+}
diff --git a/lib/gauss.h b/lib/gauss.h
index a76df3f..478aa14 100644 (file)
--- a/lib/gauss.h
+++ b/lib/gauss.h
@@ -8,10 +8,12 @@ struct gauss_state {
        struct frand_state r;
        uint64_t nranges;
        unsigned int stddev;
+       bool disable_hash;
 };
 
 void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
                unsigned int seed);
 unsigned long long gauss_next(struct gauss_state *gs);
+void gauss_disable_hash(struct gauss_state *gs);
 
 #endif
diff --git a/lib/rand.c b/lib/rand.c
index 1b661a8..9c3e0d6 100644 (file)
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -76,7 +76,7 @@ static void __init_rand64(struct taus258_state *state, uint64_t seed)
                __rand64(state);
 }
 
-void init_rand(struct frand_state *state, int use64)
+void init_rand(struct frand_state *state, bool use64)
 {
        state->use64 = use64;
 
@@ -86,7 +86,7 @@ void init_rand(struct frand_state *state, int use64)
                __init_rand64(&state->state64, 1);
 }
 
-void init_rand_seed(struct frand_state *state, unsigned int seed, int use64)
+void init_rand_seed(struct frand_state *state, unsigned int seed, bool use64)
 {
        state->use64 = use64;
 
diff --git a/lib/rand.h b/lib/rand.h
index a95bd28..bff4a35 100644 (file)
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -2,6 +2,8 @@
 #define FIO_RAND_H
 
 #include <inttypes.h>
+#include <assert.h>
+#include "types.h"
 #include "../arch/arch.h"
 
 #define FRAND32_MAX    (-1U)
@@ -23,10 +25,6 @@ struct frand_state {
        };
 };
 
-struct frand64_state {
-       uint64_t s1, s2, s3, s4, s5;
-};
-
 static inline uint64_t rand_max(struct frand_state *state)
 {
        if (state->use64)
@@ -117,8 +115,21 @@ static inline double __rand_0_1(struct frand_state *state)
        }
 }
 
-extern void init_rand(struct frand_state *, int);
-extern void init_rand_seed(struct frand_state *, unsigned int seed, int);
+/*
+ * Generate a random value between 'start' and 'end', both inclusive
+ */
+static inline int rand32_between(struct frand_state *state, int start, int end)
+{
+       uint32_t r;
+
+       assert(!state->use64);
+
+       r = __rand32(&state->state32);
+       return start + (int) ((double)end * (r / (FRAND32_MAX + 1.0)));
+}
+
+extern void init_rand(struct frand_state *, bool);
+extern void init_rand_seed(struct frand_state *, unsigned int seed, bool);
 extern void __fill_random_buf(void *buf, unsigned int len, unsigned long seed);
 extern unsigned long fill_random_buf(struct frand_state *, void *buf, unsigned int len);
 extern void __fill_random_buf_percentage(unsigned long, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
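Several conversions in this diff (rwmix, dedupe, perc_rand, the zoned selector) now funnel through the new rand32_between() helper with the 1..100 range. A small standalone check of that case, assuming it is compiled from the top of the fio tree and linked with lib/rand.c; the seed and sample count are arbitrary.

#include <stdio.h>
#include "lib/rand.h"

int main(void)
{
        struct frand_state state;
        unsigned int counts[101] = { 0 };
        int i;

        /* 32-bit state: rand32_between() asserts !state->use64 */
        init_rand_seed(&state, 0x8989, false);

        for (i = 0; i < 1000000; i++)
                counts[rand32_between(&state, 1, 100)]++;

        printf("v=1: %u  v=50: %u  v=100: %u\n",
               counts[1], counts[50], counts[100]);
        return 0;
}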
diff --git a/lib/zipf.c b/lib/zipf.c
index d8e72b1..681df70 100644 (file)
--- a/lib/zipf.c
+++ b/lib/zipf.c
@@ -69,7 +69,12 @@ unsigned long long zipf_next(struct zipf_state *zs)
        else
                val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha));
 
-       return (__hash_u64(val - 1) + zs->rand_off) % zs->nranges;
+       val--;
+
+       if (!zs->disable_hash)
+               val = __hash_u64(val);
+
+       return (val + zs->rand_off) % zs->nranges;
 }
 
 void pareto_init(struct zipf_state *zs, unsigned long nranges, double h,
@@ -82,7 +87,17 @@ void pareto_init(struct zipf_state *zs, unsigned long nranges, double h,
 unsigned long long pareto_next(struct zipf_state *zs)
 {
        double rand = (double) __rand(&zs->rand) / (double) FRAND32_MAX;
-       unsigned long long n = zs->nranges - 1;
+       unsigned long long n;
+
+       n = (zs->nranges - 1) * pow(rand, zs->pareto_pow);
+
+       if (!zs->disable_hash)
+               n = __hash_u64(n);
 
-       return (__hash_u64(n * pow(rand, zs->pareto_pow)) + zs->rand_off) % zs->nranges;
+       return (n + zs->rand_off)  % zs->nranges;
+}
+
+void zipf_disable_hash(struct zipf_state *zs)
+{
+       zs->disable_hash = true;
 }
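The new disable_hash switch matters because zipf_next()/pareto_next()/gauss_next() normally pass the drawn value through __hash_u64() to scatter the hot spots across the range: that is what block offsets want, but it would flatten the per-file skew once the non-uniform file_service_type result is folded down by FIO_FSERVICE_SHIFT, whereas unhashed ranks stay contiguous and keep the skew concentrated on a few files. A usage sketch mirroring init_rand_file_service(), assuming the fio tree (linked with lib/zipf.c and lib/rand.c); the range, theta and seed are illustrative.

#include <stdio.h>
#include "lib/zipf.h"

int main(void)
{
        struct zipf_state zs;
        unsigned long hits[8] = { 0 };
        int i;

        /* 8 "files", theta 1.2, fixed seed; with hashing disabled the
         * popular ranks stay adjacent (a fixed offset applied at init
         * may rotate which index is hottest) instead of being scattered
         * by __hash_u64(), so one or two neighbouring indices should
         * take most of the hits. */
        zipf_init(&zs, 8, 1.2, 0x1234);
        zipf_disable_hash(&zs);

        for (i = 0; i < 100000; i++)
                hits[zipf_next(&zs)]++;

        for (i = 0; i < 8; i++)
                printf("index %d: %lu\n", i, hits[i]);
        return 0;
}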
diff --git a/lib/zipf.h b/lib/zipf.h
index f98ad81..af2d0e6 100644 (file)
--- a/lib/zipf.h
+++ b/lib/zipf.h
@@ -12,6 +12,7 @@ struct zipf_state {
        double pareto_pow;
        struct frand_state rand;
        uint64_t rand_off;
+       bool disable_hash;
 };
 
 void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, unsigned int seed);
@@ -19,5 +20,6 @@ unsigned long long zipf_next(struct zipf_state *zs);
 
 void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, unsigned int seed);
 unsigned long long pareto_next(struct zipf_state *zs);
+void zipf_disable_hash(struct zipf_state *zs);
 
 #endif
index c626d15..55762d7 100644 (file)
--- a/libfio.c
+++ b/libfio.c
@@ -33,6 +33,7 @@
 #include "smalloc.h"
 #include "os/os.h"
 #include "filelock.h"
+#include "helper_thread.h"
 
 /*
  * Just expose an empty list, if the OS does not support disk util stats
@@ -146,9 +147,12 @@ void reset_all_stats(struct thread_data *td)
        fio_gettime(&tv, NULL);
        memcpy(&td->epoch, &tv, sizeof(tv));
        memcpy(&td->start, &tv, sizeof(tv));
+       memcpy(&td->iops_sample_time, &tv, sizeof(tv));
+       memcpy(&td->bw_sample_time, &tv, sizeof(tv));
 
        lat_target_reset(td);
        clear_rusage_stat(td);
+       helper_reset();
 }
 
 void reset_fio_state(void)
index 5060223..c04d7df 100644 (file)
--- a/memory.c
+++ b/memory.c
@@ -229,7 +229,17 @@ int allocate_io_mem(struct thread_data *td)
 
        dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem);
 
-       if (td->o.mem_type == MEM_MALLOC)
+       /*
+        * If the IO engine has hooks to allocate/free memory, use those. But
+        * error out if the user explicitly asked for something else.
+        */
+       if (td->io_ops->iomem_alloc) {
+               if (fio_option_is_set(&td->o, mem_type)) {
+                       log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n");
+                       ret = 1;
+               } else
+                       ret = td->io_ops->iomem_alloc(td, total_mem);
+       } else if (td->o.mem_type == MEM_MALLOC)
                ret = alloc_mem_malloc(td, total_mem);
        else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE)
                ret = alloc_mem_shm(td, total_mem);
@@ -255,7 +265,10 @@ void free_io_mem(struct thread_data *td)
        if (td->o.odirect || td->o.oatomic)
                total_mem += page_mask;
 
-       if (td->o.mem_type == MEM_MALLOC)
+       if (td->io_ops->iomem_alloc) {
+               if (td->io_ops->iomem_free)
+                       td->io_ops->iomem_free(td);
+       } else if (td->o.mem_type == MEM_MALLOC)
                free_mem_malloc(td);
        else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE)
                free_mem_shm(td);
diff --git a/mutex.c b/mutex.c
index a48e37d..16107dd 100644 (file)
--- a/mutex.c
+++ b/mutex.c
@@ -136,6 +136,7 @@ int fio_mutex_down_timeout(struct fio_mutex *mutex, unsigned int msecs)
        if (!ret) {
                mutex->value--;
                pthread_mutex_unlock(&mutex->lock);
+               return 0;
        }
 
        pthread_mutex_unlock(&mutex->lock);
index 0bf1390..07589c4 100644 (file)
--- a/options.c
+++ b/options.c
 
 char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 };
 
+struct pattern_fmt_desc fmt_desc[] = {
+       {
+               .fmt   = "%o",
+               .len   = FIELD_SIZE(struct io_u *, offset),
+               .paste = paste_blockoff
+       }
+};
+
 /*
  * Check if mmap/mmaphuge has a :/foo/bar/file at the end. If so, return that.
  */
@@ -44,35 +52,28 @@ static int bs_cmp(const void *p1, const void *p2)
        return (int) bsp1->perc - (int) bsp2->perc;
 }
 
-static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
+struct split {
+       unsigned int nr;
+       unsigned int val1[100];
+       unsigned int val2[100];
+};
+
+static int split_parse_ddir(struct thread_options *o, struct split *split,
+                           enum fio_ddir ddir, char *str)
 {
-       struct bssplit *bssplit;
-       unsigned int i, perc, perc_missing;
-       unsigned int max_bs, min_bs;
+       unsigned int i, perc;
        long long val;
        char *fname;
 
-       o->bssplit_nr[ddir] = 4;
-       bssplit = malloc(4 * sizeof(struct bssplit));
+       split->nr = 0;
 
        i = 0;
-       max_bs = 0;
-       min_bs = -1;
        while ((fname = strsep(&str, ":")) != NULL) {
                char *perc_str;
 
                if (!strlen(fname))
                        break;
 
-               /*
-                * grow struct buffer, if needed
-                */
-               if (i == o->bssplit_nr[ddir]) {
-                       o->bssplit_nr[ddir] <<= 1;
-                       bssplit = realloc(bssplit, o->bssplit_nr[ddir]
-                                                 * sizeof(struct bssplit));
-               }
-
                perc_str = strstr(fname, "/");
                if (perc_str) {
                        *perc_str = '\0';
@@ -87,28 +88,53 @@ static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
 
                if (str_to_decimal(fname, &val, 1, o, 0, 0)) {
                        log_err("fio: bssplit conversion failed\n");
-                       free(bssplit);
                        return 1;
                }
 
-               if (val > max_bs)
-                       max_bs = val;
-               if (val < min_bs)
-                       min_bs = val;
-
-               bssplit[i].bs = val;
-               bssplit[i].perc = perc;
+               split->val1[i] = val;
+               split->val2[i] = perc;
                i++;
+               if (i == 100)
+                       break;
        }
 
-       o->bssplit_nr[ddir] = i;
+       split->nr = i;
+       return 0;
+}
+
+static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str)
+{
+       unsigned int i, perc, perc_missing;
+       unsigned int max_bs, min_bs;
+       struct split split;
+
+       memset(&split, 0, sizeof(split));
+
+       if (split_parse_ddir(o, &split, ddir, str))
+               return 1;
+       if (!split.nr)
+               return 0;
+
+       max_bs = 0;
+       min_bs = -1;
+       o->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
+       o->bssplit_nr[ddir] = split.nr;
+       for (i = 0; i < split.nr; i++) {
+               if (split.val1[i] > max_bs)
+                       max_bs = split.val1[i];
+               if (split.val1[i] < min_bs)
+                       min_bs = split.val1[i];
+
+               o->bssplit[ddir][i].bs = split.val1[i];
+               o->bssplit[ddir][i].perc = split.val2[i];
+       }
 
        /*
         * Now check if the percentages add up, and how much is missing
         */
        perc = perc_missing = 0;
        for (i = 0; i < o->bssplit_nr[ddir]; i++) {
-               struct bssplit *bsp = &bssplit[i];
+               struct bssplit *bsp = &o->bssplit[ddir][i];
 
                if (bsp->perc == -1U)
                        perc_missing++;
@@ -118,7 +144,8 @@ static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
 
        if (perc > 100 && perc_missing > 1) {
                log_err("fio: bssplit percentages add to more than 100%%\n");
-               free(bssplit);
+               free(o->bssplit[ddir]);
+               o->bssplit[ddir] = NULL;
                return 1;
        }
 
@@ -130,7 +157,7 @@ static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
                if (perc_missing == 1 && o->bssplit_nr[ddir] == 1)
                        perc = 100;
                for (i = 0; i < o->bssplit_nr[ddir]; i++) {
-                       struct bssplit *bsp = &bssplit[i];
+                       struct bssplit *bsp = &o->bssplit[ddir][i];
 
                        if (bsp->perc == -1U)
                                bsp->perc = (100 - perc) / perc_missing;
@@ -143,60 +170,78 @@ static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
        /*
         * now sort based on percentages, for ease of lookup
         */
-       qsort(bssplit, o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp);
-       o->bssplit[ddir] = bssplit;
+       qsort(o->bssplit[ddir], o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp);
        return 0;
 }
 
-static int str_bssplit_cb(void *data, const char *input)
+typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *);
+
+static int str_split_parse(struct thread_data *td, char *str, split_parse_fn *fn)
 {
-       struct thread_data *td = data;
-       char *str, *p, *odir, *ddir;
+       char *odir, *ddir;
        int ret = 0;
 
-       if (parse_dryrun())
-               return 0;
-
-       p = str = strdup(input);
-
-       strip_blank_front(&str);
-       strip_blank_end(str);
-
        odir = strchr(str, ',');
        if (odir) {
                ddir = strchr(odir + 1, ',');
                if (ddir) {
-                       ret = bssplit_ddir(&td->o, DDIR_TRIM, ddir + 1);
+                       ret = fn(&td->o, DDIR_TRIM, ddir + 1);
                        if (!ret)
                                *ddir = '\0';
                } else {
                        char *op;
 
                        op = strdup(odir + 1);
-                       ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+                       ret = fn(&td->o, DDIR_TRIM, op);
 
                        free(op);
                }
                if (!ret)
-                       ret = bssplit_ddir(&td->o, DDIR_WRITE, odir + 1);
+                       ret = fn(&td->o, DDIR_WRITE, odir + 1);
                if (!ret) {
                        *odir = '\0';
-                       ret = bssplit_ddir(&td->o, DDIR_READ, str);
+                       ret = fn(&td->o, DDIR_READ, str);
                }
        } else {
                char *op;
 
                op = strdup(str);
-               ret = bssplit_ddir(&td->o, DDIR_WRITE, op);
+               ret = fn(&td->o, DDIR_WRITE, op);
                free(op);
 
                if (!ret) {
                        op = strdup(str);
-                       ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+                       ret = fn(&td->o, DDIR_TRIM, op);
                        free(op);
                }
                if (!ret)
-                       ret = bssplit_ddir(&td->o, DDIR_READ, str);
+                       ret = fn(&td->o, DDIR_READ, str);
+       }
+
+       return ret;
+}
+
+static int str_bssplit_cb(void *data, const char *input)
+{
+       struct thread_data *td = data;
+       char *str, *p;
+       int ret = 0;
+
+       p = str = strdup(input);
+
+       strip_blank_front(&str);
+       strip_blank_end(str);
+
+       ret = str_split_parse(td, str, bssplit_ddir);
+
+       if (parse_dryrun()) {
+               int i;
+
+               for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+                       free(td->o.bssplit[i]);
+                       td->o.bssplit[i] = NULL;
+                       td->o.bssplit_nr[i] = 0;
+               }
        }
 
        free(p);
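
The percentage handling in bssplit_ddir() above is easiest to see in isolation. A
minimal standalone sketch of the fill-in for unset percentages, using a simplified
stand-in struct rather than the fio types:

#include <stdio.h>

/* simplified stand-in for struct bssplit: block size plus percentage, -1U == unset */
struct entry {
        unsigned int bs;
        unsigned int perc;
};

int main(void)
{
        /* e.g. three block sizes where only the first has an explicit percentage */
        struct entry e[] = { { 4096, 50 }, { 16384, -1U }, { 65536, -1U } };
        unsigned int i, n = 3, perc = 0, missing = 0;

        for (i = 0; i < n; i++) {
                if (e[i].perc == -1U)
                        missing++;
                else
                        perc += e[i].perc;
        }

        /* the remaining percentage is split evenly over the unset entries */
        for (i = 0; i < n; i++)
                if (e[i].perc == -1U)
                        e[i].perc = (100 - perc) / missing;

        for (i = 0; i < n; i++)
                printf("bs=%u perc=%u%%\n", e[i].bs, e[i].perc);

        return 0;
}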
@@ -679,12 +724,77 @@ out:
 static int str_fst_cb(void *data, const char *str)
 {
        struct thread_data *td = data;
-       char *nr = get_opt_postfix(str);
+       double val;
+       bool done = false;
+       char *nr;
 
        td->file_service_nr = 1;
-       if (nr) {
-               td->file_service_nr = atoi(nr);
+
+       switch (td->o.file_service_type) {
+       case FIO_FSERVICE_RANDOM:
+       case FIO_FSERVICE_RR:
+       case FIO_FSERVICE_SEQ:
+               nr = get_opt_postfix(str);
+               if (nr) {
+                       td->file_service_nr = atoi(nr);
+                       free(nr);
+               }
+               done = true;
+               break;
+       case FIO_FSERVICE_ZIPF:
+               val = FIO_DEF_ZIPF;
+               break;
+       case FIO_FSERVICE_PARETO:
+               val = FIO_DEF_PARETO;
+               break;
+       case FIO_FSERVICE_GAUSS:
+               val = 0.0;
+               break;
+       default:
+               log_err("fio: bad file service type: %d\n", td->o.file_service_type);
+               return 1;
+       }
+
+       if (done)
+               return 0;
+
+       nr = get_opt_postfix(str);
+       if (nr && !str_to_float(nr, &val, 0)) {
+               log_err("fio: file service type random postfix parsing failed\n");
                free(nr);
+               return 1;
+       }
+
+       free(nr);
+
+       switch (td->o.file_service_type) {
+       case FIO_FSERVICE_ZIPF:
+               if (val == 1.00) {
+                       log_err("fio: zipf theta must be different than 1.0\n");
+                       return 1;
+               }
+               if (parse_dryrun())
+                       return 0;
+               td->zipf_theta = val;
+               break;
+       case FIO_FSERVICE_PARETO:
+               if (val <= 0.00 || val >= 1.00) {
+                       log_err("fio: pareto input out of range (0 < input < 1.0)\n");
+                       return 1;
+               }
+               if (parse_dryrun())
+                       return 0;
+               td->pareto_h = val;
+               break;
+       case FIO_FSERVICE_GAUSS:
+               if (val < 0.00 || val >= 100.00) {
+                       log_err("fio: normal deviation out of range (0 <= input < 100.0)\n");
+                       return 1;
+               }
+               if (parse_dryrun())
+                       return 0;
+               td->gauss_dev = val;
+               break;
        }
 
        return 0;
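
A rough standalone sketch of the float-postfix path added above: everything after
the ':' in something like "zipf:1.2" is parsed as a double and range-checked before
it is stored (simplified parsing, not the fio helpers):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        const char *input = "zipf:1.2";
        const char *postfix = strchr(input, ':');
        double theta;

        if (!postfix) {
                /* no postfix given, the distribution default would be used */
                printf("using default zipf theta\n");
                return 0;
        }

        theta = strtod(postfix + 1, NULL);
        if (theta == 1.0) {
                /* mirrors the check made in the option callback */
                fprintf(stderr, "zipf theta must be different than 1.0\n");
                return 1;
        }

        printf("zipf theta = %.2f\n", theta);
        return 0;
}
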
@@ -706,21 +816,208 @@ static int str_sfr_cb(void *data, const char *str)
 }
 #endif
 
+static int zone_cmp(const void *p1, const void *p2)
+{
+       const struct zone_split *zsp1 = p1;
+       const struct zone_split *zsp2 = p2;
+
+       return (int) zsp2->access_perc - (int) zsp1->access_perc;
+}
+
+static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir,
+                          char *str)
+{
+       unsigned int i, perc, perc_missing, sperc, sperc_missing;
+       struct split split;
+
+       memset(&split, 0, sizeof(split));
+
+       if (split_parse_ddir(o, &split, ddir, str))
+               return 1;
+       if (!split.nr)
+               return 0;
+
+       o->zone_split[ddir] = malloc(split.nr * sizeof(struct zone_split));
+       o->zone_split_nr[ddir] = split.nr;
+       for (i = 0; i < split.nr; i++) {
+               o->zone_split[ddir][i].access_perc = split.val1[i];
+               o->zone_split[ddir][i].size_perc = split.val2[i];
+       }
+
+       /*
+        * Now check if the percentages add up, and how much is missing
+        */
+       perc = perc_missing = 0;
+       sperc = sperc_missing = 0;
+       for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+               struct zone_split *zsp = &o->zone_split[ddir][i];
+
+               if (zsp->access_perc == (uint8_t) -1U)
+                       perc_missing++;
+               else
+                       perc += zsp->access_perc;
+
+               if (zsp->size_perc == (uint8_t) -1U)
+                       sperc_missing++;
+               else
+                       sperc += zsp->size_perc;
+
+       }
+
+       if (perc > 100 || sperc > 100) {
+               log_err("fio: zone_split percentages add to more than 100%%\n");
+               free(o->zone_split[ddir]);
+               o->zone_split[ddir] = NULL;
+               return 1;
+       }
+       if (perc < 100) {
+               log_err("fio: access percentages don't add up to 100 for zoned "
+                       "random distribution (got=%u)\n", perc);
+               free(o->zone_split[ddir]);
+               o->zone_split[ddir] = NULL;
+               return 1;
+       }
+
+       /*
+        * If values didn't have a percentage set, divide the remains between
+        * them.
+        */
+       if (perc_missing) {
+               if (perc_missing == 1 && o->zone_split_nr[ddir] == 1)
+                       perc = 100;
+               for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+                       struct zone_split *zsp = &o->zone_split[ddir][i];
+
+                       if (zsp->access_perc == (uint8_t) -1U)
+                               zsp->access_perc = (100 - perc) / perc_missing;
+               }
+       }
+       if (sperc_missing) {
+               if (sperc_missing == 1 && o->zone_split_nr[ddir] == 1)
+                       sperc = 100;
+               for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+                       struct zone_split *zsp = &o->zone_split[ddir][i];
+
+                       if (zsp->size_perc == (uint8_t) -1U)
+                               zsp->size_perc = (100 - sperc) / sperc_missing;
+               }
+       }
+
+       /*
+        * now sort based on percentages, for ease of lookup
+        */
+       qsort(o->zone_split[ddir], o->zone_split_nr[ddir], sizeof(struct zone_split), zone_cmp);
+       return 0;
+}
+
+static void __td_zone_gen_index(struct thread_data *td, enum fio_ddir ddir)
+{
+       unsigned int i, j, sprev, aprev;
+
+       td->zone_state_index[ddir] = malloc(sizeof(struct zone_split_index) * 100);
+
+       sprev = aprev = 0;
+       for (i = 0; i < td->o.zone_split_nr[ddir]; i++) {
+               struct zone_split *zsp = &td->o.zone_split[ddir][i];
+
+               for (j = aprev; j < aprev + zsp->access_perc; j++) {
+                       struct zone_split_index *zsi = &td->zone_state_index[ddir][j];
+
+                       zsi->size_perc = sprev + zsp->size_perc;
+                       zsi->size_perc_prev = sprev;
+               }
+
+               aprev += zsp->access_perc;
+               sprev += zsp->size_perc;
+       }
+}
+
+/*
+ * Generate state table for indexes, so we don't have to do it inline from
+ * the hot IO path
+ */
+static void td_zone_gen_index(struct thread_data *td)
+{
+       int i;
+
+       td->zone_state_index = malloc(DDIR_RWDIR_CNT *
+                                       sizeof(struct zone_split_index *));
+
+       for (i = 0; i < DDIR_RWDIR_CNT; i++)
+               __td_zone_gen_index(td, i);
+}
+
+static int parse_zoned_distribution(struct thread_data *td, const char *input)
+{
+       char *str, *p;
+       int i, ret = 0;
+
+       p = str = strdup(input);
+
+       strip_blank_front(&str);
+       strip_blank_end(str);
+
+       /* We expect it to start like that, bail if not */
+       if (strncmp(str, "zoned:", 6)) {
+               log_err("fio: mismatch in zoned input <%s>\n", str);
+               free(p);
+               return 1;
+       }
+       str += strlen("zoned:");
+
+       ret = str_split_parse(td, str, zone_split_ddir);
+
+       free(p);
+
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               int j;
+
+               dprint(FD_PARSE, "zone ddir %d (nr=%u): \n", i, td->o.zone_split_nr[i]);
+
+               for (j = 0; j < td->o.zone_split_nr[i]; j++) {
+                       struct zone_split *zsp = &td->o.zone_split[i][j];
+
+                       dprint(FD_PARSE, "\t%d: %u/%u\n", j, zsp->access_perc,
+                                                               zsp->size_perc);
+               }
+       }
+
+       if (parse_dryrun()) {
+               int i;
+
+               for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+                       free(td->o.zone_split[i]);
+                       td->o.zone_split[i] = NULL;
+                       td->o.zone_split_nr[i] = 0;
+               }
+
+               return ret;
+       }
+
+       if (!ret)
+               td_zone_gen_index(td);
+       else {
+               for (i = 0; i < DDIR_RWDIR_CNT; i++)
+                       td->o.zone_split_nr[i] = 0;
+       }
+
+       return ret;
+}
+
 static int str_random_distribution_cb(void *data, const char *str)
 {
        struct thread_data *td = data;
        double val;
        char *nr;
 
-       if (parse_dryrun())
-               return 0;
-
        if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
                val = FIO_DEF_ZIPF;
        else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
                val = FIO_DEF_PARETO;
        else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
                val = 0.0;
+       else if (td->o.random_distribution == FIO_RAND_DIST_ZONED)
+               return parse_zoned_distribution(td, str);
        else
                return 0;
 
@@ -738,18 +1035,24 @@ static int str_random_distribution_cb(void *data, const char *str)
                        log_err("fio: zipf theta must different than 1.0\n");
                        return 1;
                }
+               if (parse_dryrun())
+                       return 0;
                td->o.zipf_theta.u.f = val;
        } else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) {
                if (val <= 0.00 || val >= 1.00) {
                        log_err("fio: pareto input out of range (0 < input < 1.0)\n");
                        return 1;
                }
+               if (parse_dryrun())
+                       return 0;
                td->o.pareto_h.u.f = val;
        } else {
-               if (val <= 0.00 || val >= 100.0) {
-                       log_err("fio: normal deviation out of range (0 < input < 100.0)\n");
+               if (val < 0.00 || val >= 100.0) {
+                       log_err("fio: normal deviation out of range (0 <= input < 100.0)\n");
                        return 1;
                }
+               if (parse_dryrun())
+                       return 0;
                td->o.gauss_dev.u.f = val;
        }
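
td_zone_gen_index() above flattens the parsed zone table into a 100-slot lookup, so
the hot I/O path can index it with a random value in 0..99 instead of scanning the
zones. A simplified standalone version (local stand-in types; the example zones are
made up for illustration):

#include <stdio.h>
#include <stdlib.h>

struct zsplit { unsigned char access_perc, size_perc; };
struct zindex { unsigned char size_perc, size_perc_prev; };

int main(void)
{
        /* e.g. 60% of accesses to the first 10% of the range, 30% to the next 20%, ... */
        struct zsplit zs[] = { { 60, 10 }, { 30, 20 }, { 10, 70 } };
        struct zindex *idx = calloc(100, sizeof(*idx));
        unsigned int i, j, aprev = 0, sprev = 0;

        if (!idx)
                return 1;

        for (i = 0; i < 3; i++) {
                /* every access percentile covered by this zone records its size range */
                for (j = aprev; j < aprev + zs[i].access_perc; j++) {
                        idx[j].size_perc = sprev + zs[i].size_perc;
                        idx[j].size_perc_prev = sprev;
                }
                aprev += zs[i].access_perc;
                sprev += zs[i].size_perc;
        }

        /* a random percentile now maps straight to its zone's size range */
        printf("access percentile 75 -> size range %u%%..%u%%\n",
               idx[75].size_perc_prev, idx[75].size_perc);

        free(idx);
        return 0;
}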
 
@@ -954,20 +1257,13 @@ static int str_dedupe_cb(void *data, unsigned long long *il)
 
 static int str_verify_pattern_cb(void *data, const char *input)
 {
-       struct pattern_fmt_desc fmt_desc[] = {
-               {
-                       .fmt   = "%o",
-                       .len   = FIELD_SIZE(struct io_u *, offset),
-                       .paste = paste_blockoff
-               }
-       };
        struct thread_data *td = data;
        int ret;
 
        td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt);
        ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern,
-                       MAX_PATTERN_SIZE, fmt_desc, sizeof(fmt_desc),
-                       td->o.verify_fmt, &td->o.verify_fmt_sz);
+                                    MAX_PATTERN_SIZE, fmt_desc, sizeof(fmt_desc),
+                                    td->o.verify_fmt, &td->o.verify_fmt_sz);
        if (ret < 0)
                return 1;
 
@@ -1240,6 +1536,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .help = "Use preadv/pwritev",
                          },
 #endif
+#ifdef CONFIG_PWRITEV2
+                         { .ival = "pvsync2",
+                           .help = "Use preadv2/pwritev2",
+                         },
+#endif
 #ifdef CONFIG_LIBAIO
                          { .ival = "libaio",
                            .help = "Linux native asynchronous IO",
@@ -1332,6 +1633,12 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                          { .ival = "libhdfs",
                            .help = "Hadoop Distributed Filesystem (HDFS) engine"
                          },
+#endif
+#ifdef CONFIG_PMEMBLK
+                         { .ival = "pmemblk",
+                           .help = "NVML libpmemblk based IO engine",
+                         },
+
 #endif
                          { .ival = "external",
                            .help = "Load external engine (append name)",
@@ -1704,6 +2011,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .oval = FIO_RAND_DIST_GAUSS,
                            .help = "Normal (gaussian) distribution",
                          },
+                         { .ival = "zoned",
+                           .oval = FIO_RAND_DIST_ZONED,
+                           .help = "Zoned random distribution",
+                         },
+
                },
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_RANDOM,
@@ -1773,7 +2085,19 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .posval = {
                          { .ival = "random",
                            .oval = FIO_FSERVICE_RANDOM,
-                           .help = "Choose a file at random",
+                           .help = "Choose a file at random (uniform)",
+                         },
+                         { .ival = "zipf",
+                           .oval = FIO_FSERVICE_ZIPF,
+                           .help = "Zipf randomized",
+                         },
+                         { .ival = "pareto",
+                           .oval = FIO_FSERVICE_PARETO,
+                           .help = "Pareto randomized",
+                         },
+                         { .ival = "gauss",
+                           .oval = FIO_FSERVICE_GAUSS,
+                           .help = "Normal (gaussian) distribution",
                          },
                          { .ival = "roundrobin",
                            .oval = FIO_FSERVICE_RR,
@@ -3073,6 +3397,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_LOG,
                .group  = FIO_OPT_G_INVALID,
        },
+       {
+               .name   = "log_max_value",
+               .lname  = "Log maximum instead of average",
+               .type   = FIO_OPT_BOOL,
+               .off1   = td_var_offset(log_max),
+               .help   = "Log max sample in a window instead of average",
+               .def    = "0",
+               .category = FIO_OPT_C_LOG,
+               .group  = FIO_OPT_G_INVALID,
+       },
        {
                .name   = "log_offset",
                .lname  = "Log offset of IO",
index 9e708f0..23c16b6 100644 (file)
@@ -6,6 +6,7 @@
 #include <sys/ioctl.h>
 #include <sys/uio.h>
 #include <sys/syscall.h>
+#include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <sys/mman.h>
 #include <unistd.h>
index d202e99..76d388e 100644 (file)
 
 typedef off_t off64_t;
 
-/* OS X as of 10.6 doesn't have the timer_* functions. 
- * Emulate the functionality using setitimer and sigaction here
- */
-
-#define MAX_TIMERS 64
-
 typedef unsigned int clockid_t;
-typedef unsigned int timer_t;
-
-struct itimerspec {
-       struct timespec it_value;
-       struct timespec it_interval;
-};
-
-static struct sigevent fio_timers[MAX_TIMERS];
-static unsigned int num_timers = 0;
-
-static void sig_alrm(int signum)
-{
-       union sigval sv;
-       
-       for (int i = 0; i < num_timers; i++) {
-               if (fio_timers[i].sigev_notify_function == NULL)
-                       continue;
-               
-               if (fio_timers[i].sigev_notify == SIGEV_THREAD)
-                       fio_timers[i].sigev_notify_function(sv);
-               else if (fio_timers[i].sigev_notify == SIGEV_SIGNAL)
-                       kill(getpid(), fio_timers[i].sigev_signo);
-       }
-}
-
-static inline int timer_settime(timer_t timerid, int flags,
-                               const struct itimerspec *value,
-                               struct itimerspec *ovalue)
-{
-       struct sigaction sa;
-       struct itimerval tv;
-       struct itimerval tv_out;
-       int rc;
-       
-       tv.it_interval.tv_sec = value->it_interval.tv_sec;
-       tv.it_interval.tv_usec = value->it_interval.tv_nsec / 1000;
-
-       tv.it_value.tv_sec = value->it_value.tv_sec;
-       tv.it_value.tv_usec = value->it_value.tv_nsec / 1000;
-
-       sa.sa_handler = sig_alrm;
-       sigemptyset(&sa.sa_mask);
-       sa.sa_flags = 0;
-       
-       rc = sigaction(SIGALRM, &sa, NULL);
-
-       if (!rc)
-               rc = setitimer(ITIMER_REAL, &tv, &tv_out);
-       
-       if (!rc && ovalue != NULL) {
-               ovalue->it_interval.tv_sec = tv_out.it_interval.tv_sec;
-               ovalue->it_interval.tv_nsec = tv_out.it_interval.tv_usec * 1000;
-               ovalue->it_value.tv_sec = tv_out.it_value.tv_sec;
-               ovalue->it_value.tv_nsec = tv_out.it_value.tv_usec * 1000;
-       }
-
-       return rc;
-}
-
-static inline int timer_delete(timer_t timer)
-{
-       return 0;
-}
 
 #define FIO_OS_DIRECTIO
 static inline int fio_set_odirect(int fd)
index 159c086..d049531 100644 (file)
 
 #include "windows/posix.h"
 
+/* Cygwin doesn't define rand_r if C99 or newer is being used */
+#if defined(WIN32) && !defined(rand_r)
+int rand_r(unsigned *);
+#endif
+
 #ifndef PTHREAD_STACK_MIN
 #define PTHREAD_STACK_MIN 65535
 #endif
diff --git a/os/os.h b/os/os.h
index fd47f22..9877383 100644 (file)
--- a/os/os.h
+++ b/os/os.h
@@ -151,7 +151,7 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #endif
 
 #ifndef FIO_PREFERRED_ENGINE
-#define FIO_PREFERRED_ENGINE   "sync"
+#define FIO_PREFERRED_ENGINE   "psync"
 #endif
 
 #ifndef FIO_OS_PATH_SEPARATOR
@@ -343,6 +343,14 @@ static inline unsigned long long get_fs_free_size(const char *path)
 }
 #endif
 
+#ifdef __powerpc64__
+#define FIO_HAVE_CPU_ONLINE_SYSCONF
+static inline unsigned int cpus_online(void)
+{
+        return sysconf(_SC_NPROCESSORS_CONF);
+}
+#endif
+
 #ifndef FIO_HAVE_CPU_ONLINE_SYSCONF
 static inline unsigned int cpus_online(void)
 {
index 299ca9b..44cc938 100755 (executable)
@@ -10,7 +10,7 @@
        <Product Id="*"
          Codepage="1252" Language="1033"
          Manufacturer="fio" Name="fio"
-         UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.3">
+         UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.9">
                <Package
                  Description="Flexible IO Tester"
                  InstallerVersion="301" Keywords="Installer,MSI,Database"
index 41fc480..fd3d9ab 100755 (executable)
@@ -243,12 +243,12 @@ void Time_tToSystemTime(time_t dosTime, SYSTEMTIME *systemTime)
 char* ctime_r(const time_t *t, char *buf)
 {
     SYSTEMTIME systime;
-    const char * const dayOfWeek[] = { "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" };
+    const char * const dayOfWeek[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" };
     const char * const monthOfYear[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
 
     Time_tToSystemTime(*t, &systime);
     /* We don't know how long `buf` is, but assume it's rounded up from the minimum of 25 to 32 */
-    StringCchPrintfA(buf, 32, "%s %s %d %02d:%02d:%02d %04d", dayOfWeek[systime.wDayOfWeek - 1], monthOfYear[systime.wMonth - 1],
+    StringCchPrintfA(buf, 31, "%s %s %d %02d:%02d:%02d %04d", dayOfWeek[systime.wDayOfWeek % 7], monthOfYear[(systime.wMonth - 1) % 12],
                                                                                 systime.wDay, systime.wHour, systime.wMinute, systime.wSecond, systime.wYear);
     return buf;
 }
@@ -888,7 +888,7 @@ struct dirent *readdir(DIR *dirp)
 
        if (dirp->find_handle == INVALID_HANDLE_VALUE) {
                char search_pattern[MAX_PATH];
-               StringCchPrintfA(search_pattern, MAX_PATH, "%s\\*", dirp->dirname);
+               StringCchPrintfA(search_pattern, MAX_PATH-1, "%s\\*", dirp->dirname);
                dirp->find_handle = FindFirstFileA(search_pattern, &find_data);
                if (dirp->find_handle == INVALID_HANDLE_VALUE)
                        return NULL;
index 11d879a..8ec7741 100644 (file)
@@ -26,14 +26,14 @@ static struct getopt_private_state {
 } pvt;
 
 static inline const char *option_matches(const char *arg_str,
-                                        const char *opt_name)
+                                        const char *opt_name, int smatch)
 {
        while (*arg_str != '\0' && *arg_str != '=') {
                if (*arg_str++ != *opt_name++)
                        return NULL;
        }
 
-       if (*opt_name)
+       if (*opt_name && !smatch)
                return NULL;
 
        return arg_str;
@@ -84,11 +84,37 @@ int getopt_long_only(int argc, char *const *argv, const char *optstring,
                }
 
                for (lo = longopts; lo->name; lo++) {
-                       if ((opt_end = option_matches(carg+2, lo->name)))
+                       opt_end = option_matches(carg+2, lo->name, 0);
+                       if (opt_end)
                            break;
                }
-               if (!opt_end)
-                       return '?';
+               /*
+                * The GNU getopt_long_only() apparently allows a short match,
+                * if it's unique and if we don't have a full match. Let's
+                * do the same here, search and see if there is one (and only
+                * one) short match.
+                */
+               if (!opt_end) {
+                       const struct option *lo_match = NULL;
+
+                       for (lo = longopts; lo->name; lo++) {
+                               const char *ret;
+
+                               ret = option_matches(carg+2, lo->name, 1);
+                               if (!ret)
+                                       continue;
+                               if (!opt_end) {
+                                       opt_end = ret;
+                                       lo_match = lo;
+                               } else {
+                                       opt_end = NULL;
+                                       break;
+                               }
+                       }
+                       if (!opt_end)
+                               return '?';
+                       lo = lo_match;
+               }
 
                if (longindex)
                        *longindex = lo-longopts;
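
The short-match fallback above mirrors the GNU behaviour: a prefix is accepted only
if exactly one long option starts with it, and only after an exact match has already
been ruled out. A minimal standalone illustration (made-up option names):

#include <stdio.h>
#include <string.h>

/*
 * Return the single option name that 'arg' is a prefix of, or NULL if there is
 * no match or the prefix is ambiguous.
 */
static const char *match_prefix(const char *arg, const char **names, int n)
{
        const char *found = NULL;
        int i;

        for (i = 0; i < n; i++) {
                if (strncmp(arg, names[i], strlen(arg)))
                        continue;
                if (found)
                        return NULL;    /* second hit: ambiguous */
                found = names[i];
        }
        return found;
}

int main(void)
{
        const char *names[] = { "output", "output-format", "version" };
        const char *m;

        m = match_prefix("vers", names, 3);
        printf("vers -> %s\n", m ? m : "no unique match");     /* version */

        m = match_prefix("out", names, 3);
        printf("out  -> %s\n", m ? m : "no unique match");     /* ambiguous */

        return 0;
}
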
index 3625de5..b5fd3f3 100644 (file)
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+// Needed for uint8_t, uint64_t
+#include <stdint.h>
+
 /* Maximum MTD device name length */
 #define MTD_NAME_MAX 127
 /* Maximum MTD device type string length */
index a123323..9768066 100644 (file)
@@ -30,6 +30,7 @@
 #include <errno.h>
 #include <features.h>
 #include <inttypes.h>
+#include <sys/sysmacros.h>
 
 #ifndef PROGRAM_NAME
 # error "You must define PROGRAM_NAME before including this header"
index eccd9d3..d36c511 100644 (file)
--- a/server.c
+++ b/server.c
@@ -262,10 +262,17 @@ static int fio_send_data(int sk, const void *p, unsigned int len)
        return fio_sendv_data(sk, &iov, 1);
 }
 
-static int fio_recv_data(int sk, void *p, unsigned int len)
+static int fio_recv_data(int sk, void *p, unsigned int len, bool wait)
 {
+       int flags;
+
+       if (wait)
+               flags = MSG_WAITALL;
+       else
+               flags = OS_MSG_DONTWAIT;
+
        do {
-               int ret = recv(sk, p, len, MSG_WAITALL);
+               int ret = recv(sk, p, len, flags);
 
                if (ret > 0) {
                        len -= ret;
@@ -275,9 +282,11 @@ static int fio_recv_data(int sk, void *p, unsigned int len)
                        continue;
                } else if (!ret)
                        break;
-               else if (errno == EAGAIN || errno == EINTR)
-                       continue;
-               else
+               else if (errno == EAGAIN || errno == EINTR) {
+                       if (wait)
+                               continue;
+                       break;
+               } else
                        break;
        } while (!exit_backend);
 
@@ -326,7 +335,7 @@ static int verify_convert_cmd(struct fio_net_cmd *cmd)
 /*
  * Read (and defragment, if necessary) incoming commands
  */
-struct fio_net_cmd *fio_net_recv_cmd(int sk)
+struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait)
 {
        struct fio_net_cmd cmd, *tmp, *cmdret = NULL;
        size_t cmd_size = 0, pdu_offset = 0;
@@ -335,7 +344,7 @@ struct fio_net_cmd *fio_net_recv_cmd(int sk)
        void *pdu = NULL;
 
        do {
-               ret = fio_recv_data(sk, &cmd, sizeof(cmd));
+               ret = fio_recv_data(sk, &cmd, sizeof(cmd), wait);
                if (ret)
                        break;
 
@@ -379,7 +388,7 @@ struct fio_net_cmd *fio_net_recv_cmd(int sk)
 
                /* There's payload, get it */
                pdu = (void *) cmdret->payload + pdu_offset;
-               ret = fio_recv_data(sk, pdu, cmd.pdu_len);
+               ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait);
                if (ret)
                        break;
 
@@ -962,9 +971,9 @@ static int handle_trigger_cmd(struct fio_net_cmd *cmd)
                struct all_io_list state;
 
                state.threads = cpu_to_le64((uint64_t) 0);
-               fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, SK_F_COPY);
+               fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, SK_F_COPY | SK_F_INLINE);
        } else
-               fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE);
+               fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE);
 
        exec_trigger(buf);
        return 0;
@@ -1053,17 +1062,35 @@ static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf,
 {
        struct fio_net_cmd cmd;
        struct iovec iov[2];
+       size_t this_len;
+       int ret;
 
        iov[0].iov_base = (void *) &cmd;
        iov[0].iov_len = sizeof(cmd);
-       iov[1].iov_base = (void *) buf;
-       iov[1].iov_len = size;
 
-       __fio_init_net_cmd(&cmd, opcode, size, tag);
-       cmd.flags = __cpu_to_le32(flags);
-       fio_net_cmd_crc_pdu(&cmd, buf);
+       do {
+               uint32_t this_flags = flags;
+
+               this_len = size;
+               if (this_len > FIO_SERVER_MAX_FRAGMENT_PDU)
+                       this_len = FIO_SERVER_MAX_FRAGMENT_PDU;
+
+               if (this_len < size)
+                       this_flags |= FIO_NET_CMD_F_MORE;
 
-       return fio_sendv_data(sk, iov, 2);
+               __fio_init_net_cmd(&cmd, opcode, this_len, tag);
+               cmd.flags = __cpu_to_le32(this_flags);
+               fio_net_cmd_crc_pdu(&cmd, buf);
+
+               iov[1].iov_base = (void *) buf;
+               iov[1].iov_len = this_len;
+
+               ret = fio_sendv_data(sk, iov, 2);
+               size -= this_len;
+               buf += this_len;
+       } while (!ret && size);
+
+       return ret;
 }
 
 static void finish_entry(struct sk_entry *entry)
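
A rough standalone sketch of the fragmentation loop introduced in
fio_send_cmd_ext_pdu() above, with the actual socket write stubbed out; the size
and flag values below are placeholders standing in for FIO_SERVER_MAX_FRAGMENT_PDU
and FIO_NET_CMD_F_MORE:

#include <stdio.h>
#include <string.h>

#define MAX_FRAGMENT    1024            /* placeholder fragment size */
#define FLAG_MORE       0x01            /* placeholder "more fragments follow" flag */

/* stub: a real sender would write a command header plus this payload chunk */
static int send_fragment(const void *buf, size_t len, unsigned int flags)
{
        printf("fragment: %4zu bytes%s\n", len,
               (flags & FLAG_MORE) ? " (more follows)" : "");
        return 0;
}

static int send_pdu(const void *buf, size_t size)
{
        int ret;

        do {
                size_t this_len = size;
                unsigned int flags = 0;

                if (this_len > MAX_FRAGMENT)
                        this_len = MAX_FRAGMENT;
                if (this_len < size)
                        flags |= FLAG_MORE;     /* receiver keeps defragmenting */

                ret = send_fragment(buf, this_len, flags);
                size -= this_len;
                buf = (const char *) buf + this_len;
        } while (!ret && size);

        return ret;
}

int main(void)
{
        char payload[2500];

        memset(payload, 0, sizeof(payload));
        return send_pdu(payload, sizeof(payload));      /* 1024 + 1024 + 452 */
}
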
@@ -1209,7 +1236,7 @@ static int handle_connection(struct sk_out *sk_out)
                if (ret < 0)
                        break;
 
-               cmd = fio_net_recv_cmd(sk_out->sk);
+               cmd = fio_net_recv_cmd(sk_out->sk, true);
                if (!cmd) {
                        ret = -1;
                        break;
@@ -1625,58 +1652,79 @@ void fio_server_send_du(void)
        }
 }
 
-static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
-{
-       int ret = 0;
 #ifdef CONFIG_ZLIB
+static int __fio_append_iolog_gz(struct sk_entry *first, struct io_log *log,
+                                struct io_logs *cur_log, z_stream *stream)
+{
        struct sk_entry *entry;
-       z_stream stream;
        void *out_pdu;
+       int ret;
 
-       /*
-        * Dirty - since the log is potentially huge, compress it into
-        * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving
-        * side defragment it.
-        */
-       out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
-
-       stream.zalloc = Z_NULL;
-       stream.zfree = Z_NULL;
-       stream.opaque = Z_NULL;
-
-       if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
-               ret = 1;
-               goto err;
-       }
-
-       stream.next_in = (void *) log->log;
-       stream.avail_in = log->nr_samples * log_entry_sz(log);
+       stream->next_in = (void *) cur_log->log;
+       stream->avail_in = cur_log->nr_samples * log_entry_sz(log);
 
        do {
                unsigned int this_len;
 
-               stream.avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
-               stream.next_out = out_pdu;
-               ret = deflate(&stream, Z_FINISH);
+               /*
+                * Dirty - since the log is potentially huge, compress it into
+                * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving
+                * side defragment it.
+                */
+               out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+
+               stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+               stream->next_out = out_pdu;
+               ret = deflate(stream, Z_FINISH);
                /* may be Z_OK, or Z_STREAM_END */
-               if (ret < 0)
-                       goto err_zlib;
+               if (ret < 0) {
+                       free(out_pdu);
+                       return 1;
+               }
 
-               this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream.avail_out;
+               this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out;
 
                entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
-                                               NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
-               out_pdu = NULL;
+                                        NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
                flist_add_tail(&entry->list, &first->next);
-       } while (stream.avail_in);
+       } while (stream->avail_in);
+
+       return 0;
+}
+
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
+{
+       int ret = 0;
+       z_stream stream;
+
+       memset(&stream, 0, sizeof(stream));
+       stream.zalloc = Z_NULL;
+       stream.zfree = Z_NULL;
+       stream.opaque = Z_NULL;
+
+       if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+               return 1;
+
+       while (!flist_empty(&log->io_logs)) {
+               struct io_logs *cur_log;
+
+               cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+               flist_del_init(&cur_log->list);
+
+               ret = __fio_append_iolog_gz(first, log, cur_log, &stream);
+               if (ret)
+                       break;
+       }
 
-err_zlib:
        deflateEnd(&stream);
-err:
-       free(out_pdu);
-#endif
        return ret;
 }
+#else
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
+{
+       return 1;
+}
+#endif
 
 static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log)
 {
@@ -1700,11 +1748,21 @@ static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log)
 static int fio_append_text_log(struct sk_entry *first, struct io_log *log)
 {
        struct sk_entry *entry;
-       size_t size = log->nr_samples * log_entry_sz(log);
 
-       entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, log->log, size,
-                                       NULL, SK_F_VEC | SK_F_INLINE);
-       flist_add_tail(&entry->list, &first->next);
+       while (!flist_empty(&log->io_logs)) {
+               struct io_logs *cur_log;
+               size_t size;
+
+               cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+               flist_del_init(&cur_log->list);
+
+               size = cur_log->nr_samples * log_entry_sz(log);
+
+               entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, cur_log->log, size,
+                                               NULL, SK_F_VEC | SK_F_INLINE);
+               flist_add_tail(&entry->list, &first->next);
+       }
+
        return 0;
 }
 
@@ -1712,9 +1770,10 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 {
        struct cmd_iolog_pdu pdu;
        struct sk_entry *first;
-       int i, ret = 0;
+       struct flist_head *entry;
+       int ret = 0;
 
-       pdu.nr_samples = cpu_to_le64(log->nr_samples);
+       pdu.nr_samples = cpu_to_le64(iolog_nr_samples(log));
        pdu.thread_number = cpu_to_le32(td->thread_number);
        pdu.log_type = cpu_to_le32(log->log_type);
 
@@ -1732,18 +1791,25 @@ int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
         * We can't do this for a pre-compressed log, but for that case,
         * log->nr_samples is zero anyway.
         */
-       for (i = 0; i < log->nr_samples; i++) {
-               struct io_sample *s = get_sample(log, i);
+       flist_for_each(entry, &log->io_logs) {
+               struct io_logs *cur_log;
+               int i;
+
+               cur_log = flist_entry(entry, struct io_logs, list);
+
+               for (i = 0; i < cur_log->nr_samples; i++) {
+                       struct io_sample *s = get_sample(log, cur_log, i);
 
-               s->time         = cpu_to_le64(s->time);
-               s->val          = cpu_to_le64(s->val);
-               s->__ddir       = cpu_to_le32(s->__ddir);
-               s->bs           = cpu_to_le32(s->bs);
+                       s->time         = cpu_to_le64(s->time);
+                       s->val          = cpu_to_le64(s->val);
+                       s->__ddir       = cpu_to_le32(s->__ddir);
+                       s->bs           = cpu_to_le32(s->bs);
 
-               if (log->log_offset) {
-                       struct io_sample_offset *so = (void *) s;
+                       if (log->log_offset) {
+                               struct io_sample_offset *so = (void *) s;
 
-                       so->offset = cpu_to_le64(so->offset);
+                               so->offset = cpu_to_le64(so->offset);
+                       }
                }
        }
 
@@ -1792,7 +1858,7 @@ void fio_server_send_start(struct thread_data *td)
 }
 
 int fio_server_get_verify_state(const char *name, int threadnumber,
-                               void **datap, int *version)
+                               void **datap)
 {
        struct thread_io_list *s;
        struct cmd_sendfile out;
@@ -1844,7 +1910,7 @@ fail:
         * the header, and the thread_io_list checksum
         */
        s = rep->data + sizeof(struct verify_state_hdr);
-       if (verify_state_hdr(rep->data, s, version)) {
+       if (verify_state_hdr(rep->data, s)) {
                ret = EILSEQ;
                goto fail;
        }
@@ -1889,11 +1955,10 @@ static int fio_init_server_ip(void)
                return -1;
        }
 #ifdef SO_REUSEPORT
-       if (setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)) < 0) {
-               log_err("fio: setsockopt(REUSEPORT): %s\n", strerror(errno));
-               close(sk);
-               return -1;
-       }
+       /*
+        * Not fatal if it fails, so just ignore it if that happens
+        */
+       setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
 #endif
 
        if (use_ipv6) {
@@ -2186,16 +2251,25 @@ static void set_sig_handlers(void)
        sigaction(SIGINT, &act, NULL);
 }
 
-static int fio_server(void)
+void fio_server_destroy_sk_key(void)
 {
-       int sk, ret;
+       pthread_key_delete(sk_out_key);
+}
 
+int fio_server_create_sk_key(void)
+{
        if (pthread_key_create(&sk_out_key, NULL)) {
                log_err("fio: can't create sk_out backend key\n");
-               return -1;
+               return 1;
        }
 
        pthread_setspecific(sk_out_key, NULL);
+       return 0;
+}
+
+static int fio_server(void)
+{
+       int sk, ret;
 
        dprint(FD_NET, "starting server\n");
 
index 9205ae6..7fc3ec6 100644 (file)
--- a/server.h
+++ b/server.h
@@ -38,7 +38,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-       FIO_SERVER_VER                  = 51,
+       FIO_SERVER_VER                  = 53,
 
        FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
        FIO_SERVER_MAX_CMD_MB           = 2048,
@@ -211,15 +211,18 @@ extern void fio_server_send_ts(struct thread_stat *, struct group_run_stats *);
 extern void fio_server_send_gs(struct group_run_stats *);
 extern void fio_server_send_du(void);
 extern void fio_server_send_job_options(struct flist_head *, unsigned int);
-extern int fio_server_get_verify_state(const char *, int, void **, int *);
+extern int fio_server_get_verify_state(const char *, int, void **);
 
-extern struct fio_net_cmd *fio_net_recv_cmd(int sk);
+extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait);
 
 extern int fio_send_iolog(struct thread_data *, struct io_log *, const char *);
 extern void fio_server_send_add_job(struct thread_data *);
 extern void fio_server_send_start(struct thread_data *);
 extern int fio_net_send_quit(int sk);
 
+extern int fio_server_create_sk_key(void);
+extern void fio_server_destroy_sk_key(void);
+
 extern int exit_backend;
 extern int fio_net_port;
 
diff --git a/stat.c b/stat.c
index 3070cef..f55cb2f 100644 (file)
--- a/stat.c
+++ b/stat.c
@@ -15,6 +15,7 @@
 #include "idletime.h"
 #include "lib/pow2.h"
 #include "lib/output_buffer.h"
+#include "helper_thread.h"
 
 struct fio_mutex *stat_mutex;
 
@@ -1580,6 +1581,8 @@ void __show_run_stats(void)
                unsigned long long bw;
 
                ts = &threadstats[i];
+               if (ts->groupid == -1)
+                       continue;
                rs = &runstats[ts->groupid];
                rs->kb_base = ts->kb_base;
                rs->unit_base = ts->unit_base;
@@ -1846,58 +1849,175 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long data)
        is->samples++;
 }
 
+/*
+ * Return a struct io_logs, which is added to the tail of the log
+ * list for 'iolog'.
+ */
+static struct io_logs *get_new_log(struct io_log *iolog)
+{
+       size_t new_size, new_samples;
+       struct io_logs *cur_log;
+
+       /*
+        * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling
+        * forever
+        */
+       if (!iolog->cur_log_max)
+               new_samples = DEF_LOG_ENTRIES;
+       else {
+               new_samples = iolog->cur_log_max * 2;
+               if (new_samples > MAX_LOG_ENTRIES)
+                       new_samples = MAX_LOG_ENTRIES;
+       }
+
+       new_size = new_samples * log_entry_sz(iolog);
+
+       cur_log = malloc(sizeof(*cur_log));
+       if (cur_log) {
+               INIT_FLIST_HEAD(&cur_log->list);
+               cur_log->log = malloc(new_size);
+               if (cur_log->log) {
+                       cur_log->nr_samples = 0;
+                       cur_log->max_samples = new_samples;
+                       flist_add_tail(&cur_log->list, &iolog->io_logs);
+                       iolog->cur_log_max = new_samples;
+                       return cur_log;
+               }
+               free(cur_log);
+       }
+
+       return NULL;
+}
+
+/*
+ * Add and return a new log chunk, or return current log if big enough
+ */
+static struct io_logs *regrow_log(struct io_log *iolog)
+{
+       struct io_logs *cur_log;
+       int i;
+
+       if (!iolog || iolog->disabled)
+               return NULL;
+
+       cur_log = iolog_cur_log(iolog);
+       if (cur_log->nr_samples < cur_log->max_samples)
+               return cur_log;
+
+       /*
+        * No room for a new sample. If we're compressing on the fly, flush
+        * out the current chunk
+        */
+       if (iolog->log_gz) {
+               if (iolog_cur_flush(iolog, cur_log)) {
+                       log_err("fio: failed flushing iolog! Will stop logging.\n");
+                       return NULL;
+               }
+       }
+
+       /*
+        * Get a new log array, and add to our list
+        */
+       cur_log = get_new_log(iolog);
+       if (!cur_log) {
+               log_err("fio: failed extending iolog! Will stop logging.\n");
+               return NULL;
+       }
+
+       if (!iolog->pending || !iolog->pending->nr_samples)
+               return cur_log;
+
+       /*
+        * Flush pending items to new log
+        */
+       for (i = 0; i < iolog->pending->nr_samples; i++) {
+               struct io_sample *src, *dst;
+
+               src = get_sample(iolog, iolog->pending, i);
+               dst = get_sample(iolog, cur_log, i);
+               memcpy(dst, src, log_entry_sz(iolog));
+       }
+
+       iolog->pending->nr_samples = 0;
+       return cur_log;
+}
+
+void regrow_logs(struct thread_data *td)
+{
+       if (!regrow_log(td->slat_log))
+               td->slat_log->disabled = true;
+       if (!regrow_log(td->clat_log))
+               td->clat_log->disabled = true;
+       if (!regrow_log(td->lat_log))
+               td->lat_log->disabled = true;
+       if (!regrow_log(td->bw_log))
+               td->bw_log->disabled = true;
+       if (!regrow_log(td->iops_log))
+               td->iops_log->disabled = true;
+
+       td->flags &= ~TD_F_REGROW_LOGS;
+}
+
+static struct io_logs *get_cur_log(struct io_log *iolog)
+{
+       struct io_logs *cur_log;
+
+       cur_log = iolog_cur_log(iolog);
+       if (!cur_log) {
+               cur_log = get_new_log(iolog);
+               if (!cur_log)
+                       return NULL;
+       }
+
+       if (cur_log->nr_samples < cur_log->max_samples)
+               return cur_log;
+
+       /*
+        * Out of space. If we're in IO offload mode, add a new log chunk
+        * inline. If we're doing inline submissions, flag 'td' as needing
+        * a log regrow and we'll take care of it on the submission side.
+        */
+       if (iolog->td->o.io_submit_mode == IO_MODE_OFFLOAD)
+               return regrow_log(iolog);
+
+       iolog->td->flags |= TD_F_REGROW_LOGS;
+       assert(iolog->pending->nr_samples < iolog->pending->max_samples);
+       return iolog->pending;
+}
+
 static void __add_log_sample(struct io_log *iolog, unsigned long val,
                             enum fio_ddir ddir, unsigned int bs,
                             unsigned long t, uint64_t offset)
 {
-       uint64_t nr_samples = iolog->nr_samples;
-       struct io_sample *s;
+       struct io_logs *cur_log;
 
        if (iolog->disabled)
                return;
-
-       if (!iolog->nr_samples)
+       if (flist_empty(&iolog->io_logs))
                iolog->avg_last = t;
 
-       if (iolog->nr_samples == iolog->max_samples) {
-               size_t new_size;
-               void *new_log;
+       cur_log = get_cur_log(iolog);
+       if (cur_log) {
+               struct io_sample *s;
 
-               new_size = 2 * iolog->max_samples * log_entry_sz(iolog);
+               s = get_sample(iolog, cur_log, cur_log->nr_samples);
 
-               if (iolog->log_gz && (new_size > iolog->log_gz)) {
-                       if (iolog_flush(iolog, 0)) {
-                               log_err("fio: failed flushing iolog! Will stop logging.\n");
-                               iolog->disabled = 1;
-                               return;
-                       }
-                       nr_samples = iolog->nr_samples;
-               } else {
-                       new_log = realloc(iolog->log, new_size);
-                       if (!new_log) {
-                               log_err("fio: failed extending iolog! Will stop logging.\n");
-                               iolog->disabled = 1;
-                               return;
-                       }
-                       iolog->log = new_log;
-                       iolog->max_samples <<= 1;
-               }
-       }
-
-       s = get_sample(iolog, nr_samples);
+               s->val = val;
+               s->time = t;
+               io_sample_set_ddir(iolog, s, ddir);
+               s->bs = bs;
 
-       s->val = val;
-       s->time = t;
-       io_sample_set_ddir(iolog, s, ddir);
-       s->bs = bs;
+               if (iolog->log_offset) {
+                       struct io_sample_offset *so = (void *) s;
 
-       if (iolog->log_offset) {
-               struct io_sample_offset *so = (void *) s;
+                       so->offset = offset;
+               }
 
-               so->offset = offset;
+               cur_log->nr_samples++;
+               return;
        }
 
-       iolog->nr_samples++;
+       iolog->disabled = true;
 }
 
 static inline void reset_io_stat(struct io_stat *ios)
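
The log storage change above replaces one realloc()'d array with a list of chunks:
each new chunk doubles the size of the previous one until a cap is hit, so growth
stays cheap and no huge contiguous allocation is ever needed. A simplified
standalone model (the chunk sizes are placeholders, not the real
DEF_LOG_ENTRIES/MAX_LOG_ENTRIES values):

#include <stdio.h>
#include <stdlib.h>

#define DEF_ENTRIES     1024UL          /* placeholder initial chunk size */
#define MAX_ENTRIES     (64 * 1024UL)   /* placeholder cap */

struct chunk {
        struct chunk *next;
        unsigned long max_samples;
        unsigned long nr_samples;
};

struct log {
        struct chunk *head, *tail;
        unsigned long cur_max;          /* size of the most recently added chunk */
};

/* append a new chunk, doubling the previous size up to the cap */
static struct chunk *grow(struct log *log)
{
        unsigned long samples = log->cur_max ? log->cur_max * 2 : DEF_ENTRIES;
        struct chunk *c;

        if (samples > MAX_ENTRIES)
                samples = MAX_ENTRIES;

        c = calloc(1, sizeof(*c));
        if (!c)
                return NULL;

        c->max_samples = samples;
        if (log->tail)
                log->tail->next = c;
        else
                log->head = c;
        log->tail = c;
        log->cur_max = samples;
        return c;
}

int main(void)
{
        struct log log = { NULL, NULL, 0 };
        int i;

        /* chunk capacities grow 1024, 2048, 4096, ... then stay at the cap */
        for (i = 0; i < 10; i++) {
                struct chunk *c = grow(&log);

                if (!c)
                        break;
                printf("chunk %d: %lu samples\n", i, c->max_samples);
        }

        return 0;
}
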
@@ -1942,35 +2062,35 @@ void reset_io_stats(struct thread_data *td)
        }
 }
 
-static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed)
+static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
+                             unsigned long elapsed, bool log_max)
 {
        /*
         * Note an entry in the log. Use the mean from the logged samples,
         * making sure to properly round up. Only write a log entry if we
         * had actual samples done.
         */
-       if (iolog->avg_window[DDIR_READ].samples) {
-               unsigned long mr;
+       if (iolog->avg_window[ddir].samples) {
+               unsigned long val;
 
-               mr = iolog->avg_window[DDIR_READ].mean.u.f + 0.50;
-               __add_log_sample(iolog, mr, DDIR_READ, 0, elapsed, 0);
-       }
-       if (iolog->avg_window[DDIR_WRITE].samples) {
-               unsigned long mw;
+               if (log_max)
+                       val = iolog->avg_window[ddir].max_val;
+               else
+                       val = iolog->avg_window[ddir].mean.u.f + 0.50;
 
-               mw = iolog->avg_window[DDIR_WRITE].mean.u.f + 0.50;
-               __add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed, 0);
+               __add_log_sample(iolog, val, ddir, 0, elapsed, 0);
        }
-       if (iolog->avg_window[DDIR_TRIM].samples) {
-               unsigned long mw;
 
-               mw = iolog->avg_window[DDIR_TRIM].mean.u.f + 0.50;
-               __add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed, 0);
-       }
+       reset_io_stat(&iolog->avg_window[ddir]);
+}
 
-       reset_io_stat(&iolog->avg_window[DDIR_READ]);
-       reset_io_stat(&iolog->avg_window[DDIR_WRITE]);
-       reset_io_stat(&iolog->avg_window[DDIR_TRIM]);
+static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
+                            bool log_max)
+{
+       int ddir;
+
+       for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
+               __add_stat_to_log(iolog, ddir, elapsed, log_max);
 }
 
 static void add_log_sample(struct thread_data *td, struct io_log *iolog,
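
What the new log_max_value option (stored as o.log_max) changes is visible in
__add_stat_to_log() above: per averaging window, either the mean of the collected
samples or the largest sample becomes the log entry. A rough model of that choice,
with made-up sample values:

#include <stdio.h>

struct window {
        unsigned long sum, max, samples;
};

static void add_sample(struct window *w, unsigned long val)
{
        w->sum += val;
        if (val > w->max)
                w->max = val;
        w->samples++;
}

/* emit one log entry for the window, then reset it */
static unsigned long flush_window(struct window *w, int log_max)
{
        unsigned long out;

        if (!w->samples)
                return 0;

        out = log_max ? w->max : (w->sum + w->samples / 2) / w->samples;
        w->sum = w->max = w->samples = 0;
        return out;
}

int main(void)
{
        unsigned long vals[] = { 120, 80, 900, 100 };
        struct window w = { 0, 0, 0 };
        int i;

        for (i = 0; i < 4; i++)
                add_sample(&w, vals[i]);
        printf("averaged entry: %lu\n", flush_window(&w, 0));  /* 300 */

        for (i = 0; i < 4; i++)
                add_sample(&w, vals[i]);
        printf("max entry     : %lu\n", flush_window(&w, 1));  /* 900 */

        return 0;
}
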
@@ -2006,27 +2126,27 @@ static void add_log_sample(struct thread_data *td, struct io_log *iolog,
        if (this_window < iolog->avg_msec)
                return;
 
-       _add_stat_to_log(iolog, elapsed);
+       _add_stat_to_log(iolog, elapsed, td->o.log_max != 0);
 
        iolog->avg_last = elapsed;
 }
 
-void finalize_logs(struct thread_data *td)
+void finalize_logs(struct thread_data *td, bool unit_logs)
 {
        unsigned long elapsed;
 
        elapsed = mtime_since_now(&td->epoch);
 
-       if (td->clat_log)
-               _add_stat_to_log(td->clat_log, elapsed);
-       if (td->slat_log)
-               _add_stat_to_log(td->slat_log, elapsed);
-       if (td->lat_log)
-               _add_stat_to_log(td->lat_log, elapsed);
-       if (td->bw_log)
-               _add_stat_to_log(td->bw_log, elapsed);
-       if (td->iops_log)
-               _add_stat_to_log(td->iops_log, elapsed);
+       if (td->clat_log && unit_logs)
+               _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0);
+       if (td->slat_log && unit_logs)
+               _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0);
+       if (td->lat_log && unit_logs)
+               _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0);
+       if (td->bw_log && (unit_logs == per_unit_log(td->bw_log)))
+               _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0);
+       if (td->iops_log && (unit_logs == per_unit_log(td->iops_log)))
+               _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0);
 }
 
 void add_agg_sample(unsigned long val, enum fio_ddir ddir, unsigned int bs)
@@ -2054,9 +2174,6 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 {
        struct thread_stat *ts = &td->ts;
 
-       if (!ddir_rw(ddir))
-               return;
-
        td_io_u_lock(td);
 
        add_stat_sample(&ts->clat_stat[ddir], usec);
@@ -2106,18 +2223,41 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
        td_io_u_unlock(td);
 }
 
-void add_bw_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
-                  struct timeval *t)
+void add_bw_sample(struct thread_data *td, struct io_u *io_u,
+                  unsigned int bytes, unsigned long spent)
+{
+       struct thread_stat *ts = &td->ts;
+       unsigned long rate;
+
+       if (spent)
+               rate = bytes * 1000 / spent;
+       else
+               rate = 0;
+
+       td_io_u_lock(td);
+
+       add_stat_sample(&ts->bw_stat[io_u->ddir], rate);
+
+       if (td->bw_log)
+               add_log_sample(td, td->bw_log, rate, io_u->ddir, bytes, io_u->offset);
+
+       td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir];
+       td_io_u_unlock(td);
+}
+
+static int add_bw_samples(struct thread_data *td, struct timeval *t)
 {
        struct thread_stat *ts = &td->ts;
        unsigned long spent, rate;
+       enum fio_ddir ddir;
 
-       if (!ddir_rw(ddir))
-               return;
+       if (per_unit_log(td->bw_log))
+               return 0;
 
        spent = mtime_since(&td->bw_sample_time, t);
-       if (spent < td->o.bw_avg_time)
-               return;
+       if (spent < td->o.bw_avg_time &&
+           td->o.bw_avg_time - spent >= 10)
+               return td->o.bw_avg_time - spent;
 
        td_io_u_lock(td);
 
@@ -2138,28 +2278,57 @@ void add_bw_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
 
                add_stat_sample(&ts->bw_stat[ddir], rate);
 
-               if (td->bw_log)
+               if (td->bw_log) {
+                       unsigned int bs = 0;
+
+                       if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
+                               bs = td->o.min_bs[ddir];
+
                        add_log_sample(td, td->bw_log, rate, ddir, bs, 0);
+               }
 
                td->stat_io_bytes[ddir] = td->this_io_bytes[ddir];
        }
 
-       fio_gettime(&td->bw_sample_time, NULL);
+       timeval_add_msec(&td->bw_sample_time, td->o.bw_avg_time);
+
+       td_io_u_unlock(td);
+
+       if (spent <= td->o.bw_avg_time)
+               return td->o.bw_avg_time;
+
+       return td->o.bw_avg_time - (1 + spent - td->o.bw_avg_time);
+}
+
+void add_iops_sample(struct thread_data *td, struct io_u *io_u,
+                    unsigned int bytes)
+{
+       struct thread_stat *ts = &td->ts;
+
+       td_io_u_lock(td);
+
+       add_stat_sample(&ts->iops_stat[io_u->ddir], 1);
+
+       if (td->iops_log)
+               add_log_sample(td, td->iops_log, 1, io_u->ddir, bytes, io_u->offset);
+
+       td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir];
        td_io_u_unlock(td);
 }
 
-void add_iops_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
-                    struct timeval *t)
+static int add_iops_samples(struct thread_data *td, struct timeval *t)
 {
        struct thread_stat *ts = &td->ts;
        unsigned long spent, iops;
+       enum fio_ddir ddir;
 
-       if (!ddir_rw(ddir))
-               return;
+       if (per_unit_log(td->iops_log))
+               return 0;
 
        spent = mtime_since(&td->iops_sample_time, t);
-       if (spent < td->o.iops_avg_time)
-               return;
+       if (spent < td->o.iops_avg_time &&
+           td->o.iops_avg_time - spent >= 10)
+               return td->o.iops_avg_time - spent;
 
        td_io_u_lock(td);
 
@@ -2180,14 +2349,59 @@ void add_iops_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs
 
                add_stat_sample(&ts->iops_stat[ddir], iops);
 
-               if (td->iops_log)
+               if (td->iops_log) {
+                       unsigned int bs = 0;
+
+                       if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
+                               bs = td->o.min_bs[ddir];
+
                        add_log_sample(td, td->iops_log, iops, ddir, bs, 0);
+               }
 
                td->stat_io_blocks[ddir] = td->this_io_blocks[ddir];
        }
 
-       fio_gettime(&td->iops_sample_time, NULL);
+       timeval_add_msec(&td->iops_sample_time, td->o.iops_avg_time);
+
        td_io_u_unlock(td);
+
+       if (spent <= td->o.iops_avg_time)
+               return td->o.iops_avg_time;
+
+       return td->o.iops_avg_time - (1 + spent - td->o.iops_avg_time);
+}
+
+/*
+ * Returns msecs to next event
+ */
+int calc_log_samples(void)
+{
+       struct thread_data *td;
+       unsigned int next = ~0U, tmp;
+       struct timeval now;
+       int i;
+
+       fio_gettime(&now, NULL);
+
+       for_each_td(td, i) {
+               if (!ramp_time_over(td) ||
+                   !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) {
+                       next = min(td->o.iops_avg_time, td->o.bw_avg_time);
+                       continue;
+               }
+               if (!per_unit_log(td->bw_log)) {
+                       tmp = add_bw_samples(td, &now);
+                       if (tmp < next)
+                               next = tmp;
+               }
+               if (!per_unit_log(td->iops_log)) {
+                       tmp = add_iops_samples(td, &now);
+                       if (tmp < next)
+                               next = tmp;
+               }
+       }
+
+       return next == ~0U ? 0 : next;
 }
 
 void stat_init(void)
@@ -2210,8 +2424,7 @@ void stat_exit(void)
  */
 void show_running_run_stats(void)
 {
-       helper_do_stat = 1;
-       pthread_cond_signal(&helper_cond);
+       helper_do_stat();
 }
 
 uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u)
diff --git a/stat.h b/stat.h
index 9c3f192..86f1a0b 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -276,11 +276,12 @@ extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
                                unsigned int, uint64_t);
 extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
                                unsigned int, uint64_t);
-extern void add_bw_sample(struct thread_data *, enum fio_ddir, unsigned int,
-                               struct timeval *);
-extern void add_iops_sample(struct thread_data *, enum fio_ddir, unsigned int,
-                               struct timeval *);
 extern void add_agg_sample(unsigned long, enum fio_ddir, unsigned int);
+extern void add_iops_sample(struct thread_data *, struct io_u *,
+                               unsigned int);
+extern void add_bw_sample(struct thread_data *, struct io_u *,
+                               unsigned int, unsigned long);
+extern int calc_log_samples(void);
 
 extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 extern int write_bw_log;
diff --git a/t/arch.c b/t/arch.c
new file mode 100644 (file)
index 0000000..befb7c7
--- /dev/null
+++ b/t/arch.c
@@ -0,0 +1,5 @@
+#include "../arch/arch.h"
+
+unsigned long arch_flags = 0;
+int tsc_reliable;
+int arch_random;
diff --git a/t/dedupe.c b/t/dedupe.c
index 3a66820..7856da1 100644 (file)
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -537,6 +537,7 @@ int main(int argc, char *argv[])
        uint64_t nextents = 0, nchunks = 0;
        int c, ret;
 
+       arch_init(argv);
        debug_init();
 
        while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
diff --git a/t/gen-rand.c b/t/gen-rand.c
new file mode 100644 (file)
index 0000000..6c31f92
--- /dev/null
+++ b/t/gen-rand.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "../lib/types.h"
+#include "../log.h"
+#include "../lib/lfsr.h"
+#include "../lib/axmap.h"
+#include "../smalloc.h"
+#include "../minmax.h"
+#include "../lib/rand.h"
+
+int main(int argc, char *argv[])
+{
+       struct frand_state s;
+       uint64_t i, start, end, nvalues;
+       unsigned long *buckets, index, pass, fail;
+       double p, dev, mean, vmin, vmax;
+
+       if (argc < 4) {
+               log_err("%s: start end nvalues\n", argv[0]);
+               return 1;
+       }
+
+       start = strtoul(argv[1], NULL, 10);
+       end = strtoul(argv[2], NULL, 10);
+
+       if (start >= end) {
+               log_err("%s: start must be smaller than end\n", argv[0]);
+               return 1;
+       }
+       index = 1 + end - start;
+       buckets = calloc(index, sizeof(unsigned long));
+
+       nvalues = strtoul(argv[3], NULL, 10);
+
+       init_rand(&s, false);
+
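+       /* Draw nvalues uniform samples in [start, end] and count them per bucket */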
+       for (i = 0; i < nvalues; i++) {
+               int v = rand32_between(&s, start, end);
+
+               buckets[v - start]++;
+       }
+
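+       /* Each bucket count is binomial(nvalues, p); flag buckets that fall
+        * outside one standard deviation of the expected mean.
+        */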
+       p = 1.0 / index;
+       dev = sqrt(nvalues * p * (1.0 - p));
+       mean = nvalues * p;
+       vmin = mean - dev;
+       vmax = mean + dev;
+
+       pass = fail = 0;
+       for (i = 0; i < index; i++) {
+               if (buckets[i] < vmin || buckets[i] > vmax) {
+                       printf("FAIL bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+                       fail++;
+               } else {
+                       printf("PASS bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+                       pass++;
+               }
+       }
+
+       printf("Passes=%lu, Fail=%lu\n", pass, fail);
+
+       return 0;
+}
diff --git a/t/lfsr-test.c b/t/lfsr-test.c
index 4352b89..bad5097 100644 (file)
--- a/t/lfsr-test.c
+++ b/t/lfsr-test.c
@@ -38,6 +38,8 @@ int main(int argc, char *argv[])
        void *v = NULL, *v_start;
        double total, mean;
 
+       arch_init(argv);
+
        /* Read arguments */
        switch (argc) {
                case 5: if (strncmp(argv[4], "verify", 7) == 0)
diff --git a/t/memlock.c b/t/memlock.c
new file mode 100644 (file)
index 0000000..d9d586d
--- /dev/null
+++ b/t/memlock.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+static struct thread_data {
+       unsigned long mb;
+} td;
+
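+/*
+ * Each worker repeatedly dirties 512 bytes in every 4KB page of its buffer,
+ * touching the whole allocation on every pass.
+ */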
+static void *worker(void *data)
+{
+       struct thread_data *td = data;
+       unsigned long index;
+       size_t size;
+       char *buf;
+       int i, first = 1;
+
+       size = td->mb * 1024UL * 1024UL;
+       buf = malloc(size);
+
+       for (i = 0; i < 100000; i++) {
+               for (index = 0; index + 4096 < size; index += 4096)
+                       memset(&buf[index+512], 0x89, 512);
+               if (first) {
+                       printf("loop%d: did %lu MB\n", i+1, size/(1024UL*1024UL));
+                       first = 0;
+               }
+       }
+       return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+       unsigned long mb, threads;
+       pthread_t *pthreads;
+       int i;
+
+       if (argc < 3) {
+               printf("%s: <mb per thread> <threads>\n", argv[0]);
+               return 1;
+       }
+
+       mb = strtoul(argv[1], NULL, 10);
+       threads = strtoul(argv[2], NULL, 10);
+
+       pthreads = calloc(threads, sizeof(pthread_t));
+       td.mb = mb;
+
+       for (i = 0; i < threads; i++)
+               pthread_create(&pthreads[i], NULL, worker, &td);
+
+       for (i = 0; i < threads; i++) {
+               void *ret;
+
+               pthread_join(pthreads[i], &ret);
+       }
+       return 0;
+}
diff --git a/t/read-to-pipe-async.c b/t/read-to-pipe-async.c
new file mode 100644 (file)
index 0000000..e8bdc85
--- /dev/null
+++ b/t/read-to-pipe-async.c
@@ -0,0 +1,670 @@
+/*
+ * Read a file and write the contents to stdout. If a given read takes
+ * longer than 'max_us' time, then we schedule a new thread to handle
+ * the next read. This avoids the coordinated omission problem, where
+ * one request appears to take a long time, but in reality a lot of
+ * requests would have been slow, but we don't notice since new submissions
+ * are not being issued if just 1 is held up.
+ *
+ * One test case:
+ *
+ * $ time (./read-to-pipe-async -f randfile.gz | gzip -dc > outfile; sync)
+ *
+ * This will read randfile.gz and log the latencies of doing so, while
+ * piping the output to gzip to decompress it. Any latencies over max_us
+ * are logged when they happen, and latency buckets are displayed at the
+ * end of the run
+ *
+ * gcc -Wall -g -O2 -o read-to-pipe-async read-to-pipe-async.c -lpthread
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <inttypes.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "../flist.h"
+
+static int bs = 4096;
+static int max_us = 10000;
+static char *file;
+static int separate_writer = 1;
+
+#define PLAT_BITS      8
+#define PLAT_VAL       (1 << PLAT_BITS)
+#define PLAT_GROUP_NR  19
+#define PLAT_NR                (PLAT_GROUP_NR * PLAT_VAL)
+#define PLAT_LIST_MAX  20
+
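+/* Latency histogram with the same grouped-bucket layout fio uses for io_u stats */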
+struct stats {
+       unsigned int plat[PLAT_NR];
+       unsigned int nr_samples;
+       unsigned int max;
+       unsigned int min;
+       unsigned int over;
+};
+
+static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.99, 99.999, 99.9999, };
+
+struct thread_data {
+       int exit;
+       int done;
+       pthread_mutex_t lock;
+       pthread_cond_t cond;
+       pthread_mutex_t done_lock;
+       pthread_cond_t done_cond;
+       pthread_t thread;
+};
+
+struct writer_thread {
+       struct flist_head list;
+       struct flist_head done_list;
+       struct stats s;
+       struct thread_data thread;
+};
+
+struct reader_thread {
+       struct flist_head list;
+       struct flist_head done_list;
+       int started;
+       int busy;
+       int write_seq;
+       struct stats s;
+       struct thread_data thread;
+};
+
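+/* One block-sized read, tagged with a sequence number so writes go out in order */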
+struct work_item {
+       struct flist_head list;
+       void *buf;
+       size_t buf_size;
+       off_t off;
+       int fd;
+       int seq;
+       struct writer_thread *writer;
+       struct reader_thread *reader;
+       pthread_mutex_t lock;
+       pthread_cond_t cond;
+       pthread_t thread;
+};
+
+static struct reader_thread reader_thread;
+static struct writer_thread writer_thread;
+
+uint64_t utime_since(const struct timeval *s, const struct timeval *e)
+{
+       long sec, usec;
+       uint64_t ret;
+
+       sec = e->tv_sec - s->tv_sec;
+       usec = e->tv_usec - s->tv_usec;
+       if (sec > 0 && usec < 0) {
+               sec--;
+               usec += 1000000;
+       }
+
+       if (sec < 0 || (sec == 0 && usec < 0))
+               return 0;
+
+       ret = sec * 1000000ULL + usec;
+
+       return ret;
+}
+
+static struct work_item *find_seq(struct writer_thread *w, unsigned int seq)
+{
+       struct work_item *work;
+       struct flist_head *entry;
+
+       if (flist_empty(&w->list))
+               return NULL;
+
+       flist_for_each(entry, &w->list) {
+               work = flist_entry(entry, struct work_item, list);
+               if (work->seq == seq)
+                       return work;
+       }
+
+       return NULL;
+}
+
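+/*
+ * Convert a latency value to its bucket index: small values map 1:1, larger
+ * values land in coarser groups where the low error_bits bits are discarded.
+ */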
+static unsigned int plat_val_to_idx(unsigned int val)
+{
+       unsigned int msb, error_bits, base, offset;
+
+       /* Find MSB starting from bit 0 */
+       if (val == 0)
+               msb = 0;
+       else
+               msb = sizeof(val)*8 - __builtin_clz(val) - 1;
+
+       /*
+        * MSB <= PLAT_BITS: the value is small enough to be used directly
+        * as the index, with no rounding
+        */
+       if (msb <= PLAT_BITS)
+               return val;
+
+       /* Compute the number of error bits to discard*/
+       error_bits = msb - PLAT_BITS;
+
+       /* Compute the number of buckets before the group */
+       base = (error_bits + 1) << PLAT_BITS;
+
+       /*
+        * Discard the error bits and apply the mask to find the
+        * index for the buckets in the group
+        */
+       offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+       /* Make sure the index does not exceed (array size - 1) */
+       return (base + offset) < (PLAT_NR - 1) ?
+               (base + offset) : (PLAT_NR - 1);
+}
+
+/*
+ * Convert the given index of the bucket array to the value
+ * represented by the bucket
+ */
+static unsigned int plat_idx_to_val(unsigned int idx)
+{
+       unsigned int error_bits, k, base;
+
+       assert(idx < PLAT_NR);
+
+       /* Index lies in the linear part of the histogram (first two groups),
+        * so it maps directly back to the value */
+       if (idx < (PLAT_VAL << 1))
+               return idx;
+
+       /* Find the group and compute the minimum value of that group */
+       error_bits = (idx >> PLAT_BITS) - 1;
+       base = 1 << (error_bits + PLAT_BITS);
+
+       /* Find its bucket number of the group */
+       k = idx % PLAT_VAL;
+
+       /* Return the mean of the range of the bucket */
+       return base + ((k + 0.5) * (1 << error_bits));
+}
+
+static void add_lat(struct stats *s, unsigned int us, const char *name)
+{
+       int lat_index = 0;
+
+       if (us > s->max)
+               s->max = us;
+       if (us < s->min)
+               s->min = us;
+
+       if (us > max_us) {
+               fprintf(stderr, "%s latency=%u usec\n", name, us);
+               s->over++;
+       }
+
+       lat_index = plat_val_to_idx(us);
+       __sync_fetch_and_add(&s->plat[lat_index], 1);
+       __sync_fetch_and_add(&s->nr_samples, 1);
+}
+
+static int write_work(struct work_item *work)
+{
+       struct timeval s, e;
+       ssize_t ret;
+
+       gettimeofday(&s, NULL);
+       ret = write(STDOUT_FILENO, work->buf, work->buf_size);
+       gettimeofday(&e, NULL);
+       assert(ret == work->buf_size);
+
+       add_lat(&work->writer->s, utime_since(&s, &e), "write");
+       return work->seq + 1;
+}
+
+static void thread_exiting(struct thread_data *thread)
+{
+       __sync_fetch_and_add(&thread->done, 1);
+       pthread_cond_signal(&thread->done_cond);
+}
+
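+/*
+ * Writer thread: pull completed reads off the list and write them to stdout
+ * strictly in sequence order, sleeping when the next sequence isn't ready yet.
+ */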
+static void *writer_fn(void *data)
+{
+       struct writer_thread *wt = data;
+       struct work_item *work;
+       unsigned int seq = 1;
+
+       work = NULL;
+       while (!wt->thread.exit || !flist_empty(&wt->list)) {
+               pthread_mutex_lock(&wt->thread.lock);
+
+               if (work) {
+                       flist_add_tail(&work->list, &wt->done_list);
+                       work = NULL;
+               }
+
+               work = find_seq(wt, seq);
+               if (work)
+                       flist_del_init(&work->list);
+               else
+                       pthread_cond_wait(&wt->thread.cond, &wt->thread.lock);
+
+               pthread_mutex_unlock(&wt->thread.lock);
+
+               if (work)
+                       seq = write_work(work);
+       }
+
+       thread_exiting(&wt->thread);
+       return NULL;
+}
+
+static void reader_work(struct work_item *work)
+{
+       struct timeval s, e;
+       ssize_t ret;
+       size_t left;
+       void *buf;
+       off_t off;
+
+       gettimeofday(&s, NULL);
+
+       left = work->buf_size;
+       buf = work->buf;
+       off = work->off;
+       while (left) {
+               ret = pread(work->fd, buf, left, off);
+               if (!ret) {
+                       fprintf(stderr, "zero read\n");
+                       break;
+               } else if (ret < 0) {
+                       fprintf(stderr, "errno=%d\n", errno);
+                       break;
+               }
+               left -= ret;
+               off += ret;
+               buf += ret;
+       }
+
+       gettimeofday(&e, NULL);
+
+       add_lat(&work->reader->s, utime_since(&s, &e), "read");
+
+       pthread_cond_signal(&work->cond);
+
+       if (separate_writer) {
+               pthread_mutex_lock(&work->writer->thread.lock);
+               flist_add_tail(&work->list, &work->writer->list);
+               pthread_mutex_unlock(&work->writer->thread.lock);
+               pthread_cond_signal(&work->writer->thread.cond);
+       } else {
+               struct reader_thread *rt = work->reader;
+               struct work_item *next = NULL;
+               struct flist_head *entry;
+
+               /*
+                * Write current work if it matches in sequence.
+                */
+               if (work->seq == rt->write_seq)
+                       goto write_it;
+
+               pthread_mutex_lock(&rt->thread.lock);
+
+               flist_add_tail(&work->list, &rt->done_list);
+
+               /*
+                * See if the next work item is here, if so, write it
+                */
+               work = NULL;
+               flist_for_each(entry, &rt->done_list) {
+                       next = flist_entry(entry, struct work_item, list);
+                       if (next->seq == rt->write_seq) {
+                               work = next;
+                               flist_del(&work->list);
+                               break;
+                       }
+               }
+
+               pthread_mutex_unlock(&rt->thread.lock);
+
+               if (work) {
+write_it:
+                       write_work(work);
+                       __sync_fetch_and_add(&rt->write_seq, 1);
+               }
+       }
+}
+
+static void *reader_one_off(void *data)
+{
+       reader_work(data);
+       return NULL;
+}
+
+static void *reader_fn(void *data)
+{
+       struct reader_thread *rt = data;
+       struct work_item *work;
+
+       while (!rt->thread.exit || !flist_empty(&rt->list)) {
+               work = NULL;
+               pthread_mutex_lock(&rt->thread.lock);
+               if (!flist_empty(&rt->list)) {
+                       work = flist_first_entry(&rt->list, struct work_item, list);
+                       flist_del_init(&work->list);
+               } else
+                       pthread_cond_wait(&rt->thread.cond, &rt->thread.lock);
+               pthread_mutex_unlock(&rt->thread.lock);
+
+               if (work) {
+                       __sync_fetch_and_add(&rt->busy, 1);
+                       reader_work(work);
+                       __sync_fetch_and_sub(&rt->busy, 1);
+               }
+       }
+
+       thread_exiting(&rt->thread);
+       return NULL;
+}
+
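+/*
+ * Hand a read to the persistent reader thread if it is idle; if it is busy
+ * (or its lock is contended), spawn a detached one-off thread instead, so a
+ * single slow read never holds up submission of the next one.
+ */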
+static void queue_work(struct reader_thread *rt, struct work_item *work)
+{
+       if (!rt->started) {
+               pthread_mutex_lock(&rt->thread.lock);
+               flist_add_tail(&work->list, &rt->list);
+               pthread_mutex_unlock(&rt->thread.lock);
+
+               rt->started = 1;
+               pthread_create(&rt->thread.thread, NULL, reader_fn, rt);
+       } else if (!rt->busy && !pthread_mutex_trylock(&rt->thread.lock)) {
+               flist_add_tail(&work->list, &rt->list);
+               pthread_mutex_unlock(&rt->thread.lock);
+
+               pthread_cond_signal(&rt->thread.cond);
+       } else {
+               int ret = pthread_create(&work->thread, NULL, reader_one_off, work);
+               if (ret)
+                       fprintf(stderr, "pthread_create=%d\n", ret);
+               else
+                       pthread_detach(work->thread);
+       }
+}
+
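+/*
+ * Walk the cumulative latency histogram and record, for each percentile in
+ * plist[], the bucket value at which that percentile is first crossed.
+ */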
+static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr,
+                                    unsigned int **output)
+{
+       unsigned long sum = 0;
+       unsigned int len, i, j = 0;
+       unsigned int oval_len = 0;
+       unsigned int *ovals = NULL;
+       int is_last;
+
+       len = 0;
+       while (len < PLAT_LIST_MAX && plist[len] != 0.0)
+               len++;
+
+       if (!len)
+               return 0;
+
+       /*
+        * Calculate bucket values, note down max and min values
+        */
+       is_last = 0;
+       for (i = 0; i < PLAT_NR && !is_last; i++) {
+               sum += io_u_plat[i];
+               while (sum >= (plist[j] / 100.0 * nr)) {
+                       assert(plist[j] <= 100.0);
+
+                       if (j == oval_len) {
+                               oval_len += 100;
+                               ovals = realloc(ovals, oval_len * sizeof(unsigned int));
+                       }
+
+                       ovals[j] = plat_idx_to_val(i);
+                       is_last = (j == len - 1);
+                       if (is_last)
+                               break;
+
+                       j++;
+               }
+       }
+
+       *output = ovals;
+       return len;
+}
+
+static void show_latencies(struct stats *s, const char *msg)
+{
+       unsigned int *ovals = NULL;
+       unsigned int len, i;
+
+       len = calc_percentiles(s->plat, s->nr_samples, &ovals);
+       if (len) {
+               fprintf(stderr, "Latency percentiles (usec) (%s)\n", msg);
+               for (i = 0; i < len; i++)
+                       fprintf(stderr, "\t%2.4fth: %u\n", plist[i], ovals[i]);
+       }
+
+       if (ovals)
+               free(ovals);
+
+       fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max);
+}
+
+static void init_thread(struct thread_data *thread)
+{
+       pthread_cond_init(&thread->cond, NULL);
+       pthread_cond_init(&thread->done_cond, NULL);
+       pthread_mutex_init(&thread->lock, NULL);
+       pthread_mutex_init(&thread->done_lock, NULL);
+       thread->exit = 0;
+}
+
+static void exit_thread(struct thread_data *thread,
+                       void fn(struct writer_thread *),
+                       struct writer_thread *wt)
+{
+       __sync_fetch_and_add(&thread->exit, 1);
+       pthread_cond_signal(&thread->cond);
+
+       while (!thread->done) {
+               pthread_mutex_lock(&thread->done_lock);
+
+               if (fn) {
+                       struct timeval tv;
+                       struct timespec ts;
+
+                       gettimeofday(&tv, NULL);
+                       ts.tv_sec = tv.tv_sec + 1;
+                       ts.tv_nsec = tv.tv_usec * 1000ULL;
+
+                       pthread_cond_timedwait(&thread->done_cond, &thread->done_lock, &ts);
+                       fn(wt);
+               } else
+                       pthread_cond_wait(&thread->done_cond, &thread->done_lock);
+
+               pthread_mutex_unlock(&thread->done_lock);
+       }
+}
+
+static int usage(char *argv[])
+{
+       fprintf(stderr, "%s: [-b blocksize] [-t max usec] [-w separate writer] -f file\n", argv[0]);
+       return 1;
+}
+
+static int parse_options(int argc, char *argv[])
+{
+       int c;
+
+       while ((c = getopt(argc, argv, "f:b:t:w:")) != -1) {
+               switch (c) {
+               case 'f':
+                       file =