Merge branch 'github-actions' of https://github.com/sitsofe/fio
authorJens Axboe <axboe@kernel.dk>
Fri, 10 Dec 2021 18:08:26 +0000 (11:08 -0700)
committerJens Axboe <axboe@kernel.dk>
Fri, 10 Dec 2021 18:08:26 +0000 (11:08 -0700)
* 'github-actions' of https://github.com/sitsofe/fio:
  ci: retire travis configuration
  ci: add CI via GitHub Actions

153 files changed:
.gitignore
DEDUPE-TODO [new file with mode: 0644]
FIO-VERSION-GEN
HOWTO
Makefile
README
arch/arch.h
backend.c
cconv.c
client.c
dedupe.c [new file with mode: 0644]
dedupe.h [new file with mode: 0644]
diskutil.c
engines/cmdprio.c [new file with mode: 0644]
engines/cmdprio.h [new file with mode: 0644]
engines/dfs.c
engines/exec.c [new file with mode: 0644]
engines/filecreate.c
engines/filedelete.c
engines/filestat.c
engines/http.c
engines/io_uring.c
engines/libaio.c
engines/libzbc.c
engines/sg.c
eta.c
examples/1mbs_clients.png [new file with mode: 0644]
examples/aio-read.png [new file with mode: 0644]
examples/backwards-read.png [new file with mode: 0644]
examples/basic-verify.png [new file with mode: 0644]
examples/butterfly.png [new file with mode: 0644]
examples/cmdprio-bssplit.fio [new file with mode: 0644]
examples/cmdprio-bssplit.png [new file with mode: 0644]
examples/cmdprio-percentage.fio [new file with mode: 0644]
examples/cmdprio-percentage.png [new file with mode: 0644]
examples/cpp_null.png [new file with mode: 0644]
examples/cpuio.png [new file with mode: 0644]
examples/cross-stripe-verify.png [new file with mode: 0644]
examples/dev-dax.png [new file with mode: 0644]
examples/dfs.png [new file with mode: 0644]
examples/disk-zone-profile.png [new file with mode: 0644]
examples/e4defrag.png [new file with mode: 0644]
examples/e4defrag2.fio
examples/e4defrag2.png [new file with mode: 0644]
examples/enospc-pressure.png [new file with mode: 0644]
examples/exec.fio [new file with mode: 0644]
examples/exec.png [new file with mode: 0644]
examples/exitwhat.png [new file with mode: 0644]
examples/falloc.png [new file with mode: 0644]
examples/filecreate-ioengine.png [new file with mode: 0644]
examples/filedelete-ioengine.png [new file with mode: 0644]
examples/filestat-ioengine.png [new file with mode: 0644]
examples/fio-rand-RW.png [new file with mode: 0644]
examples/fio-rand-read.png [new file with mode: 0644]
examples/fio-rand-write.png [new file with mode: 0644]
examples/fio-seq-RW.png [new file with mode: 0644]
examples/fio-seq-read.png [new file with mode: 0644]
examples/fio-seq-write.png [new file with mode: 0644]
examples/fixed-rate-submission.png [new file with mode: 0644]
examples/flow.png [new file with mode: 0644]
examples/fsx.png [new file with mode: 0644]
examples/ftruncate.png [new file with mode: 0644]
examples/gfapi.png [new file with mode: 0644]
examples/gpudirect-rdmaio-client.png [new file with mode: 0644]
examples/gpudirect-rdmaio-server.png [new file with mode: 0644]
examples/http-s3.png [new file with mode: 0644]
examples/http-swift.png [new file with mode: 0644]
examples/http-webdav.png [new file with mode: 0644]
examples/ime.png [new file with mode: 0644]
examples/iometer-file-access-server.png [new file with mode: 0644]
examples/jesd219.png [new file with mode: 0644]
examples/latency-profile.png [new file with mode: 0644]
examples/libcufile-cufile.png [new file with mode: 0644]
examples/libcufile-posix.png [new file with mode: 0644]
examples/libhdfs.png [new file with mode: 0644]
examples/libiscsi.png [new file with mode: 0644]
examples/libpmem.png [new file with mode: 0644]
examples/librpma_apm-client.png [new file with mode: 0644]
examples/librpma_apm-server.png [new file with mode: 0644]
examples/librpma_gpspm-client.png [new file with mode: 0644]
examples/librpma_gpspm-server.png [new file with mode: 0644]
examples/libzbc-rand-write.png [new file with mode: 0644]
examples/libzbc-seq-read.png [new file with mode: 0644]
examples/mtd.fio
examples/mtd.png [new file with mode: 0644]
examples/nbd.png [new file with mode: 0644]
examples/netio.png [new file with mode: 0644]
examples/netio_multicast.png [new file with mode: 0644]
examples/nfs.png [new file with mode: 0644]
examples/null.png [new file with mode: 0644]
examples/numa.png [new file with mode: 0644]
examples/pmemblk.fio
examples/pmemblk.png [new file with mode: 0644]
examples/poisson-rate-submission.png [new file with mode: 0644]
examples/rados.png [new file with mode: 0644]
examples/rand-zones.png [new file with mode: 0644]
examples/rbd.png [new file with mode: 0644]
examples/rdmaio-client.png [new file with mode: 0644]
examples/rdmaio-server.png [new file with mode: 0644]
examples/ssd-steadystate.png [new file with mode: 0644]
examples/ssd-test.png [new file with mode: 0644]
examples/steadystate.png [new file with mode: 0644]
examples/surface-scan.png [new file with mode: 0644]
examples/test.png [new file with mode: 0644]
examples/tiobench-example.png [new file with mode: 0644]
examples/waitfor.png [new file with mode: 0644]
examples/zbd-rand-write.png [new file with mode: 0644]
examples/zbd-seq-read.png [new file with mode: 0644]
examples/zipf.png [new file with mode: 0644]
filesetup.c
fio.1
fio.h
helper_thread.c
init.c
io_ddir.h
io_u.c
io_u.h
ioengines.c
iolog.c
iolog.h
lib/fls.h
lib/rand.c
lib/rand.h
lib/seqlock.h
libfio.c
log.c
options.c
os/linux/io_uring.h
os/os-android.h
os/os-dragonfly.h
os/os-linux.h
os/os-windows.h
os/os.h
os/windows/dlls.c [new file with mode: 0644]
oslib/linux-blkzoned.c
parse.c
server.c
server.h
stat.c
stat.h
t/dedupe.c
t/io_uring.c
t/one-core-peak.sh [new file with mode: 0755]
t/run-fio-tests.py
t/zbd/functions
t/zbd/run-tests-against-nullb
t/zbd/test-zbd-support
thread_options.h
tools/fiograph/fiograph.conf [new file with mode: 0644]
tools/fiograph/fiograph.py [new file with mode: 0755]
verify.c
zbd.c
zbd.h

index 6651f96edc72ea3295c75cc9f9628eea9e267386..72494a1e2a9edafb7b77f50af94fce7d4435454b 100644 (file)
@@ -31,3 +31,4 @@ doc/output
 /TAGS
 /t/zbd/test-zbd-support.log.*
 /t/fuzz/fuzz_parseini
+tsc-rate
diff --git a/DEDUPE-TODO b/DEDUPE-TODO
new file mode 100644 (file)
index 0000000..4b0bfd1
--- /dev/null
@@ -0,0 +1,16 @@
+- Shifted dedup-able data.
+  Allow for dedup buffer generation to shift contents by random number
+  of sectors (fill the gaps with uncompressible data). Some storage
+  subsystems modernized the deduplication detection algorithms to look
+  for shifted data as well. For example, some databases push a timestamp
+  on the prefix of written blocks, which makes the underlying data
+  dedup-able in different alignment. FIO should be able to simulate such
+  workload.
+
+- Generation of similar data (but not exact).
+  A rising trend in enterprise storage systems.
+  Generation of "similar" data means random uncompressible buffers
+  that differ by a few (configurable number of) bits from each other.
+  The storage subsystem usually identifies the similar buffers using
+  locality-sensitive hashing or other methods.
+
index 47af94e9ded4bd33f14c385707afb78c8332dae6..e9d563c124a419dda8d013211fcd7154ab249d45 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.27
+DEF_VER=fio-3.28
 
 LF='
 '
diff --git a/HOWTO b/HOWTO
index 86fb296445f006e2f3416abe559bd5f0dc57ce19..8c9e41356b9586b33fa17e4068d50620eb6be551 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -992,6 +992,9 @@ Target file/device
                                single zone. The :option:`zoneskip` parameter
                                is ignored. :option:`zonerange` and
                                :option:`zonesize` must be identical.
+                               Trim is handled using a zone reset operation.
+                               Trim only considers non-empty sequential write
+                               required and sequential write preferred zones.
 
 .. option:: zonerange=int
 
@@ -1055,6 +1058,11 @@ Target file/device
        number of open zones is defined as the number of zones to which write
        commands are issued.
 
+.. option:: job_max_open_zones=int
+
+       Limit on the number of simultaneously opened zones per single
+       thread/process.
+
 .. option:: zone_reset_threshold=float
 
        A number between zero and one that indicates the ratio of logical
@@ -1705,6 +1713,36 @@ Buffers and memory
        this option will also enable :option:`refill_buffers` to prevent every buffer
        being identical.
 
+.. option:: dedupe_mode=str
+
+       If ``dedupe_percentage=<int>`` is given, then this option controls how fio
+       generates the dedupe buffers.
+
+               **repeat**
+                       Generate dedupe buffers by repeating previous writes
+               **working_set**
+                       Generate dedupe buffers from working set
+
+       ``repeat`` is the default option for fio. Dedupe buffers are generated
+       by repeating the previous unique write.
+
+       ``working_set`` is a more realistic workload.
+       With ``working_set``, ``dedupe_working_set_percentage=<int>`` should be provided.
+       Given that, fio will use the initial unique write buffers as its working set.
+       Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+       Note that by using ``working_set`` the dedupe percentage will converge
+       to the desired over time while ``repeat`` maintains the desired percentage
+       throughout the job.
+
+.. option:: dedupe_working_set_percentage=int
+
+       If ``dedupe_mode=<str>`` is set to ``working_set``, then this controls
+       the percentage of the size of the file or device used as the buffers
+       from which fio will choose when generating the dedupe buffers.
+
+       Note that size needs to be explicitly provided and only 1 file per
+       job is supported.
+
 .. option:: invalidate=bool
 
        Invalidate the buffer/page cache parts of the files to be used prior to
@@ -1930,6 +1968,11 @@ I/O engine
                        character devices. This engine supports trim operations.
                        The sg engine includes engine specific options.
 
+               **libzbc**
+                       Read, write, trim and ZBC/ZAC operations to a zoned
+                       block device using libzbc library. The target can be
+                       either an SG character device or a block device file.
+
                **null**
                        Doesn't transfer any data, just pretends to.  This is mainly used to
                        exercise fio itself and for debugging/testing purposes.
@@ -2109,6 +2152,9 @@ I/O engine
                        achieving higher concurrency and thus throughput than is possible
                        via kernel NFS.
 
+               **exec**
+                       Execute 3rd party tools. Could be used to perform monitoring during a job's runtime.
+
 I/O engine specific parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -2117,14 +2163,52 @@ In addition, there are some parameters which are only valid when a specific
 with the caveat that when used on the command line, they must come after the
 :option:`ioengine` that defines them is selected.
 
-.. option:: cmdprio_percentage=int : [io_uring] [libaio]
-
-    Set the percentage of I/O that will be issued with higher priority by setting
-    the priority bit. Non-read I/O is likely unaffected by ``cmdprio_percentage``.
-    This option cannot be used with the `prio` or `prioclass` options. For this
-    option to set the priority bit properly, NCQ priority must be supported and
-    enabled and :option:`direct`\=1 option must be used. fio must also be run as
-    the root user.
+.. option:: cmdprio_percentage=int[,int] : [io_uring] [libaio]
+
+    Set the percentage of I/O that will be issued with the highest priority.
+    Default: 0. A single value applies to reads and writes. Comma-separated
+    values may be specified for reads and writes. For this option to be
+    effective, NCQ priority must be supported and enabled, and :option:`direct`\=1
+    option must be used. fio must also be run as the root user. Unlike
+    slat/clat/lat stats, which can be tracked and reported independently, per
+    priority stats only track and report a single type of latency. By default,
+    completion latency (clat) will be reported, if :option:`lat_percentiles` is
+    set, total latency (lat) will be reported.
+
+.. option:: cmdprio_class=int[,int] : [io_uring] [libaio]
+
+       Set the I/O priority class to use for I/Os that must be issued with
+       a priority when :option:`cmdprio_percentage` or
+       :option:`cmdprio_bssplit` is set. If not specified when
+       :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set,
+       this defaults to the highest priority class. A single value applies
+       to reads and writes. Comma-separated values may be specified for
+       reads and writes. See :manpage:`ionice(1)`. See also the
+       :option:`prioclass` option.
+
+.. option:: cmdprio=int[,int] : [io_uring] [libaio]
+
+       Set the I/O priority value to use for I/Os that must be issued with
+       a priority when :option:`cmdprio_percentage` or
+       :option:`cmdprio_bssplit` is set. If not specified when
+       :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set,
+       this defaults to 0.
+       Linux limits us to a positive value between 0 and 7, with 0 being the
+       highest. A single value applies to reads and writes. Comma-separated
+       values may be specified for reads and writes. See :manpage:`ionice(1)`.
+       Refer to an appropriate manpage for other operating systems since
+       meaning of priority may differ. See also the :option:`prio` option.
+
+.. option:: cmdprio_bssplit=str[,str] : [io_uring] [libaio]
+       To get a finer control over I/O priority, this option allows
+       specifying the percentage of IOs that must have a priority set
+       depending on the block size of the IO. This option is useful only
+       when used together with the :option:`bssplit` option, that is,
+       multiple different block sizes are used for reads and writes.
+       The format for this option is the same as the format of the
+       :option:`bssplit` option, with the exception that values for
+       trim IOs are ignored. This option is mutually exclusive with the
+       :option:`cmdprio_percentage` option.
 
 .. option:: fixedbufs : [io_uring]
 
@@ -2515,11 +2599,11 @@ with the caveat that when used on the command line, they must come after the
 
 .. option:: pool=str : [dfs]
 
-       Specify the UUID of the DAOS pool to connect to.
+       Specify the label or UUID of the DAOS pool to connect to.
 
 .. option:: cont=str : [dfs]
 
-       Specify the UUID of the DAOS container to open.
+       Specify the label or UUID of the DAOS container to open.
 
 .. option:: chunk_size=int : [dfs]
 
@@ -2536,6 +2620,28 @@ with the caveat that when used on the command line, they must come after the
        URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
        Refer to the libnfs README for more details.
 
+.. option:: program=str : [exec]
+
+       Specify the program to execute.
+
+.. option:: arguments=str : [exec]
+
+       Specify arguments to pass to program.
+       Some special variables can be expanded to pass fio's job details to the program.
+
+       **%r**
+               Replaced by the duration of the job in seconds.
+       **%n**
+               Replaced by the name of the job.
+
+.. option:: grace_time=int : [exec]
+
+       Specify the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+
+.. option:: std_redirect=bool : [exec]
+
+       If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+
 I/O depth
 ~~~~~~~~~
 
@@ -2642,7 +2748,7 @@ I/O rate
        Stall the job for the specified period of time after an I/O has completed before issuing the
        next. May be used to simulate processing being done by an application.
        When the unit is omitted, the value is interpreted in microseconds.  See
-       :option:`thinktime_blocks` and :option:`thinktime_spin`.
+       :option:`thinktime_blocks`, :option:`thinktime_iotime` and :option:`thinktime_spin`.
 
 .. option:: thinktime_spin=time
 
@@ -2667,6 +2773,18 @@ I/O rate
        :option:`thinktime_blocks` blocks. If this is set to `issue`, then the trigger happens
        at the issue side.
 
+.. option:: thinktime_iotime=time
+
+       Only valid if :option:`thinktime` is set - control :option:`thinktime`
+       interval by time. The :option:`thinktime` stall is repeated after IOs
+       are executed for :option:`thinktime_iotime`. For example,
+       ``--thinktime_iotime=9s --thinktime=1s`` repeats a 10-second cycle with IOs
+       for 9 seconds and stall for 1 second. When the unit is omitted,
+       :option:`thinktime_iotime` is interpreted as a number of seconds. If
+       this option is used together with :option:`thinktime_blocks`, the
+       :option:`thinktime` stall is repeated after :option:`thinktime_iotime`
+       or after :option:`thinktime_blocks` IOs, whichever happens first.
+
 .. option:: rate=int[,int][,int]
 
        Cap the bandwidth used by this job. The number is in bytes/sec, the normal
@@ -2906,14 +3024,14 @@ Threads, processes and job synchronization
        between 0 and 7, with 0 being the highest.  See man
        :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating
        systems since meaning of priority may differ. For per-command priority
-       setting, see I/O engine specific `cmdprio_percentage` and `hipri_percentage`
-       options.
+       setting, see I/O engine specific :option:`cmdprio_percentage` and
+       :option:`cmdprio` options.
 
 .. option:: prioclass=int
 
        Set the I/O priority class. See man :manpage:`ionice(1)`. For per-command
-       priority setting, see I/O engine specific `cmdprio_percentage` and
-       `hipri_percentage` options.
+       priority setting, see I/O engine specific :option:`cmdprio_percentage`
+       and :option:`cmdprio_class` options.
 
 .. option:: cpus_allowed=str
 
@@ -3423,6 +3541,18 @@ Measurements and reporting
        :option:`write_bw_log` for details about the filename format and `Log
        File Formats`_ for how data is structured within the file.
 
+.. option:: log_entries=int
+
+       By default, fio will log an entry in the iops, latency, or bw log for
+       every I/O that completes. The initial number of I/O log entries is 1024.
+       When the log entries are all used, new log entries are dynamically
+       allocated.  This dynamic log entry allocation may negatively impact
+       time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+       completion latency). This option allows specifying a larger initial
+       number of log entries to avoid run-time allocations of new log entries,
+       resulting in more precise time-related I/O statistics.
+       Also see :option:`log_avg_msec`. Defaults to 1024.
+
 .. option:: log_avg_msec=int
 
        By default, fio will log an entry in the iops, latency, or bw log for every
index f57569d5f66461f85a54a69e53d4f161a2d6fba8..5d17bcab906591ff121d3675d795ecd2be4695e6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -57,11 +57,12 @@ SOURCE :=   $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
                smalloc.c filehash.c profile.c debug.c engines/cpu.c \
                engines/mmap.c engines/sync.c engines/null.c engines/net.c \
                engines/ftruncate.c engines/filecreate.c engines/filestat.c engines/filedelete.c \
+               engines/exec.c \
                server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
                gettime-thread.c helpers.c json.c idletime.c td_error.c \
                profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
                workqueue.c rate-submit.c optgroup.c helper_thread.c \
-               steadystate.c zone-dist.c zbd.c
+               steadystate.c zone-dist.c zbd.c dedupe.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
@@ -97,6 +98,7 @@ else ifdef CONFIG_32BIT
 endif
 ifdef CONFIG_LIBAIO
   libaio_SRCS = engines/libaio.c
+  cmdprio_SRCS = engines/cmdprio.c
   libaio_LIBS = -laio
   ENGINES += libaio
 endif
@@ -224,6 +226,7 @@ endif
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
                oslib/linux-dev-lookup.c engines/io_uring.c
+  cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
 endif
@@ -232,7 +235,8 @@ endif
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
-               oslib/linux-dev-lookup.c
+               oslib/linux-dev-lookup.c engines/io_uring.c
+  cmdprio_SRCS = engines/cmdprio.c
 ifdef CONFIG_HAS_BLKZONED
   SOURCE += oslib/linux-blkzoned.c
 endif
@@ -274,12 +278,16 @@ ifeq ($(CONFIG_TARGET_OS), Darwin)
   LIBS  += -lpthread -ldl
 endif
 ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
-  SOURCE += os/windows/cpu-affinity.c os/windows/posix.c
-  WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o lib/hweight.o
+  SOURCE += os/windows/cpu-affinity.c os/windows/posix.c os/windows/dlls.c
+  WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o os/windows/dlls.o lib/hweight.o
   LIBS  += -lpthread -lpsapi -lws2_32 -lssp
   FIO_CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format
 endif
 
+ifdef cmdprio_SRCS
+  SOURCE += $(cmdprio_SRCS)
+endif
+
 ifdef CONFIG_DYNAMIC_ENGINES
  DYNAMIC_ENGS := $(ENGINES)
 define engine_template =
@@ -293,7 +301,7 @@ else # !CONFIG_DYNAMIC_ENGINES
 define engine_template =
 SOURCE += $$($(1)_SRCS)
 LIBS += $$($(1)_LIBS)
-CFLAGS += $$($(1)_CFLAGS)
+override CFLAGS += $$($(1)_CFLAGS)
 endef
 endif
 
@@ -367,8 +375,7 @@ T_VS_PROGS = t/fio-verify-state
 T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
 T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
 
-T_IOU_RING_OBJS = t/io_uring.o
-T_IOU_RING_OBJS += t/arch.o
+T_IOU_RING_OBJS = t/io_uring.o lib/rand.o lib/pattern.o lib/strntol.o
 T_IOU_RING_PROGS = t/io_uring
 
 T_MEMLOCK_OBJS = t/memlock.o
@@ -625,7 +632,7 @@ unittests/unittest: $(UT_OBJS) $(UT_TARGET_OBJS)
 endif
 
 clean: FORCE
-       @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+       @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] t/*/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
        @rm -f t/fio-btrace2fio t/io_uring t/read-to-pipe-async
        @rm -rf  doc/output
 
@@ -650,7 +657,7 @@ test: fio
 fulltest:
        sudo modprobe null_blk &&                                       \
        if [ ! -e /usr/include/libzbc/zbc.h ]; then                     \
-         git clone https://github.com/hgst/libzbc &&                   \
+         git clone https://github.com/westerndigitalcorporation/libzbc && \
          (cd libzbc &&                                                 \
           ./autogen.sh &&                                              \
           ./configure --prefix=/usr &&                                 \
diff --git a/README b/README
index 2fecf0e020077eeed24300882e553e772de050bb..d566fae3de8526a8b5359a63d65e7a8e70833fc2 100644 (file)
--- a/README
+++ b/README
@@ -10,7 +10,7 @@ tailored test case again and again.
 
 A test work load is difficult to define, though. There can be any number of
 processes or threads involved, and they can each be using their own way of
-generating I/O. You could have someone dirtying large amounts of memory in an
+generating I/O. You could have someone dirtying large amounts of memory in a
 memory mapped file, or maybe several threads issuing reads using asynchronous
 I/O. fio needed to be flexible enough to simulate both of these cases, and many
 more.
@@ -72,6 +72,10 @@ in the body of the email. Archives can be found here:
 
        http://www.spinics.net/lists/fio/
 
+or here:
+
+       https://lore.kernel.org/fio/
+
 and archives for the old list can be found here:
 
        http://maillist.kernel.dk/fio-devel/
index a25779d4fd8521fe56a38027c182e5c7fcd5a4d5..fca003beabf4e606aa1b80af853f973009e8c7ed 100644 (file)
@@ -1,7 +1,11 @@
 #ifndef ARCH_H
 #define ARCH_H
 
+#ifdef __cplusplus
+#include <atomic>
+#else
 #include <stdatomic.h>
+#endif
 
 #include "../lib/types.h"
 
@@ -36,6 +40,21 @@ extern unsigned long arch_flags;
 
 #define ARCH_CPU_CLOCK_WRAPS
 
+#ifdef __cplusplus
+#define atomic_add(p, v)                                               \
+       std::atomic_fetch_add(p, (v))
+#define atomic_sub(p, v)                                               \
+       std::atomic_fetch_sub(p, (v))
+#define atomic_load_relaxed(p)                                 \
+       std::atomic_load_explicit(p,                            \
+                            std::memory_order_relaxed)
+#define atomic_load_acquire(p)                                 \
+       std::atomic_load_explicit(p,                            \
+                            std::memory_order_acquire)
+#define atomic_store_release(p, v)                             \
+       std::atomic_store_explicit(p, (v),                      \
+                            std::memory_order_release)
+#else
 #define atomic_add(p, v)                                       \
        atomic_fetch_add((_Atomic typeof(*(p)) *)(p), v)
 #define atomic_sub(p, v)                                       \
@@ -49,6 +68,7 @@ extern unsigned long arch_flags;
 #define atomic_store_release(p, v)                             \
        atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \
                              memory_order_release)
+#endif
 
 /* IWYU pragma: begin_exports */
 #if defined(__i386__)
index 6290e0d652a838c510634e62cc123eea8da3e277..c167f908625f39fd8e344c9f75c705f7fd71adf4 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -837,7 +837,7 @@ static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
        if (td->o.rate_process == RATE_PROCESS_POISSON) {
                uint64_t val, iops;
 
-               iops = bps / td->o.bs[ddir];
+               iops = bps / td->o.min_bs[ddir];
                val = (int64_t) (1000000 / iops) *
                                -logf(__rand_0_1(&td->poisson_state[ddir]));
                if (val) {
@@ -858,15 +858,47 @@ static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
        return 0;
 }
 
+static void init_thinktime(struct thread_data *td)
+{
+       if (td->o.thinktime_blocks_type == THINKTIME_BLOCKS_TYPE_COMPLETE)
+               td->thinktime_blocks_counter = td->io_blocks;
+       else
+               td->thinktime_blocks_counter = td->io_issues;
+       td->last_thinktime = td->epoch;
+       td->last_thinktime_blocks = 0;
+}
+
 static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir,
                             struct timespec *time)
 {
        unsigned long long b;
        uint64_t total;
        int left;
+       struct timespec now;
+       bool stall = false;
+
+       if (td->o.thinktime_iotime) {
+               fio_gettime(&now, NULL);
+               if (utime_since(&td->last_thinktime, &now)
+                   >= td->o.thinktime_iotime + td->o.thinktime) {
+                       stall = true;
+               } else if (!fio_option_is_set(&td->o, thinktime_blocks)) {
+                       /*
+                        * When thinktime_iotime is set and thinktime_blocks is
+                        * not set, skip the thinktime_blocks check, since
+                        * thinktime_blocks default value 1 does not work
+                        * together with thinktime_iotime.
+                        */
+                       return;
+               }
+
+       }
 
        b = ddir_rw_sum(td->thinktime_blocks_counter);
-       if (b % td->o.thinktime_blocks || !b)
+       if (b >= td->last_thinktime_blocks + td->o.thinktime_blocks)
+               stall = true;
+
+       if (!stall)
                return;
 
        io_u_quiesce(td);
@@ -902,6 +934,10 @@ static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir,
 
        if (time && should_check_rate(td))
                fio_gettime(time, NULL);
+
+       td->last_thinktime_blocks = b;
+       if (td->o.thinktime_iotime)
+               td->last_thinktime = now;
 }
 
 /*
@@ -1407,7 +1443,7 @@ static int set_ioscheduler(struct thread_data *td, struct fio_file *file)
 
        sprintf(tmp2, "[%s]", td->o.ioscheduler);
        if (!strstr(tmp, tmp2)) {
-               log_err("fio: io scheduler %s not found\n", td->o.ioscheduler);
+               log_err("fio: unable to set io scheduler to %s\n", td->o.ioscheduler);
                td_verror(td, EINVAL, "iosched_switch");
                fclose(f);
                return 1;
@@ -1760,6 +1796,7 @@ static void *thread_main(void *data)
                        td_verror(td, errno, "ioprio_set");
                        goto err;
                }
+               td->ioprio = ioprio_value(o->ioprio_class, o->ioprio);
        }
 
        if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
@@ -1791,17 +1828,14 @@ static void *thread_main(void *data)
        if (rate_submit_init(td, sk_out))
                goto err;
 
-       if (td->o.thinktime_blocks_type == THINKTIME_BLOCKS_TYPE_COMPLETE)
-               td->thinktime_blocks_counter = td->io_blocks;
-       else
-               td->thinktime_blocks_counter = td->io_issues;
-
        set_epoch_time(td, o->log_unix_epoch);
        fio_getrusage(&td->ru_start);
        memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
        memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
        memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
 
+       init_thinktime(td);
+
        if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
                        o->ratemin[DDIR_TRIM]) {
                memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
diff --git a/cconv.c b/cconv.c
index 74c241063abb6491586f18751502ff572f50edc0..4f8d27eb2dcec47244dfab3b902205b32a21da5e 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -187,11 +187,13 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->rand_repeatable = le32_to_cpu(top->rand_repeatable);
        o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
        o->rand_seed = le64_to_cpu(top->rand_seed);
+       o->log_entries = le32_to_cpu(top->log_entries);
        o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
        o->log_hist_msec = le32_to_cpu(top->log_hist_msec);
        o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness);
        o->log_max = le32_to_cpu(top->log_max);
        o->log_offset = le32_to_cpu(top->log_offset);
+       o->log_prio = le32_to_cpu(top->log_prio);
        o->log_gz = le32_to_cpu(top->log_gz);
        o->log_gz_store = le32_to_cpu(top->log_gz_store);
        o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch);
@@ -213,6 +215,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->thinktime_spin = le32_to_cpu(top->thinktime_spin);
        o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks);
        o->thinktime_blocks_type = le32_to_cpu(top->thinktime_blocks_type);
+       o->thinktime_iotime = le32_to_cpu(top->thinktime_iotime);
        o->fsync_blocks = le32_to_cpu(top->fsync_blocks);
        o->fdatasync_blocks = le32_to_cpu(top->fdatasync_blocks);
        o->barrier_blocks = le32_to_cpu(top->barrier_blocks);
@@ -298,6 +301,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->compress_percentage = le32_to_cpu(top->compress_percentage);
        o->compress_chunk = le32_to_cpu(top->compress_chunk);
        o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
+       o->dedupe_mode = le32_to_cpu(top->dedupe_mode);
+       o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage);
        o->block_error_hist = le32_to_cpu(top->block_error_hist);
        o->replay_align = le32_to_cpu(top->replay_align);
        o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -412,9 +417,11 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->rand_repeatable = cpu_to_le32(o->rand_repeatable);
        top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
        top->rand_seed = __cpu_to_le64(o->rand_seed);
+       top->log_entries = cpu_to_le32(o->log_entries);
        top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
        top->log_max = cpu_to_le32(o->log_max);
        top->log_offset = cpu_to_le32(o->log_offset);
+       top->log_prio = cpu_to_le32(o->log_prio);
        top->log_gz = cpu_to_le32(o->log_gz);
        top->log_gz_store = cpu_to_le32(o->log_gz_store);
        top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch);
@@ -436,6 +443,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->thinktime_spin = cpu_to_le32(o->thinktime_spin);
        top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks);
        top->thinktime_blocks_type = __cpu_to_le32(o->thinktime_blocks_type);
+       top->thinktime_iotime = __cpu_to_le32(o->thinktime_iotime);
        top->fsync_blocks = cpu_to_le32(o->fsync_blocks);
        top->fdatasync_blocks = cpu_to_le32(o->fdatasync_blocks);
        top->barrier_blocks = cpu_to_le32(o->barrier_blocks);
@@ -499,6 +507,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->compress_percentage = cpu_to_le32(o->compress_percentage);
        top->compress_chunk = cpu_to_le32(o->compress_chunk);
        top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+       top->dedupe_mode = cpu_to_le32(o->dedupe_mode);
+       top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage);
        top->block_error_hist = cpu_to_le32(o->block_error_hist);
        top->replay_align = cpu_to_le32(o->replay_align);
        top->replay_scale = cpu_to_le32(o->replay_scale);
index 29d8750a5b2b6948bc8206399d6a1be774a22a78..8b230617f79bd0dbab6853e9388a1e7fa8f700ce 100644 (file)
--- a/client.c
+++ b/client.c
@@ -1679,6 +1679,7 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
        ret->log_type           = le32_to_cpu(ret->log_type);
        ret->compressed         = le32_to_cpu(ret->compressed);
        ret->log_offset         = le32_to_cpu(ret->log_offset);
+       ret->log_prio           = le32_to_cpu(ret->log_prio);
        ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness);
 
        if (*store_direct)
@@ -1696,6 +1697,7 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
                s->data.val     = le64_to_cpu(s->data.val);
                s->__ddir       = __le32_to_cpu(s->__ddir);
                s->bs           = le64_to_cpu(s->bs);
+               s->priority     = le16_to_cpu(s->priority);
 
                if (ret->log_offset) {
                        struct io_sample_offset *so = (void *) s;
diff --git a/dedupe.c b/dedupe.c
new file mode 100644 (file)
index 0000000..fd116df
--- /dev/null
+++ b/dedupe.c
@@ -0,0 +1,36 @@
+#include "fio.h"
+
+/*
+ * Pre-compute the seeds backing the dedupe working set.
+ *
+ * Nothing is done unless dedupe_percentage is set and dedupe_mode is
+ * DEDUPE_MODE_WORKING_SET. Otherwise td->num_unique_pages seeds are stored
+ * in td->dedupe_working_set_states, each one a snapshot of a local copy of
+ * td->buf_state that is advanced between pages.
+ *
+ * Returns 0 on success or when there is nothing to do, 1 if the seed array
+ * cannot be allocated.
+ */
+int init_dedupe_working_set_seeds(struct thread_data *td)
+{
+       struct thread_options *o = &td->o;
+       struct frand_state seed_cursor = {0};
+       unsigned long long page, step, steps_per_page;
+
+       if (!o->dedupe_percentage)
+               return 0;
+       if (o->dedupe_mode != DEDUPE_MODE_WORKING_SET)
+               return 0;
+
+       /*
+        * When compression is used the seed is advanced several times to
+        * generate one buffer; remember how many steps separate two pages so
+        * the same buffer can be regenerated when deduping against a page.
+        */
+       steps_per_page = o->min_bs[DDIR_WRITE] /
+               min_not_zero(o->min_bs[DDIR_WRITE], (unsigned long long) o->compress_chunk);
+
+       /*
+        * The working set holds one seed per unique page (generated from
+        * buf_state); dedupe-ed pages are generated from those seeds.
+        */
+       td->num_unique_pages = (o->size * (unsigned long long)o->dedupe_working_set_percentage / 100) / o->min_bs[DDIR_WRITE];
+       td->dedupe_working_set_states = malloc(sizeof(struct frand_state) * td->num_unique_pages);
+       if (!td->dedupe_working_set_states) {
+               log_err("fio: could not allocate dedupe working set\n");
+               return 1;
+       }
+
+       frand_copy(&seed_cursor, &td->buf_state);
+       page = 0;
+       while (page < td->num_unique_pages) {
+               frand_copy(&td->dedupe_working_set_states[page], &seed_cursor);
+               for (step = steps_per_page; step; step--)
+                       __get_next_seed(&seed_cursor);
+               page++;
+       }
+
+       return 0;
+}
diff --git a/dedupe.h b/dedupe.h
new file mode 100644 (file)
index 0000000..d4c4dc3
--- /dev/null
+++ b/dedupe.h
@@ -0,0 +1,6 @@
+#ifndef DEDUPE_H
+#define DEDUPE_H
+
+int init_dedupe_working_set_seeds(struct thread_data *td);
+
+#endif
index 0051a7a035875b3a51493ac527125b413ad1bb7d..ace7af3d5b5bb7368584a1369a0872cbe79c92ee 100644 (file)
@@ -166,14 +166,10 @@ static int get_device_numbers(char *file_name, int *maj, int *min)
                if (S_ISBLK(st.st_mode)) {
                        majdev = major(st.st_rdev);
                        mindev = minor(st.st_rdev);
-               } else if (S_ISCHR(st.st_mode)) {
-                       majdev = major(st.st_rdev);
-                       mindev = minor(st.st_rdev);
-                       if (fio_lookup_raw(st.st_rdev, &majdev, &mindev))
-                               return -1;
-               } else if (S_ISFIFO(st.st_mode))
+               } else if (S_ISCHR(st.st_mode) ||
+                          S_ISFIFO(st.st_mode)) {
                        return -1;
-               else {
+               } else {
                        majdev = major(st.st_dev);
                        mindev = minor(st.st_dev);
                }
diff --git a/engines/cmdprio.c b/engines/cmdprio.c
new file mode 100644 (file)
index 0000000..92b752a
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * IO priority handling helper functions common to the libaio and io_uring
+ * engines.
+ */
+
+#include "cmdprio.h"
+
+/*
+ * Callback handed to str_split_parse(): parse the bssplit string of one data
+ * direction and store the resulting (block size, percentage) pairs in the
+ * struct cmdprio passed through cb_arg.
+ *
+ * Returns 0 on success (including "nothing to store"), 1 on parse or
+ * allocation failure.
+ */
+static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
+                                   enum fio_ddir ddir, char *str, bool data)
+{
+       struct cmdprio *cmdprio = cb_arg;
+       struct bssplit *entries;
+       struct split split;
+       unsigned int idx;
+
+       /* cmdprio only covers reads and writes */
+       if (ddir == DDIR_TRIM)
+               return 0;
+
+       memset(&split, 0, sizeof(split));
+
+       if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX))
+               return 1;
+       if (split.nr == 0)
+               return 0;
+
+       cmdprio->bssplit_nr[ddir] = split.nr;
+       entries = malloc(split.nr * sizeof(struct bssplit));
+       cmdprio->bssplit[ddir] = entries;
+       if (!entries)
+               return 1;
+
+       for (idx = 0; idx < split.nr; idx++) {
+               entries[idx].bs = split.val1[idx];
+
+               /* -1U marks "no percentage given"; anything above 100 is capped */
+               if (split.val2[idx] == -1U)
+                       entries[idx].perc = 0;
+               else if (split.val2[idx] > 100)
+                       entries[idx].perc = 100;
+               else
+                       entries[idx].perc = split.val2[idx];
+       }
+
+       return 0;
+}
+
+/*
+ * Parse a cmdprio bssplit option string into cmdprio->bssplit[].
+ *
+ * input is left untouched: the string is duplicated first because it is then
+ * handed to helpers that take a mutable string.
+ *
+ * Returns 0 on success, non-zero if the string cannot be duplicated or
+ * str_split_parse() reports an error.
+ */
+int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
+                             struct cmdprio *cmdprio)
+{
+       char *str, *p;
+       int ret;
+
+       /* p keeps the allocation start; str may be moved by strip_blank_front() */
+       p = str = strdup(input);
+       if (!str) {
+               log_err("fio: could not allocate cmdprio bssplit string\n");
+               return 1;
+       }
+
+       strip_blank_front(&str);
+       strip_blank_end(str);
+
+       ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio,
+                             false);
+
+       free(p);
+       return ret;
+}
+
+/*
+ * Return the percentage of I/Os that should use the cmdprio priority for
+ * this io_u: the per-direction percentage in PERC mode, or the percentage of
+ * the bssplit entry whose block size equals io_u->buflen in BSSPLIT mode
+ * (0 if no entry matches).
+ */
+static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
+{
+       const enum fio_ddir ddir = io_u->ddir;
+       unsigned int idx;
+
+       if (cmdprio->mode == CMDPRIO_MODE_PERC)
+               return cmdprio->options->percentage[ddir];
+
+       if (cmdprio->mode != CMDPRIO_MODE_BSSPLIT) {
+               /*
+                * An I/O engine should never call this function if cmdprio
+                * is not in use.
+                */
+               assert(0);
+               return 0;
+       }
+
+       for (idx = 0; idx < cmdprio->bssplit_nr[ddir]; idx++) {
+               const struct bssplit *entry = &cmdprio->bssplit[ddir][idx];
+
+               if (entry->bs == io_u->buflen)
+                       return entry->perc;
+       }
+
+       return 0;
+}
+
+/**
+ * fio_cmdprio_set_ioprio - Set an io_u ioprio according to cmdprio options
+ *
+ * Looks up the percentage of I/Os that should use the cmdprio priority and,
+ * when it is non-zero, draws a random value in [0, 99]. If the draw is below
+ * that percentage, the io_u gets the priority built from the cmdprio class
+ * and level of its data direction; otherwise its ioprio is left alone.
+ * In both cases IO_U_F_HIGH_PRIO is set when the priority the I/O ends up
+ * with is the higher (numerically lower) of the two candidates.
+ *
+ * Return true if the io_u ioprio was changed and false otherwise.
+ */
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+                           struct io_u *io_u)
+{
+       struct cmdprio_options *opts = cmdprio->options;
+       enum fio_ddir ddir = io_u->ddir;
+       unsigned int prio = ioprio_value(opts->class[ddir], opts->level[ddir]);
+       unsigned int perc = fio_cmdprio_percentage(cmdprio, io_u);
+       bool use_cmdprio;
+
+       /* no random draw is made when the percentage is zero */
+       use_cmdprio = perc && rand_between(&td->prio_state, 0, 99) < perc;
+
+       if (!use_cmdprio) {
+               /*
+                * The I/O runs with the default priority (td->ioprio, which
+                * is 0 when unset). Flag it when that default is set and
+                * outranks the cmdprio value.
+                */
+               if (td->ioprio && td->ioprio < prio)
+                       io_u->flags |= IO_U_F_HIGH_PRIO;
+               return false;
+       }
+
+       io_u->ioprio = prio;
+
+       /*
+        * Flag the I/O when the cmdprio value outranks the default priority,
+        * or when no default priority is set at all.
+        */
+       if (!td->ioprio || prio < td->ioprio)
+               io_u->flags |= IO_U_F_HIGH_PRIO;
+
+       return true;
+}
+
+/*
+ * Parse the bssplit option string into cmdprio. On failure everything that
+ * was set up so far is released through fio_cmdprio_cleanup().
+ *
+ * Returns 0 on success, the parser's non-zero result otherwise.
+ */
+static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
+                                            struct cmdprio *cmdprio)
+{
+       int ret = fio_cmdprio_bssplit_parse(td, cmdprio->options->bssplit_str,
+                                           cmdprio);
+
+       if (ret)
+               fio_cmdprio_cleanup(cmdprio);
+
+       return ret;
+}
+
+/*
+ * Finish cmdprio setup for the mode chosen by fio_cmdprio_init(): parse the
+ * bssplit string when in BSSPLIT mode, then fill in the default priority
+ * class.
+ *
+ * Returns 0 on success, non-zero on failure or unexpected mode.
+ */
+static int fio_cmdprio_parse_and_gen(struct thread_data *td,
+                                    struct cmdprio *cmdprio)
+{
+       /*
+        * Keep our own pointer: a failed bssplit parse runs
+        * fio_cmdprio_cleanup(), which resets cmdprio->options.
+        */
+       struct cmdprio_options *options = cmdprio->options;
+       int ddir, ret = 0;
+
+       if (cmdprio->mode == CMDPRIO_MODE_BSSPLIT) {
+               ret = fio_cmdprio_parse_and_gen_bssplit(td, cmdprio);
+       } else if (cmdprio->mode != CMDPRIO_MODE_PERC) {
+               assert(0);
+               return 1;
+       }
+
+       /*
+        * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
+        * is not set, default to RT priority class.
+        */
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+               if (!options->percentage[ddir] && !cmdprio->bssplit_nr[ddir])
+                       continue;
+               if (options->class[ddir])
+                       continue;
+               options->class[ddir] = IOPRIO_CLASS_RT;
+       }
+
+       return ret;
+}
+
+/*
+ * Release the per-direction bssplit tables and detach cmdprio from its
+ * options. Safe to call on a cmdprio whose tables were never allocated,
+ * since free(NULL) is a no-op.
+ */
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio)
+{
+       int ddir = 0;
+
+       while (ddir < CMDPRIO_RWDIR_CNT) {
+               cmdprio->bssplit_nr[ddir] = 0;
+               free(cmdprio->bssplit[ddir]);
+               cmdprio->bssplit[ddir] = NULL;
+               ddir++;
+       }
+
+       /*
+        * options points to a cmdprio_options struct that is part of td->eo.
+        * td->eo itself will be freed by free_ioengine(), so only the
+        * reference is dropped here.
+        */
+       cmdprio->options = NULL;
+}
+
+/*
+ * Attach options to cmdprio and select the cmdprio mode from them.
+ *
+ * A non-empty bssplit string selects BSSPLIT mode, a non-zero percentage for
+ * any direction selects PERC mode, and neither selects NONE. Setting both is
+ * rejected.
+ *
+ * Returns 0 on success (including when cmdprio is not used), non-zero on
+ * conflicting options or when parsing fails.
+ */
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+                    struct cmdprio_options *options)
+{
+       bool want_bssplit, want_perc = false;
+       int ddir;
+
+       cmdprio->options = options;
+
+       want_bssplit = options->bssplit_str && strlen(options->bssplit_str);
+
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+               if (options->percentage[ddir])
+                       want_perc = true;
+       }
+
+       /* The two ways of choosing a percentage cannot be combined */
+       if (want_perc && want_bssplit) {
+               log_err("%s: cmdprio_percentage and cmdprio_bssplit options "
+                       "are mutually exclusive\n",
+                       td->o.name);
+               return 1;
+       }
+
+       /* Nothing left to do if cmdprio is not used */
+       if (!want_bssplit && !want_perc) {
+               cmdprio->mode = CMDPRIO_MODE_NONE;
+               return 0;
+       }
+
+       cmdprio->mode = want_bssplit ? CMDPRIO_MODE_BSSPLIT : CMDPRIO_MODE_PERC;
+
+       return fio_cmdprio_parse_and_gen(td, cmdprio);
+}
diff --git a/engines/cmdprio.h b/engines/cmdprio.h
new file mode 100644 (file)
index 0000000..0c7bd6c
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * IO priority handling declarations and helper functions common to the
+ * libaio and io_uring engines.
+ */
+
+#ifndef FIO_CMDPRIO_H
+#define FIO_CMDPRIO_H
+
+#include "../fio.h"
+
+/* reads and writes only, no trim */
+#define CMDPRIO_RWDIR_CNT 2
+
+/* Values of struct cmdprio's mode field, chosen by fio_cmdprio_init() */
+enum {
+       /* neither a percentage nor a bssplit string is set: cmdprio unused */
+       CMDPRIO_MODE_NONE,
+       /* a non-zero percentage is set for at least one data direction */
+       CMDPRIO_MODE_PERC,
+       /* a non-empty bssplit string is set */
+       CMDPRIO_MODE_BSSPLIT,
+};
+
+/*
+ * cmdprio settings handed to fio_cmdprio_init(). The arrays are indexed by
+ * data direction (reads and writes only).
+ */
+struct cmdprio_options {
+       /* percentage of I/Os that should use the cmdprio priority */
+       unsigned int percentage[CMDPRIO_RWDIR_CNT];
+       /* priority class passed to ioprio_value(); 0 is treated as unset */
+       unsigned int class[CMDPRIO_RWDIR_CNT];
+       /* priority level passed to ioprio_value() */
+       unsigned int level[CMDPRIO_RWDIR_CNT];
+       /* unparsed bssplit option string; NULL or empty when not used */
+       char *bssplit_str;
+};
+
+/* cmdprio state set up by fio_cmdprio_init() */
+struct cmdprio {
+       /* borrowed pointer to the options; reset to NULL by fio_cmdprio_cleanup() */
+       struct cmdprio_options *options;
+       /* number of entries in bssplit[], per data direction */
+       unsigned int bssplit_nr[CMDPRIO_RWDIR_CNT];
+       /* malloc()'ed (block size, percentage) tables, per data direction */
+       struct bssplit *bssplit[CMDPRIO_RWDIR_CNT];
+       /* one of the CMDPRIO_MODE_* values */
+       unsigned int mode;
+};
+
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+                           struct io_u *io_u);
+
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio);
+
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+                    struct cmdprio_options *options);
+
+#endif
index 0343b101e92c2899db26c7dddecef696ae70e7b0..664e8b13c727107fd2659f8a9795c1fabcc4d5ae 100644 (file)
@@ -49,19 +49,19 @@ struct daos_fio_options {
 static struct fio_option options[] = {
        {
                .name           = "pool",
-               .lname          = "pool uuid",
+               .lname          = "pool uuid or label",
                .type           = FIO_OPT_STR_STORE,
                .off1           = offsetof(struct daos_fio_options, pool),
-               .help           = "DAOS pool uuid",
+               .help           = "DAOS pool uuid or label",
                .category       = FIO_OPT_C_ENGINE,
                .group          = FIO_OPT_G_DFS,
        },
        {
                .name           = "cont",
-               .lname          = "container uuid",
+               .lname          = "container uuid or label",
                .type           = FIO_OPT_STR_STORE,
                .off1           = offsetof(struct daos_fio_options, cont),
-               .help           = "DAOS container uuid",
+               .help           = "DAOS container uuid or label",
                .category       = FIO_OPT_C_ENGINE,
                .group          = FIO_OPT_G_DFS,
        },
@@ -103,7 +103,6 @@ static struct fio_option options[] = {
 static int daos_fio_global_init(struct thread_data *td)
 {
        struct daos_fio_options *eo = td->eo;
-       uuid_t                  pool_uuid, co_uuid;
        daos_pool_info_t        pool_info;
        daos_cont_info_t        co_info;
        int                     rc = 0;
@@ -124,6 +123,10 @@ static int daos_fio_global_init(struct thread_data *td)
                return rc;
        }
 
+#if !defined(DAOS_API_VERSION_MAJOR) || \
+    (DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
+       uuid_t pool_uuid, co_uuid;
+
        rc = uuid_parse(eo->pool, pool_uuid);
        if (rc) {
                log_err("Failed to parse 'Pool uuid': %s\n", eo->pool);
@@ -137,6 +140,7 @@ static int daos_fio_global_init(struct thread_data *td)
                td_verror(td, EINVAL, "uuid_parse(eo->cont)");
                return EINVAL;
        }
+#endif
 
        /* Connect to the DAOS pool */
 #if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
@@ -152,9 +156,12 @@ static int daos_fio_global_init(struct thread_data *td)
        rc = daos_pool_connect(pool_uuid, NULL, svcl, DAOS_PC_RW,
                        &poh, &pool_info, NULL);
        d_rank_list_free(svcl);
-#else
+#elif (DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
        rc = daos_pool_connect(pool_uuid, NULL, DAOS_PC_RW, &poh, &pool_info,
                               NULL);
+#else
+       rc = daos_pool_connect(eo->pool, NULL, DAOS_PC_RW, &poh, &pool_info,
+                              NULL);
 #endif
        if (rc) {
                log_err("Failed to connect to pool %d\n", rc);
@@ -163,7 +170,12 @@ static int daos_fio_global_init(struct thread_data *td)
        }
 
        /* Open the DAOS container */
+#if !defined(DAOS_API_VERSION_MAJOR) || \
+    (DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
        rc = daos_cont_open(poh, co_uuid, DAOS_COO_RW, &coh, &co_info, NULL);
+#else
+       rc = daos_cont_open(poh, eo->cont, DAOS_COO_RW, &coh, &co_info, NULL);
+#endif
        if (rc) {
                log_err("Failed to open container: %d\n", rc);
                td_verror(td, rc, "daos_cont_open");
diff --git a/engines/exec.c b/engines/exec.c
new file mode 100644 (file)
index 0000000..ab3639c
--- /dev/null
@@ -0,0 +1,394 @@
+/*
+ * Exec engine
+ *
+ * Doesn't transfer any data, merely runs 3rd party tools
+ *
+ */
+#include "../fio.h"
+#include "../optgroup.h"
+#include <signal.h>
+
+/* Options and run-time state of the exec engine, reached through td->eo */
+struct exec_options {
+       /* NOTE(review): unused in this file; presumably reserved for fio's option handling - confirm */
+       void *pad;
+       /* "program" option: program to execute (mandatory) */
+       char *program;
+       /* "arguments" option: arguments to pass, may be NULL */
+       char *arguments;
+       /* "grace_time" option: seconds slept after sending SIGTERM */
+       int grace_time;
+       /* "std_redirect" option: redirect stdout & stderr to files */
+       unsigned int std_redirect;
+       /* pid of the forked program; -1 until it has been started */
+       pid_t pid;
+};
+
+/*
+ * Option table of the exec engine. Each entry stores its value at the given
+ * offset inside struct exec_options; the table ends with a NULL-named entry.
+ */
+static struct fio_option options[] = {
+       {
+               .name = "program",
+               .lname = "Program",
+               .type = FIO_OPT_STR_STORE,
+               .off1 = offsetof(struct exec_options, program),
+               .help = "Program to execute",
+               .category = FIO_OPT_C_ENGINE,
+               .group = FIO_OPT_G_INVALID,
+       },
+       {
+               .name = "arguments",
+               .lname = "Arguments",
+               .type = FIO_OPT_STR_STORE,
+               .off1 = offsetof(struct exec_options, arguments),
+               .help = "Arguments to pass",
+               .category = FIO_OPT_C_ENGINE,
+               .group = FIO_OPT_G_INVALID,
+       },
+       {
+               .name = "grace_time",
+               .lname = "Grace time",
+               .type = FIO_OPT_INT,
+               .minval = 0,
+               .def = "1",
+               .off1 = offsetof(struct exec_options, grace_time),
+               .help = "Grace time before sending a SIGKILL",
+               .category = FIO_OPT_C_ENGINE,
+               .group = FIO_OPT_G_INVALID,
+       },
+       {
+               .name = "std_redirect",
+               .lname = "Std redirect",
+               .type = FIO_OPT_BOOL,
+               .def = "1",
+               .off1 = offsetof(struct exec_options, std_redirect),
+               .help = "Redirect stdout & stderr to files",
+               .category = FIO_OPT_C_ENGINE,
+               .group = FIO_OPT_G_INVALID,
+       },
+       {
+               .name = NULL,
+       },
+};
+
+/*
+ * Build a copy of orig in which every occurrence of rep is replaced by with.
+ *
+ * orig itself is never modified. A NULL with is treated as the empty string.
+ *
+ * Returns:
+ *  - orig itself (nothing allocated) when orig or rep is NULL or rep is empty;
+ *  - NULL when memory cannot be allocated;
+ *  - otherwise a newly malloc()'ed string, even when rep does not occur in
+ *    orig, which the caller must free().
+ */
+char *str_replace(char *orig, const char *rep, const char *with)
+{
+       char *result, *out;
+       const char *scan, *hit;
+       size_t len_rep, len_with, count = 0;
+
+       /* sanity checks and initialization */
+       if (!orig || !rep)
+               return orig;
+
+       len_rep = strlen(rep);
+       if (len_rep == 0)
+               return orig;
+
+       if (!with)
+               with = "";
+       len_with = strlen(with);
+
+       /* first pass: count the non-overlapping occurrences of rep */
+       for (scan = orig; (hit = strstr(scan, rep)); scan = hit + len_rep)
+               count++;
+
+       /* len_rep * count never exceeds strlen(orig), so this cannot wrap */
+       result = malloc(strlen(orig) - len_rep * count + len_with * count + 1);
+       if (!result)
+               return NULL;
+
+       /* second pass: copy the text in front of each hit, then the replacement */
+       out = result;
+       scan = orig;
+       while ((hit = strstr(scan, rep))) {
+               size_t len_front = hit - scan;
+
+               memcpy(out, scan, len_front);
+               out += len_front;
+               memcpy(out, with, len_with);
+               out += len_with;
+               scan = hit + len_rep;
+       }
+       /* whatever follows the last hit, including the terminating NUL */
+       strcpy(out, scan);
+
+       return result;
+}
+
+/*
+ * Expand the placeholders of an exec engine argument string:
+ *   %r is replaced by the runtime in seconds (o->timeout / 1000000),
+ *   %n is replaced by the name of the running job (o->name).
+ *
+ * arguments is never modified nor freed.
+ *
+ * Returns a newly allocated string the caller must free(), or NULL when
+ * arguments is NULL or memory cannot be allocated.
+ */
+char *expand_variables(struct thread_options *o, char *arguments)
+{
+       char str[16];
+       char *expanded_runtime, *expanded_name;
+
+       if (!arguments)
+               return NULL;
+
+       snprintf(str, sizeof(str), "%lld", (long long) (o->timeout / 1000000));
+
+       /* %r is replaced by the runtime in seconds */
+       expanded_runtime = str_replace(arguments, "%r", str);
+
+       /*
+        * With a non-NULL input and a non-empty pattern str_replace() always
+        * allocates, so getting NULL or the input pointer back means the
+        * allocation failed. Freeing the result here would free the caller's
+        * string.
+        */
+       if (!expanded_runtime || expanded_runtime == arguments)
+               return NULL;
+
+       /* %n is replaced by the name of the running job */
+       expanded_name = str_replace(expanded_runtime, "%n", o->name);
+
+       /* same failure signal: do not hand back the buffer freed below */
+       if (expanded_name == expanded_runtime)
+               expanded_name = NULL;
+
+       free(expanded_runtime);
+       return expanded_name;
+}
+
+/*
+ * Fork a child process and exec eo->program with the expanded eo->arguments.
+ *
+ * When eo->std_redirect is set, the child's stdout and stderr are redirected
+ * to "<job name>.stdout" and "<job name>.stderr".
+ *
+ * Returns 0 in the parent once the child has been forked (its pid is stored
+ * in eo->pid) and -1 on failure. The child never returns from this function:
+ * it either becomes the target program or terminates with _exit(), so that a
+ * failed exec cannot leave a second copy of fio running the job.
+ */
+static int exec_background(struct thread_options *o, struct exec_options *eo)
+{
+       char *outfilename = NULL, *errfilename = NULL;
+       int outfd = -1, errfd = -1;
+       int ret = -1;
+       pid_t pid;
+       char *expanded_arguments = NULL;
+       /* For the arguments splitting */
+       char **arguments_array = NULL;
+       char *p;
+       char *exec_cmd = NULL;
+       size_t arguments_nb_items = 0, q;
+
+       if (asprintf(&outfilename, "%s.stdout", o->name) < 0)
+               return -1;
+
+       if (asprintf(&errfilename, "%s.stderr", o->name) < 0) {
+               free(outfilename);
+               return -1;
+       }
+
+       /* If we have variables in the arguments, let's expand them */
+       expanded_arguments = expand_variables(o, eo->arguments);
+       if (eo->arguments && !expanded_arguments) {
+               /* do not run the program with its arguments silently dropped */
+               log_err("fio: cannot expand arguments of %s\n", eo->program);
+               goto out;
+       }
+
+       if (eo->std_redirect) {
+               log_info("%s : Saving output of %s %s : stdout=%s stderr=%s\n",
+                        o->name, eo->program,
+                        expanded_arguments ? expanded_arguments : "",
+                        outfilename, errfilename);
+
+               /* Creating the stderr & stdout output files */
+               outfd = open(outfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+               if (outfd < 0) {
+                       log_err("fio: cannot open output file %s : %s\n",
+                               outfilename, strerror(errno));
+                       goto out;
+               }
+
+               errfd = open(errfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+               if (errfd < 0) {
+                       log_err("fio: cannot open output file %s : %s\n",
+                               errfilename, strerror(errno));
+                       goto out;
+               }
+       } else {
+               log_info("%s : Running %s %s\n",
+                        o->name, eo->program,
+                        expanded_arguments ? expanded_arguments : "");
+       }
+
+       pid = fork();
+
+       /* If the fork failed */
+       if (pid < 0) {
+               log_err("fio: forking failed %s \n", strerror(errno));
+               goto out;
+       }
+
+       /* We are on the control thread (parent side of the fork) */
+       if (pid > 0) {
+               eo->pid = pid;
+               ret = 0;
+               /* the output files are only needed by the child */
+               goto out;
+       }
+
+       /*
+        * We are in the worker (child side of the fork). From here on every
+        * exit path is _exit(): returning would resume fio's job in the child.
+        */
+       if (eo->std_redirect) {
+               /* replace stdout and stderr by the output files we created */
+               if (dup2(outfd, 1) < 0 || dup2(errfd, 2) < 0)
+                       _exit(1);
+               close(outfd);
+               close(errfd);
+       }
+
+       /*
+        * Let's split the command line into a null terminated array to
+        * be passed to the exec'd program.
+        * But don't asprintf expanded_arguments if NULL as it would be
+        * converted to a '(null)' argument, while we want no arguments
+        * at all.
+        */
+       if (expanded_arguments != NULL) {
+               if (asprintf(&exec_cmd, "%s %s", eo->program, expanded_arguments) < 0)
+                       _exit(1);
+       } else {
+               if (asprintf(&exec_cmd, "%s", eo->program) < 0)
+                       _exit(1);
+       }
+
+       /*
+        * Let's build an argv array based on the program name and
+        * arguments, splitting on spaces
+        */
+       p = exec_cmd;
+       for (;;) {
+               char **grown;
+               char *item;
+
+               p += strspn(p, " ");
+               q = strcspn(p, " ");
+               if (!q)
+                       break;
+
+               /* +2: room for this item and for the terminating NULL */
+               grown = realloc(arguments_array,
+                               (arguments_nb_items + 2) * sizeof(char *));
+               item = malloc(q + 1);
+               if (!grown || !item)
+                       _exit(1);
+               arguments_array = grown;
+
+               memcpy(item, p, q);
+               item[q] = '\0';
+               arguments_array[arguments_nb_items++] = item;
+               arguments_array[arguments_nb_items] = NULL;
+               p += q;
+       }
+
+       /* An empty or blank program name leaves nothing to execute */
+       if (!arguments_nb_items) {
+               fprintf(stderr, "fio: exec: no program to execute\n");
+               _exit(127);
+       }
+
+       /*
+        * Replace the fio program from the child fork by the target
+        * program
+        */
+       execvp(arguments_array[0], arguments_array);
+
+       /* execvp() only returns on failure */
+       fprintf(stderr, "fio: exec: cannot execute %s : %s\n",
+               arguments_array[0], strerror(errno));
+       _exit(127);
+
+out:
+       /* parent-side cleanup, shared by the success and the error paths */
+       if (outfd >= 0)
+               close(outfd);
+       if (errfd >= 0)
+               close(errfd);
+       free(outfilename);
+       free(errfilename);
+       free(expanded_arguments);
+       return ret;
+}
+
+/*
+ * Engine queue hook. The first call starts the program in the background;
+ * every later call sleeps for o->thinktime and, once the time elapsed since
+ * td->start exceeds o->timeout, sends SIGTERM to the program and sleeps for
+ * the grace time. Always reports the io_u as completed.
+ */
+static enum fio_q_status
+fio_exec_queue(struct thread_data *td, struct io_u fio_unused * io_u)
+{
+       struct exec_options *eo = td->eo;
+       struct thread_options *o = &td->o;
+
+       /* Let's execute the program the first time we get queued */
+       if (eo->pid == -1) {
+               exec_background(o, eo);
+               return FIO_Q_COMPLETED;
+       }
+
+       /* The program runs in the background: poll at a regular interval */
+       usleep(o->thinktime);
+       if (utime_since_now(&td->start) <= o->timeout)
+               return FIO_Q_COMPLETED;
+
+       /*
+        * Time is over: ask the child to stop and give it grace_time
+        * (1 sec by default) to do so
+        */
+       kill(eo->pid, SIGTERM);
+       sleep(eo->grace_time);
+
+       return FIO_Q_COMPLETED;
+}
+
+/*
+ * Engine init hook: check the options and set up the job's pacing.
+ *
+ * Marks the program as not started (pid = -1), requires the "program"
+ * option, forces a 50ms thinktime after every completed block and limits
+ * the job to a single file.
+ *
+ * Returns 0 on success, 1 if no program was given.
+ */
+static int fio_exec_init(struct thread_data *td)
+{
+       struct thread_options *o = &td->o;
+       struct exec_options *eo = td->eo;
+       int td_previous_state;
+
+       /* fio_exec_queue() starts the program when it sees this value */
+       eo->pid = -1;
+
+       if (!eo->program) {
+               td_vmsg(td, EINVAL,
+                       "no program is defined, it is mandatory to define one",
+                       "exec");
+               return 1;
+       }
+
+       /* "arguments" is optional: never hand a NULL pointer to %s */
+       log_info("%s : program=%s, arguments=%s\n",
+                td->o.name, eo->program, eo->arguments ? eo->arguments : "");
+
+       /* Saving the current thread state */
+       td_previous_state = td->runstate;
+
+       /* Report that the engine is being set up while the options are changed */
+       td_set_runstate(td, TD_SETTING_UP);
+
+       /*
+        * Think after every completed block, without spinning.
+        * fio_exec_queue() also sleeps for o->thinktime between two checks
+        * of the running program.
+        */
+       o->thinktime_blocks = 1;
+       o->thinktime_blocks_type = THINKTIME_BLOCKS_TYPE_COMPLETE;
+       o->thinktime_spin = 0;
+       /* 50ms pause when waiting for the program to complete */
+       o->thinktime = 50000;
+
+       o->nr_files = o->open_files = 1;
+
+       /* Let's restore the previous state. */
+       td_set_runstate(td, td_previous_state);
+       return 0;
+}
+
+/* Engine cleanup hook: make sure a started program is terminated */
+static void fio_exec_cleanup(struct thread_data *td)
+{
+       struct exec_options *eo = td->eo;
+
+       /* pid is only positive once a child has been forked */
+       if (eo->pid <= 0)
+               return;
+
+       /* Send a sigkill to ensure the job is well terminated */
+       kill(eo->pid, SIGKILL);
+}
+
+/* Engine open_file hook: does nothing and always reports success */
+static int
+fio_exec_open(struct thread_data fio_unused * td,
+             struct fio_file fio_unused * f)
+{
+       return 0;
+}
+
+/* Engine description of "exec", registered by fio_exec_register() */
+static struct ioengine_ops ioengine = {
+       .name = "exec",
+       .version = FIO_IOOPS_VERSION,
+       .queue = fio_exec_queue,
+       .init = fio_exec_init,
+       .cleanup = fio_exec_cleanup,
+       .open_file = fio_exec_open,
+       .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+       .options = options,
+       .option_struct_size = sizeof(struct exec_options),
+};
+
+/* Register the exec engine (function is tagged fio_init) */
+static void fio_init fio_exec_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+/* Unregister the exec engine (function is tagged fio_exit) */
+static void fio_exit fio_exec_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
index 16c64928162654ad76c6697958b846eb5bb0d985..4bb13c348c1a4113a1425dbf620dc5d74a8264be 100644 (file)
@@ -49,7 +49,7 @@ static int open_file(struct thread_data *td, struct fio_file *f)
                uint64_t nsec;
 
                nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
        }
 
        return 0;
index 64c586391037cf6b140f59bae351f8f4b08483ce..e882ccf0176726be863a605c81afc0f5fbd8e999 100644 (file)
@@ -51,7 +51,7 @@ static int delete_file(struct thread_data *td, struct fio_file *f)
                uint64_t nsec;
 
                nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
        }
 
        return 0;
index 405f028d11cd383aa48d4e6c3840a592a1074b4b..003112474b39f61c94937447a8518d8b8bc25027 100644 (file)
@@ -125,7 +125,7 @@ static int stat_file(struct thread_data *td, struct fio_file *f)
                uint64_t nsec;
 
                nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
        }
 
        return 0;
index 7a61b132b92bd72b94ebdc9915e311235b53fc56..35c44871da636b79f77545c078ff5ca3da1a8d8f 100644 (file)
@@ -297,10 +297,9 @@ static int _curl_trace(CURL *handle, curl_infotype type,
        switch (type) {
        case CURLINFO_TEXT:
                fprintf(stderr, "== Info: %s", data);
-               /* fall through */
+               fallthrough;
        default:
        case CURLINFO_SSL_DATA_OUT:
-               /* fall through */
        case CURLINFO_SSL_DATA_IN:
                return 0;
 
index 9c091e37e60b2cc1bd9a63ae37228ac51fc2003a..00ae34823f86ec32907700a8118c30a5d3e97ae1 100644 (file)
@@ -23,6 +23,7 @@
 
 #include "../lib/types.h"
 #include "../os/linux/io_uring.h"
+#include "cmdprio.h"
 
 struct io_sq_ring {
        unsigned *head;
@@ -64,17 +65,17 @@ struct ioring_data {
        int queued;
        int cq_ring_off;
        unsigned iodepth;
-       bool ioprio_class_set;
-       bool ioprio_set;
        int prepped;
 
        struct ioring_mmap mmap[3];
+
+       struct cmdprio cmdprio;
 };
 
 struct ioring_options {
-       void *pad;
+       struct thread_data *td;
        unsigned int hipri;
-       unsigned int cmdprio_percentage;
+       struct cmdprio_options cmdprio_options;
        unsigned int fixedbufs;
        unsigned int registerfiles;
        unsigned int sqpoll_thread;
@@ -120,13 +121,56 @@ static struct fio_option options[] = {
                .name   = "cmdprio_percentage",
                .lname  = "high priority percentage",
                .type   = FIO_OPT_INT,
-               .off1   = offsetof(struct ioring_options, cmdprio_percentage),
-               .minval = 1,
+               .off1   = offsetof(struct ioring_options,
+                                  cmdprio_options.percentage[DDIR_READ]),
+               .off2   = offsetof(struct ioring_options,
+                                  cmdprio_options.percentage[DDIR_WRITE]),
+               .minval = 0,
                .maxval = 100,
                .help   = "Send high priority I/O this percentage of the time",
                .category = FIO_OPT_C_ENGINE,
                .group  = FIO_OPT_G_IOURING,
        },
+       {
+               .name   = "cmdprio_class",
+               .lname  = "Asynchronous I/O priority class",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct ioring_options,
+                                  cmdprio_options.class[DDIR_READ]),
+               .off2   = offsetof(struct ioring_options,
+                                  cmdprio_options.class[DDIR_WRITE]),
+               .help   = "Set asynchronous IO priority class",
+               .minval = IOPRIO_MIN_PRIO_CLASS + 1,
+               .maxval = IOPRIO_MAX_PRIO_CLASS,
+               .interval = 1,
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
+       {
+               .name   = "cmdprio",
+               .lname  = "Asynchronous I/O priority level",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct ioring_options,
+                                  cmdprio_options.level[DDIR_READ]),
+               .off2   = offsetof(struct ioring_options,
+                                  cmdprio_options.level[DDIR_WRITE]),
+               .help   = "Set asynchronous IO priority level",
+               .minval = IOPRIO_MIN_PRIO,
+               .maxval = IOPRIO_MAX_PRIO,
+               .interval = 1,
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
+       {
+               .name   = "cmdprio_bssplit",
+               .lname  = "Priority percentage block size split",
+               .type   = FIO_OPT_STR_STORE,
+               .off1   = offsetof(struct ioring_options,
+                                  cmdprio_options.bssplit_str),
+               .help   = "Set priority percentages for different block sizes",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_IOURING,
+       },
 #else
        {
                .name   = "cmdprio_percentage",
@@ -134,6 +178,24 @@ static struct fio_option options[] = {
                .type   = FIO_OPT_UNSUPPORTED,
                .help   = "Your platform does not support I/O priority classes",
        },
+       {
+               .name   = "cmdprio_class",
+               .lname  = "Asynchronous I/O priority class",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
+       {
+               .name   = "cmdprio",
+               .lname  = "Asynchronous I/O priority level",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
+       {
+               .name   = "cmdprio_bssplit",
+               .lname  = "Priority percentage block size split",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
 #endif
        {
                .name   = "fixedbufs",
@@ -234,6 +296,7 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
                sqe->flags = IOSQE_FIXED_FILE;
        } else {
                sqe->fd = f->fd;
+               sqe->flags = 0;
        }
 
        if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
@@ -261,16 +324,24 @@ static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
                                sqe->len = 1;
                        }
                }
+               sqe->rw_flags = 0;
                if (!td->o.odirect && o->uncached)
-                       sqe->rw_flags = RWF_UNCACHED;
+                       sqe->rw_flags |= RWF_UNCACHED;
                if (o->nowait)
                        sqe->rw_flags |= RWF_NOWAIT;
-               if (ld->ioprio_class_set)
-                       sqe->ioprio = td->o.ioprio_class << 13;
-               if (ld->ioprio_set)
-                       sqe->ioprio |= td->o.ioprio;
+
+               /*
+                * Since io_uring can have a submission context (sqthread_poll)
+                * that is different from the process context, we cannot rely on
+                * the IO priority set by ioprio_set() (option prio/prioclass)
+                * to be inherited.
+                * td->ioprio will have the value of the "default prio", so set
+                * this unconditionally. This value might get overridden by
+                * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or
+                * cmdprio_bssplit is used.
+                */
+               sqe->ioprio = td->ioprio;
                sqe->off = io_u->offset;
-               sqe->rw_flags = 0;
        } else if (ddir_sync(io_u->ddir)) {
                sqe->ioprio = 0;
                if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
@@ -376,15 +447,14 @@ static int fio_ioring_getevents(struct thread_data *td, unsigned int min,
        return r < 0 ? r : events;
 }
 
-static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u)
+/*
+ * Apply a per-command priority to the SQE of this io_u: when
+ * fio_cmdprio_set_ioprio() returns non-zero, io_u->ioprio is copied into
+ * the ioprio field of the SQE at index io_u->index. Otherwise the SQE is
+ * left untouched.
+ */
+static inline void fio_ioring_cmdprio_prep(struct thread_data *td,
+                                          struct io_u *io_u)
 {
-       struct ioring_options *o = td->eo;
        struct ioring_data *ld = td->io_ops_data;
-       if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) {
-               ld->sqes[io_u->index].ioprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT;
-               io_u->flags |= IO_U_F_PRIORITY;
-       }
-       return;
+       struct cmdprio *cmdprio = &ld->cmdprio;
+
+       if (fio_cmdprio_set_ioprio(td, cmdprio, io_u))
+               ld->sqes[io_u->index].ioprio = io_u->ioprio;
 }
 
 static enum fio_q_status fio_ioring_queue(struct thread_data *td,
@@ -392,7 +462,6 @@ static enum fio_q_status fio_ioring_queue(struct thread_data *td,
 {
        struct ioring_data *ld = td->io_ops_data;
        struct io_sq_ring *ring = &ld->sq_ring;
-       struct ioring_options *o = td->eo;
        unsigned tail, next_tail;
 
        fio_ro_check(td, io_u);
@@ -415,8 +484,9 @@ static enum fio_q_status fio_ioring_queue(struct thread_data *td,
        if (next_tail == atomic_load_acquire(ring->head))
                return FIO_Q_BUSY;
 
-       if (o->cmdprio_percentage)
-               fio_ioring_prio_prep(td, io_u);
+       if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+               fio_ioring_cmdprio_prep(td, io_u);
+
        ring->array[tail & ld->sq_ring_mask] = io_u->index;
        atomic_store_release(ring->tail, next_tail);
 
@@ -520,6 +590,7 @@ static void fio_ioring_cleanup(struct thread_data *td)
                if (!(td->flags & TD_F_CHILD))
                        fio_ioring_unmap(ld);
 
+               fio_cmdprio_cleanup(&ld->cmdprio);
                free(ld->io_u_index);
                free(ld->iovecs);
                free(ld->fds);
@@ -621,6 +692,13 @@ static int fio_ioring_queue_init(struct thread_data *td)
                }
        }
 
+       /*
+        * Clamp CQ ring size at our SQ ring size, we don't need more entries
+        * than that.
+        */
+       p.flags |= IORING_SETUP_CQSIZE;
+       p.cq_entries = depth;
+
        ret = syscall(__NR_io_uring_setup, depth, &p);
        if (ret < 0)
                return ret;
@@ -726,7 +804,7 @@ static int fio_ioring_init(struct thread_data *td)
 {
        struct ioring_options *o = td->eo;
        struct ioring_data *ld;
-       struct thread_options *to = &td->o;
+       int ret;
 
        /* sqthread submission requires registered files */
        if (o->sqpoll_thread)
@@ -750,22 +828,12 @@ static int fio_ioring_init(struct thread_data *td)
 
        td->io_ops_data = ld;
 
-       /*
-        * Check for option conflicts
-        */
-       if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) &&
-                       o->cmdprio_percentage != 0) {
-               log_err("%s: cmdprio_percentage option and mutually exclusive "
-                               "prio or prioclass option is set, exiting\n", to->name);
-               td_verror(td, EINVAL, "fio_io_uring_init");
+       ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
+       if (ret) {
+               td_verror(td, EINVAL, "fio_ioring_init");
                return 1;
        }
 
-       if (fio_option_is_set(&td->o, ioprio_class))
-               ld->ioprio_class_set = true;
-       if (fio_option_is_set(&td->o, ioprio))
-               ld->ioprio_set = true;
-
        return 0;
 }
 
index b909b79e9c7169f7898e2aa32be37f895cafe4a0..9c278d060b218617d0ab89aa5285a1e010507661 100644 (file)
@@ -15,6 +15,7 @@
 #include "../lib/pow2.h"
 #include "../optgroup.h"
 #include "../lib/memalign.h"
+#include "cmdprio.h"
 
 /* Should be defined in newest aio_abi.h */
 #ifndef IOCB_FLAG_IOPRIO
@@ -50,12 +51,14 @@ struct libaio_data {
        unsigned int queued;
        unsigned int head;
        unsigned int tail;
+
+       struct cmdprio cmdprio;
 };
 
+/*
+ * Engine-specific options of the libaio engine. The options[] table
+ * addresses these fields through offsetof(struct libaio_options, ...).
+ */
 struct libaio_options {
-       void *pad;
+       struct thread_data *td;
        unsigned int userspace_reap;
-       unsigned int cmdprio_percentage;
+       struct cmdprio_options cmdprio_options;
        unsigned int nowait;
 };
 
@@ -74,13 +77,56 @@ static struct fio_option options[] = {
                .name   = "cmdprio_percentage",
                .lname  = "high priority percentage",
                .type   = FIO_OPT_INT,
-               .off1   = offsetof(struct libaio_options, cmdprio_percentage),
-               .minval = 1,
+               .off1   = offsetof(struct libaio_options,
+                                  cmdprio_options.percentage[DDIR_READ]),
+               .off2   = offsetof(struct libaio_options,
+                                  cmdprio_options.percentage[DDIR_WRITE]),
+               .minval = 0,
                .maxval = 100,
                .help   = "Send high priority I/O this percentage of the time",
                .category = FIO_OPT_C_ENGINE,
                .group  = FIO_OPT_G_LIBAIO,
        },
+       {
+               .name   = "cmdprio_class",
+               .lname  = "Asynchronous I/O priority class",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct libaio_options,
+                                  cmdprio_options.class[DDIR_READ]),
+               .off2   = offsetof(struct libaio_options,
+                                  cmdprio_options.class[DDIR_WRITE]),
+               .help   = "Set asynchronous IO priority class",
+               .minval = IOPRIO_MIN_PRIO_CLASS + 1,
+               .maxval = IOPRIO_MAX_PRIO_CLASS,
+               .interval = 1,
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBAIO,
+       },
+       {
+               .name   = "cmdprio",
+               .lname  = "Asynchronous I/O priority level",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct libaio_options,
+                                  cmdprio_options.level[DDIR_READ]),
+               .off2   = offsetof(struct libaio_options,
+                                  cmdprio_options.level[DDIR_WRITE]),
+               .help   = "Set asynchronous IO priority level",
+               .minval = IOPRIO_MIN_PRIO,
+               .maxval = IOPRIO_MAX_PRIO,
+               .interval = 1,
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBAIO,
+       },
+       {
+               .name   = "cmdprio_bssplit",
+               .lname  = "Priority percentage block size split",
+               .type   = FIO_OPT_STR_STORE,
+               .off1   = offsetof(struct libaio_options,
+                                  cmdprio_options.bssplit_str),
+               .help   = "Set priority percentages for different block sizes",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBAIO,
+       },
 #else
        {
                .name   = "cmdprio_percentage",
@@ -88,6 +134,24 @@ static struct fio_option options[] = {
                .type   = FIO_OPT_UNSUPPORTED,
                .help   = "Your platform does not support I/O priority classes",
        },
+       {
+               .name   = "cmdprio_class",
+               .lname  = "Asynchronous I/O priority class",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
+       {
+               .name   = "cmdprio",
+               .lname  = "Asynchronous I/O priority level",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
+       {
+               .name   = "cmdprio_bssplit",
+               .lname  = "Priority percentage block size split",
+               .type   = FIO_OPT_UNSUPPORTED,
+               .help   = "Your platform does not support I/O priority classes",
+       },
 #endif
        {
                .name   = "nowait",
@@ -132,15 +196,16 @@ static int fio_libaio_prep(struct thread_data *td, struct io_u *io_u)
        return 0;
 }
 
-static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u)
+/*
+ * Apply a per-command priority to the iocb of this io_u: when
+ * fio_cmdprio_set_ioprio() returns non-zero, io_u->ioprio is copied into
+ * iocb.aio_reqprio and IOCB_FLAG_IOPRIO is set in the iocb flags.
+ * Otherwise the iocb is left untouched.
+ */
+static inline void fio_libaio_cmdprio_prep(struct thread_data *td,
+                                          struct io_u *io_u)
 {
-       struct libaio_options *o = td->eo;
-       if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) {
-               io_u->iocb.aio_reqprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT;
+       struct libaio_data *ld = td->io_ops_data;
+       struct cmdprio *cmdprio = &ld->cmdprio;
+
+       if (fio_cmdprio_set_ioprio(td, cmdprio, io_u)) {
+               io_u->iocb.aio_reqprio = io_u->ioprio;
                io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO;
-               io_u->flags |= IO_U_F_PRIORITY;
        }
-       return;
 }
 
 static struct io_u *fio_libaio_event(struct thread_data *td, int event)
@@ -246,7 +311,6 @@ static enum fio_q_status fio_libaio_queue(struct thread_data *td,
                                          struct io_u *io_u)
 {
        struct libaio_data *ld = td->io_ops_data;
-       struct libaio_options *o = td->eo;
 
        fio_ro_check(td, io_u);
 
@@ -277,8 +341,8 @@ static enum fio_q_status fio_libaio_queue(struct thread_data *td,
                return FIO_Q_COMPLETED;
        }
 
-       if (o->cmdprio_percentage)
-               fio_libaio_prio_prep(td, io_u);
+       if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+               fio_libaio_cmdprio_prep(td, io_u);
 
        ld->iocbs[ld->head] = &io_u->iocb;
        ld->io_us[ld->head] = io_u;
@@ -396,6 +460,8 @@ static void fio_libaio_cleanup(struct thread_data *td)
                 */
                if (!(td->flags & TD_F_CHILD))
                        io_destroy(ld->aio_ctx);
+
+               fio_cmdprio_cleanup(&ld->cmdprio);
                free(ld->aio_events);
                free(ld->iocbs);
                free(ld->io_us);
@@ -420,8 +486,8 @@ static int fio_libaio_post_init(struct thread_data *td)
 static int fio_libaio_init(struct thread_data *td)
 {
        struct libaio_data *ld;
-       struct thread_options *to = &td->o;
        struct libaio_options *o = td->eo;
+       int ret;
 
        ld = calloc(1, sizeof(*ld));
 
@@ -432,16 +498,13 @@ static int fio_libaio_init(struct thread_data *td)
        ld->io_us = calloc(ld->entries, sizeof(struct io_u *));
 
        td->io_ops_data = ld;
-       /*
-        * Check for option conflicts
-        */
-       if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) &&
-                       o->cmdprio_percentage != 0) {
-               log_err("%s: cmdprio_percentage option and mutually exclusive "
-                               "prio or prioclass option is set, exiting\n", to->name);
+
+       ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
+       if (ret) {
                td_verror(td, EINVAL, "fio_libaio_init");
                return 1;
        }
+
        return 0;
 }
 
index 7f2bc431b4352f7f3472fd00b63c268b823dbe64..2bc2c7e0e411042e7480066c39e59d1a3bb4ec26 100644 (file)
@@ -14,6 +14,7 @@
 #include "fio.h"
 #include "err.h"
 #include "zbd_types.h"
+#include "zbd.h"
 
 struct libzbc_data {
        struct zbc_device       *zdev;
@@ -63,7 +64,7 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f,
                return -EINVAL;
        }
 
-       if (td_write(td)) {
+       if (td_write(td) || td_trim(td)) {
                if (!read_only)
                        flags |= O_RDWR;
        } else if (td_read(td)) {
@@ -71,10 +72,6 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f,
                        flags |= O_RDWR;
                else
                        flags |= O_RDONLY;
-       } else if (td_trim(td)) {
-               td_verror(td, EINVAL, "libzbc does not support trim");
-               log_err("%s: libzbc does not support trim\n", f->file_name);
-               return -EINVAL;
        }
 
        if (td->o.oatomic) {
@@ -88,7 +85,7 @@ static int libzbc_open_dev(struct thread_data *td, struct fio_file *f,
                return -ENOMEM;
 
        ret = zbc_open(f->file_name,
-                      flags | ZBC_O_DRV_BLOCK | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
+                      flags | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
                       &ld->zdev);
        if (ret) {
                log_err("%s: zbc_open() failed, err=%d\n",
@@ -411,7 +408,11 @@ static enum fio_q_status libzbc_queue(struct thread_data *td, struct io_u *io_u)
                ret = zbc_flush(ld->zdev);
                if (ret)
                        log_err("zbc_flush error %zd\n", ret);
-       } else if (io_u->ddir != DDIR_TRIM) {
+       } else if (io_u->ddir == DDIR_TRIM) {
+               ret = zbd_do_io_u_trim(td, io_u);
+               if (!ret)
+                       ret = EINVAL;
+       } else {
                log_err("Unsupported operation %u\n", io_u->ddir);
                ret = -EINVAL;
        }
index 0c2d2c8b861ad22fdc3080e51a605194450ffcc5..1c0193840df3cacbf276be48542f5cd5af7b80ad 100644 (file)
@@ -471,10 +471,9 @@ static enum fio_q_status fio_sgio_rw_doio(struct thread_data *td,
                        if (__io_u == io_u)
                                break;
 
-                       if (io_u_sync_complete(td, __io_u)) {
-                               ret = -1;
+                       if (io_u_sync_complete(td, __io_u))
                                break;
-                       }
+
                } while (1);
 
                return FIO_Q_COMPLETED;
@@ -982,7 +981,7 @@ static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
 
        if (sd && !sd->type_checked && fio_sgio_type_check(td, f)) {
                ret = generic_close_file(td, f);
-               return 1;
+               return ret;
        }
 
        return 0;
diff --git a/eta.c b/eta.c
index db13cb18103226028ca324cf5acdfb2d03fe4507..ea1781f3b792147636210f38714abb089870a2da 100644 (file)
--- a/eta.c
+++ b/eta.c
@@ -509,7 +509,7 @@ bool calc_thread_status(struct jobs_eta *je, int force)
                memcpy(&rate_prev_time, &now, sizeof(now));
                regrow_agg_logs();
                for_each_rw_ddir(ddir) {
-                       add_agg_sample(sample_val(je->rate[ddir]), ddir, 0, 0);
+                       add_agg_sample(sample_val(je->rate[ddir]), ddir, 0);
                }
        }
 
diff --git a/examples/1mbs_clients.png b/examples/1mbs_clients.png
new file mode 100644 (file)
index 0000000..3f972dc
Binary files /dev/null and b/examples/1mbs_clients.png differ
diff --git a/examples/aio-read.png b/examples/aio-read.png
new file mode 100644 (file)
index 0000000..e0c020a
Binary files /dev/null and b/examples/aio-read.png differ
diff --git a/examples/backwards-read.png b/examples/backwards-read.png
new file mode 100644 (file)
index 0000000..81dc920
Binary files /dev/null and b/examples/backwards-read.png differ
diff --git a/examples/basic-verify.png b/examples/basic-verify.png
new file mode 100644 (file)
index 0000000..98f7302
Binary files /dev/null and b/examples/basic-verify.png differ
diff --git a/examples/butterfly.png b/examples/butterfly.png
new file mode 100644 (file)
index 0000000..2c56651
Binary files /dev/null and b/examples/butterfly.png differ
diff --git a/examples/cmdprio-bssplit.fio b/examples/cmdprio-bssplit.fio
new file mode 100644 (file)
index 0000000..47e9a79
--- /dev/null
@@ -0,0 +1,17 @@
+; Randomly read/write a block device file at queue depth 16.
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 100% of the 64kB reads are executed at the highest priority and
+; all other IOs are executed without a priority set.
+[global]
+filename=/dev/sda
+direct=1
+write_lat_log=prio-run.log
+log_prio=1
+
+[randrw]
+rw=randrw
+bssplit=64k/40:1024k/60,1024k/100
+ioengine=libaio
+iodepth=16
+cmdprio_bssplit=64k/100:1024k/0,1024k/0
+cmdprio_class=1
diff --git a/examples/cmdprio-bssplit.png b/examples/cmdprio-bssplit.png
new file mode 100644 (file)
index 0000000..a0bb3ff
Binary files /dev/null and b/examples/cmdprio-bssplit.png differ
diff --git a/examples/cmdprio-percentage.fio b/examples/cmdprio-percentage.fio
new file mode 100644 (file)
index 0000000..e4bc9db
--- /dev/null
@@ -0,0 +1,17 @@
+; Randomly read a block device file at queue depth 8
+; with 20% of the IOs using the high priority RT class
+; and the remaining IOs using the idle priority class
+[global]
+filename=/dev/sda
+direct=1
+write_lat_log=prio-run.log
+log_prio=1
+
+[randread]
+rw=randread
+bs=128k
+ioengine=libaio
+iodepth=8
+prioclass=3
+cmdprio_percentage=20
+cmdprio_class=1
diff --git a/examples/cmdprio-percentage.png b/examples/cmdprio-percentage.png
new file mode 100644 (file)
index 0000000..e794de0
Binary files /dev/null and b/examples/cmdprio-percentage.png differ
diff --git a/examples/cpp_null.png b/examples/cpp_null.png
new file mode 100644 (file)
index 0000000..5303ac2
Binary files /dev/null and b/examples/cpp_null.png differ
diff --git a/examples/cpuio.png b/examples/cpuio.png
new file mode 100644 (file)
index 0000000..02938db
Binary files /dev/null and b/examples/cpuio.png differ
diff --git a/examples/cross-stripe-verify.png b/examples/cross-stripe-verify.png
new file mode 100644 (file)
index 0000000..90aa630
Binary files /dev/null and b/examples/cross-stripe-verify.png differ
diff --git a/examples/dev-dax.png b/examples/dev-dax.png
new file mode 100644 (file)
index 0000000..2463bca
Binary files /dev/null and b/examples/dev-dax.png differ
diff --git a/examples/dfs.png b/examples/dfs.png
new file mode 100644 (file)
index 0000000..049ccae
Binary files /dev/null and b/examples/dfs.png differ
diff --git a/examples/disk-zone-profile.png b/examples/disk-zone-profile.png
new file mode 100644 (file)
index 0000000..5f7b24c
Binary files /dev/null and b/examples/disk-zone-profile.png differ
diff --git a/examples/e4defrag.png b/examples/e4defrag.png
new file mode 100644 (file)
index 0000000..00a7fef
Binary files /dev/null and b/examples/e4defrag.png differ
index 2d4e1a87705890065228bca7456c914d75a4117d..86554ef7a04b45cbeecaae168434ebd7a9fcaa59 100644 (file)
@@ -48,7 +48,7 @@ donorname=file.def
 
 ########
 # Run random e4defrag and various aio workers in parallel
-[e4defrag-fuzzer-4k]
+[e4defrag-fuzzer-4k-bis]
 stonewall
 continue_on_error=all
 inplace=1
diff --git a/examples/e4defrag2.png b/examples/e4defrag2.png
new file mode 100644 (file)
index 0000000..8a128e9
Binary files /dev/null and b/examples/e4defrag2.png differ
diff --git a/examples/enospc-pressure.png b/examples/enospc-pressure.png
new file mode 100644 (file)
index 0000000..da28b7c
Binary files /dev/null and b/examples/enospc-pressure.png differ
diff --git a/examples/exec.fio b/examples/exec.fio
new file mode 100644 (file)
index 0000000..ac1bedf
--- /dev/null
@@ -0,0 +1,36 @@
+[global]
+time_based
+runtime=30
+
+[monitoring_noop]
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_noop]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=noop
+
+[sleep]
+# Let the processor cool down for a few seconds
+stonewall
+ioengine=exec
+runtime=10
+program=/bin/sleep
+arguments=%r
+grace_time=0
+std_redirect=0
+
+[monitoring_qsort]
+stonewall
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_qsort]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=qsort
diff --git a/examples/exec.png b/examples/exec.png
new file mode 100644 (file)
index 0000000..5f9f3b5
Binary files /dev/null and b/examples/exec.png differ
diff --git a/examples/exitwhat.png b/examples/exitwhat.png
new file mode 100644 (file)
index 0000000..9fc1883
Binary files /dev/null and b/examples/exitwhat.png differ
diff --git a/examples/falloc.png b/examples/falloc.png
new file mode 100644 (file)
index 0000000..886be22
Binary files /dev/null and b/examples/falloc.png differ
diff --git a/examples/filecreate-ioengine.png b/examples/filecreate-ioengine.png
new file mode 100644 (file)
index 0000000..45d11da
Binary files /dev/null and b/examples/filecreate-ioengine.png differ
diff --git a/examples/filedelete-ioengine.png b/examples/filedelete-ioengine.png
new file mode 100644 (file)
index 0000000..3512ab7
Binary files /dev/null and b/examples/filedelete-ioengine.png differ
diff --git a/examples/filestat-ioengine.png b/examples/filestat-ioengine.png
new file mode 100644 (file)
index 0000000..bed59ab
Binary files /dev/null and b/examples/filestat-ioengine.png differ
diff --git a/examples/fio-rand-RW.png b/examples/fio-rand-RW.png
new file mode 100644 (file)
index 0000000..aa4b099
Binary files /dev/null and b/examples/fio-rand-RW.png differ
diff --git a/examples/fio-rand-read.png b/examples/fio-rand-read.png
new file mode 100644 (file)
index 0000000..d45664a
Binary files /dev/null and b/examples/fio-rand-read.png differ
diff --git a/examples/fio-rand-write.png b/examples/fio-rand-write.png
new file mode 100644 (file)
index 0000000..10e068b
Binary files /dev/null and b/examples/fio-rand-write.png differ
diff --git a/examples/fio-seq-RW.png b/examples/fio-seq-RW.png
new file mode 100644 (file)
index 0000000..a2be35e
Binary files /dev/null and b/examples/fio-seq-RW.png differ
diff --git a/examples/fio-seq-read.png b/examples/fio-seq-read.png
new file mode 100644 (file)
index 0000000..cf8f297
Binary files /dev/null and b/examples/fio-seq-read.png differ
diff --git a/examples/fio-seq-write.png b/examples/fio-seq-write.png
new file mode 100644 (file)
index 0000000..8db1209
Binary files /dev/null and b/examples/fio-seq-write.png differ
diff --git a/examples/fixed-rate-submission.png b/examples/fixed-rate-submission.png
new file mode 100644 (file)
index 0000000..86ca9b3
Binary files /dev/null and b/examples/fixed-rate-submission.png differ
diff --git a/examples/flow.png b/examples/flow.png
new file mode 100644 (file)
index 0000000..26a3d34
Binary files /dev/null and b/examples/flow.png differ
diff --git a/examples/fsx.png b/examples/fsx.png
new file mode 100644 (file)
index 0000000..b4e13c8
Binary files /dev/null and b/examples/fsx.png differ
diff --git a/examples/ftruncate.png b/examples/ftruncate.png
new file mode 100644 (file)
index 0000000..b98895f
Binary files /dev/null and b/examples/ftruncate.png differ
diff --git a/examples/gfapi.png b/examples/gfapi.png
new file mode 100644 (file)
index 0000000..acc6a6a
Binary files /dev/null and b/examples/gfapi.png differ
diff --git a/examples/gpudirect-rdmaio-client.png b/examples/gpudirect-rdmaio-client.png
new file mode 100644 (file)
index 0000000..eac7985
Binary files /dev/null and b/examples/gpudirect-rdmaio-client.png differ
diff --git a/examples/gpudirect-rdmaio-server.png b/examples/gpudirect-rdmaio-server.png
new file mode 100644 (file)
index 0000000..e043d7c
Binary files /dev/null and b/examples/gpudirect-rdmaio-server.png differ
diff --git a/examples/http-s3.png b/examples/http-s3.png
new file mode 100644 (file)
index 0000000..2021e85
Binary files /dev/null and b/examples/http-s3.png differ
diff --git a/examples/http-swift.png b/examples/http-swift.png
new file mode 100644 (file)
index 0000000..9928fb1
Binary files /dev/null and b/examples/http-swift.png differ
diff --git a/examples/http-webdav.png b/examples/http-webdav.png
new file mode 100644 (file)
index 0000000..c37c3de
Binary files /dev/null and b/examples/http-webdav.png differ
diff --git a/examples/ime.png b/examples/ime.png
new file mode 100644 (file)
index 0000000..f636f5e
Binary files /dev/null and b/examples/ime.png differ
diff --git a/examples/iometer-file-access-server.png b/examples/iometer-file-access-server.png
new file mode 100644 (file)
index 0000000..e312455
Binary files /dev/null and b/examples/iometer-file-access-server.png differ
diff --git a/examples/jesd219.png b/examples/jesd219.png
new file mode 100644 (file)
index 0000000..73b5a12
Binary files /dev/null and b/examples/jesd219.png differ
diff --git a/examples/latency-profile.png b/examples/latency-profile.png
new file mode 100644 (file)
index 0000000..50650df
Binary files /dev/null and b/examples/latency-profile.png differ
diff --git a/examples/libcufile-cufile.png b/examples/libcufile-cufile.png
new file mode 100644 (file)
index 0000000..f3758e5
Binary files /dev/null and b/examples/libcufile-cufile.png differ
diff --git a/examples/libcufile-posix.png b/examples/libcufile-posix.png
new file mode 100644 (file)
index 0000000..7818feb
Binary files /dev/null and b/examples/libcufile-posix.png differ
diff --git a/examples/libhdfs.png b/examples/libhdfs.png
new file mode 100644 (file)
index 0000000..e774c91
Binary files /dev/null and b/examples/libhdfs.png differ
diff --git a/examples/libiscsi.png b/examples/libiscsi.png
new file mode 100644 (file)
index 0000000..d0006cc
Binary files /dev/null and b/examples/libiscsi.png differ
diff --git a/examples/libpmem.png b/examples/libpmem.png
new file mode 100644 (file)
index 0000000..8a9a143
Binary files /dev/null and b/examples/libpmem.png differ
diff --git a/examples/librpma_apm-client.png b/examples/librpma_apm-client.png
new file mode 100644 (file)
index 0000000..2fe02cd
Binary files /dev/null and b/examples/librpma_apm-client.png differ
diff --git a/examples/librpma_apm-server.png b/examples/librpma_apm-server.png
new file mode 100644 (file)
index 0000000..f78ae02
Binary files /dev/null and b/examples/librpma_apm-server.png differ
diff --git a/examples/librpma_gpspm-client.png b/examples/librpma_gpspm-client.png
new file mode 100644 (file)
index 0000000..0c975a2
Binary files /dev/null and b/examples/librpma_gpspm-client.png differ
diff --git a/examples/librpma_gpspm-server.png b/examples/librpma_gpspm-server.png
new file mode 100644 (file)
index 0000000..5612453
Binary files /dev/null and b/examples/librpma_gpspm-server.png differ
diff --git a/examples/libzbc-rand-write.png b/examples/libzbc-rand-write.png
new file mode 100644 (file)
index 0000000..1d27741
Binary files /dev/null and b/examples/libzbc-rand-write.png differ
diff --git a/examples/libzbc-seq-read.png b/examples/libzbc-seq-read.png
new file mode 100644 (file)
index 0000000..5a53222
Binary files /dev/null and b/examples/libzbc-seq-read.png differ
index e5dcea4c04b8e2677ede8a2b9ca822490db2f1ef..0a7f2bae82f48622702644713ace468904c50085 100644 (file)
@@ -6,7 +6,7 @@ ignore_error=,EIO
 blocksize=512,512,16384
 skip_bad=1
 
-[write]
+[trim]
 stonewall
 rw=trim
 
@@ -14,7 +14,7 @@ rw=trim
 stonewall
 rw=write
 
-[write]
+[trimwrite]
 stonewall
 block_error_percentiles=1
 rw=trimwrite
diff --git a/examples/mtd.png b/examples/mtd.png
new file mode 100644 (file)
index 0000000..8cb3692
Binary files /dev/null and b/examples/mtd.png differ
diff --git a/examples/nbd.png b/examples/nbd.png
new file mode 100644 (file)
index 0000000..e3bcf61
Binary files /dev/null and b/examples/nbd.png differ
diff --git a/examples/netio.png b/examples/netio.png
new file mode 100644 (file)
index 0000000..81afd41
Binary files /dev/null and b/examples/netio.png differ
diff --git a/examples/netio_multicast.png b/examples/netio_multicast.png
new file mode 100644 (file)
index 0000000..f07ab4b
Binary files /dev/null and b/examples/netio_multicast.png differ
diff --git a/examples/nfs.png b/examples/nfs.png
new file mode 100644 (file)
index 0000000..29dbca0
Binary files /dev/null and b/examples/nfs.png differ
diff --git a/examples/null.png b/examples/null.png
new file mode 100644 (file)
index 0000000..052671d
Binary files /dev/null and b/examples/null.png differ
diff --git a/examples/numa.png b/examples/numa.png
new file mode 100644 (file)
index 0000000..1ef4575
Binary files /dev/null and b/examples/numa.png differ
index f813174100aafef4acc97b84c74d078ca107a931..59bb2a8a5acbf0e03d16a988f6ae0b9eb84575d2 100644 (file)
@@ -55,7 +55,7 @@ unlink=0
 # size, this is not required.
 #
 filename=/pmem0/fio-test,4096,1024
-filename=/pmem1/fio-test,4096,1024
+#filename=/pmem1/fio-test,4096,1024
 
 [pmemblk-write]
 rw=randwrite
diff --git a/examples/pmemblk.png b/examples/pmemblk.png
new file mode 100644 (file)
index 0000000..250e254
Binary files /dev/null and b/examples/pmemblk.png differ
diff --git a/examples/poisson-rate-submission.png b/examples/poisson-rate-submission.png
new file mode 100644 (file)
index 0000000..739c256
Binary files /dev/null and b/examples/poisson-rate-submission.png differ
diff --git a/examples/rados.png b/examples/rados.png
new file mode 100644 (file)
index 0000000..91bd61a
Binary files /dev/null and b/examples/rados.png differ
diff --git a/examples/rand-zones.png b/examples/rand-zones.png
new file mode 100644 (file)
index 0000000..13cbfb4
Binary files /dev/null and b/examples/rand-zones.png differ
diff --git a/examples/rbd.png b/examples/rbd.png
new file mode 100644 (file)
index 0000000..f118613
Binary files /dev/null and b/examples/rbd.png differ
diff --git a/examples/rdmaio-client.png b/examples/rdmaio-client.png
new file mode 100644 (file)
index 0000000..4e4bc28
Binary files /dev/null and b/examples/rdmaio-client.png differ
diff --git a/examples/rdmaio-server.png b/examples/rdmaio-server.png
new file mode 100644 (file)
index 0000000..fc34472
Binary files /dev/null and b/examples/rdmaio-server.png differ
diff --git a/examples/ssd-steadystate.png b/examples/ssd-steadystate.png
new file mode 100644 (file)
index 0000000..eb27f8a
Binary files /dev/null and b/examples/ssd-steadystate.png differ
diff --git a/examples/ssd-test.png b/examples/ssd-test.png
new file mode 100644 (file)
index 0000000..a92ed15
Binary files /dev/null and b/examples/ssd-test.png differ
diff --git a/examples/steadystate.png b/examples/steadystate.png
new file mode 100644 (file)
index 0000000..4bb9048
Binary files /dev/null and b/examples/steadystate.png differ
diff --git a/examples/surface-scan.png b/examples/surface-scan.png
new file mode 100644 (file)
index 0000000..0057380
Binary files /dev/null and b/examples/surface-scan.png differ
diff --git a/examples/test.png b/examples/test.png
new file mode 100644 (file)
index 0000000..6be5002
Binary files /dev/null and b/examples/test.png differ
diff --git a/examples/tiobench-example.png b/examples/tiobench-example.png
new file mode 100644 (file)
index 0000000..1441032
Binary files /dev/null and b/examples/tiobench-example.png differ
diff --git a/examples/waitfor.png b/examples/waitfor.png
new file mode 100644 (file)
index 0000000..64e4bf9
Binary files /dev/null and b/examples/waitfor.png differ
diff --git a/examples/zbd-rand-write.png b/examples/zbd-rand-write.png
new file mode 100644 (file)
index 0000000..d58721b
Binary files /dev/null and b/examples/zbd-rand-write.png differ
diff --git a/examples/zbd-seq-read.png b/examples/zbd-seq-read.png
new file mode 100644 (file)
index 0000000..b81a08c
Binary files /dev/null and b/examples/zbd-seq-read.png differ
diff --git a/examples/zipf.png b/examples/zipf.png
new file mode 100644 (file)
index 0000000..cb2a981
Binary files /dev/null and b/examples/zipf.png differ
index 296de5a11a3d015f65ffd28311d0278eca4341f7..fb556d8444e4a1a29b027a6e53ffecab3708898d 100644 (file)
@@ -1024,7 +1024,6 @@ int longest_existing_path(char *path) {
        while (!done) {
                buf_pos = strrchr(buf, FIO_OS_PATH_SEPARATOR);
                if (!buf_pos) {
-                       done = true;
                        offset = 0;
                        break;
                }
@@ -1120,9 +1119,6 @@ int setup_files(struct thread_data *td)
        if (err)
                goto err_out;
 
-       if (o->read_iolog_file)
-               goto done;
-
        if (td->o.zone_mode == ZONE_MODE_ZBD) {
                err = zbd_init_files(td);
                if (err)
@@ -1130,6 +1126,9 @@ int setup_files(struct thread_data *td)
        }
        zbd_recalc_options_with_zone_granularity(td);
 
+       if (o->read_iolog_file)
+               goto done;
+
        /*
         * check sizes. if the files/devices do not exist and the size
         * isn't passed to fio, abort.
diff --git a/fio.1 b/fio.1
index 5aa54a4d0471772276737edae926bba7b7f7e63b..a3ebb67d36df6884ea1ce95ca5cfb48946bec0a6 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -766,6 +766,8 @@ starts. The \fBzonecapacity\fR parameter is ignored.
 Zoned block device mode. I/O happens sequentially in each zone, even if random
 I/O has been selected. Random I/O happens across all zones instead of being
 restricted to a single zone.
+Trim is handled using a zone reset operation. Trim only considers non-empty
+sequential write required and sequential write preferred zones.
 .RE
 .RE
 .TP
@@ -1509,6 +1511,48 @@ all \-\- this option only controls the distribution of unique buffers. Setting
 this option will also enable \fBrefill_buffers\fR to prevent every buffer
 being identical.
 .TP
+.BI dedupe_mode \fR=\fPstr
+If \fBdedupe_percentage\fR is given, then this option controls how fio
+generates the dedupe buffers.
+.RS
+.RS
+.TP
+.B repeat
+.P
+.RS
+Generate dedupe buffers by repeating previous writes
+.RE
+.TP
+.B working_set
+.P
+.RS
+Generate dedupe buffers from working set
+.RE
+.RE
+.P
+\fBrepeat\fR is the default option for fio. Dedupe buffers are generated
+by repeating the previous unique write.
+
+\fBworking_set\fR is a more realistic workload.
+With \fBworking_set\fR, \fBdedupe_working_set_percentage\fR should be provided.
+Given that, fio will use the initial unique write buffers as its working set.
+Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+Note that by using \fBworking_set\fR the dedupe percentage will converge
+to the desired value over time, while \fBrepeat\fR maintains the desired percentage
+throughout the job.
+.RE
+.RE
+.TP
+.BI dedupe_working_set_percentage \fR=\fPint
+If \fBdedupe_mode\fR is set to \fBworking_set\fR, then this controls
+the percentage of the file or device size that is used as the working set
+from which fio chooses buffers when generating the dedupe buffers.
+.P
+.RS
+Note that \fBsize\fR needs to be explicitly provided and only 1 file
+per job is supported.
+.RE
+.TP
 .BI invalidate \fR=\fPbool
 Invalidate the buffer/page cache parts of the files to be used prior to
 starting I/O if the platform and file type support it. Defaults to true.
@@ -1658,9 +1702,7 @@ Sets size to something really large and waits for ENOSPC (no space left on
 device) or EDQUOT (disk quota exceeded)
 as the terminating condition. Only makes sense with sequential
 write. For a read workload, the mount point will be filled first then I/O
-started on the result. This option doesn't make sense if operating on a raw
-device node, since the size of that is already known by the file system.
-Additionally, writing beyond end-of-device will not return ENOSPC there.
+started on the result.
 .SS "I/O engine"
 .TP
 .BI ioengine \fR=\fPstr
@@ -1721,10 +1763,9 @@ character devices. This engine supports trim operations. The
 sg engine includes engine specific options.
 .TP
 .B libzbc
-Synchronous I/O engine for SMR hard-disks using the \fBlibzbc\fR
-library. The target can be either an sg character device or
-a block device file. This engine supports the zonemode=zbd zone
-operations.
+Read, write, trim and ZBC/ZAC operations to a zoned block device using
+\fBlibzbc\fR library. The target can be either an SG character device or
+a block device file.
 .TP
 .B null
 Doesn't transfer any data, just pretends to. This is mainly used to
@@ -1912,19 +1953,52 @@ I/O engine supporting asynchronous read and write operations to
 NFS filesystems from userspace via libnfs. This is useful for
 achieving higher concurrency and thus throughput than is possible
 via kernel NFS.
+.TP
+.B exec
+Execute 3rd party tools. Could be used to perform monitoring during a job's runtime.
 .SS "I/O engine specific parameters"
 In addition, there are some parameters which are only valid when a specific
 \fBioengine\fR is in use. These are used identically to normal parameters,
 with the caveat that when used on the command line, they must come after the
 \fBioengine\fR that defines them is selected.
 .TP
-.BI (io_uring, libaio)cmdprio_percentage \fR=\fPint
-Set the percentage of I/O that will be issued with higher priority by setting
-the priority bit. Non-read I/O is likely unaffected by ``cmdprio_percentage``.
-This option cannot be used with the `prio` or `prioclass` options. For this
-option to set the priority bit properly, NCQ priority must be supported and
-enabled and `direct=1' option must be used. fio must also be run as the root
-user.
+.BI (io_uring,libaio)cmdprio_percentage \fR=\fPint[,int]
+Set the percentage of I/O that will be issued with the highest priority.
+Default: 0. A single value applies to reads and writes. Comma-separated
+values may be specified for reads and writes. For this option to be effective,
+NCQ priority must be supported and enabled, and `direct=1' option must be
+used. fio must also be run as the root user. Unlike slat/clat/lat stats, which
+can be tracked and reported independently, per priority stats only track and
+report a single type of latency. By default, completion latency (clat) will be
+reported; if \fBlat_percentiles\fR is set, total latency (lat) will be reported.
+.TP
+.BI (io_uring,libaio)cmdprio_class \fR=\fPint[,int]
+Set the I/O priority class to use for I/Os that must be issued with a
+priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
+If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR
+is set, this defaults to the highest priority class. A single value applies
+to reads and writes. Comma-separated values may be specified for reads and
+writes. See man \fBionice\fR\|(1). See also the \fBprioclass\fR option.
+.TP
+.BI (io_uring,libaio)cmdprio \fR=\fPint[,int]
+Set the I/O priority value to use for I/Os that must be issued with a
+priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
+If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR
+is set, this defaults to 0. Linux limits us to a positive value between
+0 and 7, with 0 being the highest. A single value applies to reads and writes.
+Comma-separated values may be specified for reads and writes. See man
+\fBionice\fR\|(1). Refer to an appropriate manpage for other operating systems
+since the meaning of priority may differ. See also the \fBprio\fR option.
+.TP
+.BI (io_uring,libaio)cmdprio_bssplit \fR=\fPstr[,str]
+To get a finer control over I/O priority, this option allows specifying
+the percentage of IOs that must have a priority set depending on the block
+size of the IO. This option is useful only when used together with the option
+\fBbssplit\fR, that is, multiple different block sizes are used for reads and
+writes. The format for this option is the same as the format of the
+\fBbssplit\fR option, with the exception that values for trim IOs are
+ignored. This option is mutually exclusive with the \fBcmdprio_percentage\fR
+option.
 .TP
 .BI (io_uring)fixedbufs
 If fio is asked to do direct IO, then Linux will map pages for each IO call, and
@@ -1999,20 +2073,20 @@ Detect when I/O threads are done, then exit.
 .BI (libhdfs)namenode \fR=\fPstr
 The hostname or IP address of a HDFS cluster namenode to contact.
 .TP
-.BI (libhdfs)port
+.BI (libhdfs)port \fR=\fPint
 The listening port of the HDFS cluster namenode.
 .TP
-.BI (netsplice,net)port
+.BI (netsplice,net)port \fR=\fPint
 The TCP or UDP port to bind to or connect to. If this is used with
 \fBnumjobs\fR to spawn multiple instances of the same job type, then
 this will be the starting port number since fio will use a range of
 ports.
 .TP
-.BI (rdma, librpma_*)port
+.BI (rdma,librpma_*)port \fR=\fPint
 The port to use for RDMA-CM communication. This should be the same
 value on the client and the server side.
 .TP
-.BI (netsplice,net, rdma)hostname \fR=\fPstr
+.BI (netsplice,net,rdma)hostname \fR=\fPstr
 The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O.
 If the job is a TCP listener or UDP reader, the hostname is not used
 and must be omitted unless it is a valid UDP multicast address.
@@ -2282,10 +2356,10 @@ the use of cudaMemcpy.
 .RE
 .TP
 .BI (dfs)pool
-Specify the UUID of the DAOS pool to connect to.
+Specify the label or UUID of the DAOS pool to connect to.
 .TP
 .BI (dfs)cont
-Specify the UUID of the DAOS DAOS container to open.
+Specify the label or UUID of the DAOS container to open.
 .TP
 .BI (dfs)chunk_size
 Specify a different chunk size (in bytes) for the dfs file.
@@ -2298,6 +2372,31 @@ Use DAOS container's object class by default.
 .BI (nfs)nfs_url
 URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
 Refer to the libnfs README for more details.
+.TP
+.BI (exec)program\fR=\fPstr
+Specify the program to execute.
+Note the program will receive a SIGTERM when the job is reaching the time limit.
+A SIGKILL is sent once the job is over. The delay between the two signals is defined by \fBgrace_time\fR option.
+.TP
+.BI (exec)arguments\fR=\fPstr
+Specify arguments to pass to program.
+Some special variables can be expanded to pass fio's job details to the program:
+.RS
+.RS
+.TP
+.B %r
+replaced by the duration of the job in seconds
+.TP
+.BI %n
+replaced by the name of the job
+.RE
+.RE
+.TP
+.BI (exec)grace_time\fR=\fPint
+Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+.TP
+.BI (exec)std_redirect\fR=\fPbool
+If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
 .SS "I/O depth"
 .TP
 .BI iodepth \fR=\fPint
@@ -2402,7 +2501,7 @@ problem). Note that this option cannot reliably be used with async IO engines.
 Stall the job for the specified period of time after an I/O has completed before issuing the
 next. May be used to simulate processing being done by an application.
 When the unit is omitted, the value is interpreted in microseconds. See
-\fBthinktime_blocks\fR and \fBthinktime_spin\fR.
+\fBthinktime_blocks\fR, \fBthinktime_iotime\fR and \fBthinktime_spin\fR.
 .TP
 .BI thinktime_spin \fR=\fPtime
 Only valid if \fBthinktime\fR is set - pretend to spend CPU time doing
@@ -2423,6 +2522,17 @@ Only valid if \fBthinktime\fR is set - control how \fBthinktime_blocks\fR trigge
 The default is `complete', which triggers \fBthinktime\fR when fio completes
 \fBthinktime_blocks\fR blocks. If this is set to `issue', then the trigger happens
 at the issue side.
+.TP
+.BI thinktime_iotime \fR=\fPtime
+Only valid if \fBthinktime\fR is set - control \fBthinktime\fR interval by time.
+The \fBthinktime\fR stall is repeated after IOs are executed for
+\fBthinktime_iotime\fR. For example, `\-\-thinktime_iotime=9s \-\-thinktime=1s'
+repeats a 10-second cycle with IOs for 9 seconds and a stall for 1 second. When the
+unit is omitted, \fBthinktime_iotime\fR is interpreted as a number of seconds.
+If this option is used together with \fBthinktime_blocks\fR, the \fBthinktime\fR
+stall is repeated after \fBthinktime_iotime\fR or after \fBthinktime_blocks\fR
+IOs, whichever happens first.
+
 .TP
 .BI rate \fR=\fPint[,int][,int]
 Cap the bandwidth used by this job. The number is in bytes/sec, the normal
@@ -2624,13 +2734,13 @@ Set the I/O priority value of this job. Linux limits us to a positive value
 between 0 and 7, with 0 being the highest. See man
 \fBionice\fR\|(1). Refer to an appropriate manpage for other operating
 systems since meaning of priority may differ. For per-command priority
-setting, see I/O engine specific `cmdprio_percentage` and `hipri_percentage`
-options.
+setting, see the I/O engine specific `cmdprio_percentage` and
+`cmdprio` options.
 .TP
 .BI prioclass \fR=\fPint
 Set the I/O priority class. See man \fBionice\fR\|(1). For per-command
-priority setting, see I/O engine specific `cmdprio_percentage` and `hipri_percent`
-options.
+priority setting, see the I/O engine specific `cmdprio_percentage` and
+`cmdprio_class` options.
 .TP
 .BI cpus_allowed \fR=\fPstr
 Controls the same options as \fBcpumask\fR, but accepts a textual
@@ -3136,6 +3246,17 @@ logging (see \fBlog_avg_msec\fR) has been enabled. See
 \fBwrite_bw_log\fR for details about the filename format and \fBLOG
 FILE FORMATS\fR for how data is structured within the file.
 .TP
+.BI log_entries \fR=\fPint
+By default, fio will log an entry in the iops, latency, or bw log for
+every I/O that completes. The initial number of I/O log entries is 1024.
+When the log entries are all used, new log entries are dynamically
+allocated.  This dynamic log entry allocation may negatively impact
+time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+completion latency). This option allows specifying a larger initial
+number of log entries to avoid run-time allocation of new log entries,
+resulting in more precise time-related I/O statistics.
+See also \fBlog_avg_msec\fR. Defaults to 1024.
+.TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
 I/O that completes. When writing to the disk log, that can quickly grow to a
@@ -3169,6 +3290,11 @@ If this is set, the iolog options will include the byte offset for the I/O
 entry as well as the other data values. Defaults to 0 meaning that
 offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
 .TP
+.BI log_prio \fR=\fPbool
+If this is set, the iolog options will include the I/O priority for the I/O
+entry as well as the other data values. Defaults to 0 meaning that
+I/O priorities are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
+.TP
 .BI log_compression \fR=\fPint
 If this is set, fio will compress the I/O logs as it goes, to keep the
 memory footprint lower. When a log reaches the specified size, that chunk is
@@ -4102,8 +4228,14 @@ The entry's `block size' is always in bytes. The `offset' is the position in byt
 from the start of the file for that particular I/O. The logging of the offset can be
 toggled with \fBlog_offset\fR.
 .P
-`Command priority` is 0 for normal priority and 1 for high priority. This is controlled
-by the ioengine specific \fBcmdprio_percentage\fR.
+If \fBlog_prio\fR is not set, the entry's `Command priority` is 1 for an IO executed
+with the highest RT priority class (\fBprioclass\fR=1 or \fBcmdprio_class\fR=1) and 0
+otherwise. This is controlled by the \fBprioclass\fR option and the ioengine specific
+\fBcmdprio_percentage\fR and \fBcmdprio_class\fR options. If \fBlog_prio\fR is set, the
+entry's `Command priority` is the priority set for the IO, as a 16-bit hexadecimal
+number with the lowest 13 bits indicating the priority value (\fBprio\fR and
+\fBcmdprio\fR options) and the highest 3 bits indicating the IO priority class
+(\fBprioclass\fR and \fBcmdprio_class\fR options).
 .P
 Fio defaults to logging every individual I/O but when windowed logging is set
 through \fBlog_avg_msec\fR, either the average (by default) or the maximum
diff --git a/fio.h b/fio.h
index b05cb3dfc395346fe02775e65d7a4fba5288ed43..6bb21ebb2ace311c3b4643bb6adbfbbcf045b444 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -47,6 +47,7 @@
 #include "workqueue.h"
 #include "steadystate.h"
 #include "lib/nowarn_snprintf.h"
+#include "dedupe.h"
 
 #ifdef CONFIG_SOLARISAIO
 #include <sys/asynch.h>
@@ -140,6 +141,7 @@ enum {
        FIO_RAND_POISSON2_OFF,
        FIO_RAND_POISSON3_OFF,
        FIO_RAND_PRIO_CMDS,
+       FIO_RAND_DEDUPE_WORKING_SET_IX,
        FIO_RAND_NR_OFFS,
 };
 
@@ -259,9 +261,14 @@ struct thread_data {
 
        struct frand_state buf_state;
        struct frand_state buf_state_prev;
+       struct frand_state buf_state_ret;
        struct frand_state dedupe_state;
        struct frand_state zone_state;
        struct frand_state prio_state;
+       struct frand_state dedupe_working_set_index_state;
+       struct frand_state *dedupe_working_set_states;
+
+       unsigned long long num_unique_pages;
 
        struct zone_split_index **zone_state_index;
        unsigned int num_open_zones;
@@ -273,6 +280,11 @@ struct thread_data {
 
        int shm_id;
 
+       /*
+        * Job default IO priority set with prioclass and prio options.
+        */
+       unsigned int ioprio;
+
        /*
         * IO engine hooks, contains everything needed to submit an io_u
         * to any of the available IO engines.
@@ -358,6 +370,8 @@ struct thread_data {
        uint64_t bytes_done[DDIR_RWDIR_CNT];
 
        uint64_t *thinktime_blocks_counter;
+       struct timespec last_thinktime;
+       uint64_t last_thinktime_blocks;
 
        /*
         * State for random io, a bitmap of blocks done vs not done
@@ -413,6 +427,7 @@ struct thread_data {
         */
        struct flist_head io_log_list;
        FILE *io_log_rfile;
+       unsigned int io_log_blktrace;
        unsigned int io_log_current;
        unsigned int io_log_checkmark;
        unsigned int io_log_highmark;
index d8e7ebfe573ef5531fb2c60d00e87e4cb2127308..b9b83db3057435f6fd2918d84cc3e96bdb7c4252 100644 (file)
@@ -9,6 +9,10 @@
 #define DRD_IGNORE_VAR(x) do { } while (0)
 #endif
 
+#ifdef WIN32
+#include "os/os-windows.h"
+#endif
+
 #include "fio.h"
 #include "smalloc.h"
 #include "helper_thread.h"
@@ -283,19 +287,12 @@ static void *helper_thread_main(void *data)
                }
        };
        struct timespec ts;
-       int clk_tck, ret = 0;
+       long clk_tck;
+       int ret = 0;
 
-#ifdef _SC_CLK_TCK
-       clk_tck = sysconf(_SC_CLK_TCK);
-#else
-       /*
-        * The timer frequence is variable on Windows. Instead of trying to
-        * query it, use 64 Hz, the clock frequency lower bound. See also
-        * https://carpediemsystems.co.uk/2019/07/18/windows-system-timer-granularity/.
-        */
-       clk_tck = 64;
-#endif
-       dprint(FD_HELPERTHREAD, "clk_tck = %d\n", clk_tck);
+       os_clk_tck(&clk_tck);
+
+       dprint(FD_HELPERTHREAD, "clk_tck = %ld\n", clk_tck);
        assert(clk_tck > 0);
        sleep_accuracy_ms = (1000 + clk_tck - 1) / clk_tck;
 
diff --git a/init.c b/init.c
index 60c7cff405d70d8e974545026e2fe659b512b7ed..5f069d9a5b4af0fb4e4cb2fe67861a25d020fbff 100644 (file)
--- a/init.c
+++ b/init.c
@@ -958,6 +958,28 @@ static int fixup_options(struct thread_data *td)
 
        o->latency_target *= 1000ULL;
 
+       /*
+        * Dedupe working set verifications
+        */
+       if (o->dedupe_percentage && o->dedupe_mode == DEDUPE_MODE_WORKING_SET) {
+               if (!fio_option_is_set(o, size)) {
+                       log_err("fio: pregenerated dedupe working set "
+                                       "requires size to be set\n");
+                       ret |= 1;
+               } else if (o->nr_files != 1) {
+                       log_err("fio: dedupe working set mode supported with "
+                                       "single file per job, but %d files "
+                                       "provided\n", o->nr_files);
+                       ret |= 1;
+               } else if (o->dedupe_working_set_percentage + o->dedupe_percentage > 100) {
+                       log_err("fio: impossible to reach expected dedupe percentage %u "
+                                       "since %u percentage of size is reserved to dedupe working set "
+                                       "(those are unique pages)\n",
+                                       o->dedupe_percentage, o->dedupe_working_set_percentage);
+                       ret |= 1;
+               }
+       }
+
        return ret;
 }
 
@@ -1031,6 +1053,7 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
        init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
        init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
        init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false);
+       init_rand_seed(&td->dedupe_working_set_index_state, td->rand_seeds[FIO_RAND_DEDUPE_WORKING_SET_IX], use64);
 
        if (!td_random(td))
                return;
@@ -1491,6 +1514,9 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
        if (fixup_options(td))
                goto err;
 
+       if (init_dedupe_working_set_seeds(td))
+               goto err;
+
        /*
         * Belongs to fixup_options, but o->name is not necessarily set as yet
         */
@@ -1522,16 +1548,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
        memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list));
        td->ts.sig_figs = o->sig_figs;
 
-       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               td->ts.clat_stat[i].min_val = ULONG_MAX;
-               td->ts.slat_stat[i].min_val = ULONG_MAX;
-               td->ts.lat_stat[i].min_val = ULONG_MAX;
-               td->ts.bw_stat[i].min_val = ULONG_MAX;
-               td->ts.iops_stat[i].min_val = ULONG_MAX;
-               td->ts.clat_high_prio_stat[i].min_val = ULONG_MAX;
-               td->ts.clat_low_prio_stat[i].min_val = ULONG_MAX;
-       }
-       td->ts.sync_stat.min_val = ULONG_MAX;
+       init_thread_stat_min_vals(&td->ts);
        td->ddir_seq_nr = o->ddir_seq_nr;
 
        if ((o->stonewall || o->new_group) && prev_group_jobs) {
@@ -1557,6 +1574,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                        .hist_coarseness = o->log_hist_coarseness,
                        .log_type = IO_LOG_TYPE_LAT,
                        .log_offset = o->log_offset,
+                       .log_prio = o->log_prio,
                        .log_gz = o->log_gz,
                        .log_gz_store = o->log_gz_store,
                };
@@ -1590,6 +1608,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                        .hist_coarseness = o->log_hist_coarseness,
                        .log_type = IO_LOG_TYPE_HIST,
                        .log_offset = o->log_offset,
+                       .log_prio = o->log_prio,
                        .log_gz = o->log_gz,
                        .log_gz_store = o->log_gz_store,
                };
@@ -1621,6 +1640,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                        .hist_coarseness = o->log_hist_coarseness,
                        .log_type = IO_LOG_TYPE_BW,
                        .log_offset = o->log_offset,
+                       .log_prio = o->log_prio,
                        .log_gz = o->log_gz,
                        .log_gz_store = o->log_gz_store,
                };
@@ -1652,6 +1672,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                        .hist_coarseness = o->log_hist_coarseness,
                        .log_type = IO_LOG_TYPE_IOPS,
                        .log_offset = o->log_offset,
+                       .log_prio = o->log_prio,
                        .log_gz = o->log_gz,
                        .log_gz_store = o->log_gz_store,
                };
index a42da97a335cd046b82c44c396695be471341b8e..296a9d04ac53cc1c44585ee736f8ea424375615d 100644 (file)
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -24,7 +24,7 @@ static inline const char *io_ddir_name(enum fio_ddir ddir)
                                        "datasync", "sync_file_range",
                                        "wait", };
 
-       if (ddir < DDIR_LAST)
+       if (ddir >= 0 && ddir < DDIR_LAST)
                return name[ddir];
 
        return "invalid";
diff --git a/io_u.c b/io_u.c
index b421a579bd0a1aaa594692a21731a2774de77cea..3c72d63d0d5368db1ecae9158371f99efb9a27e0 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1595,7 +1595,7 @@ again:
                assert(io_u->flags & IO_U_F_FREE);
                io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
                                 IO_U_F_TRIMMED | IO_U_F_BARRIER |
-                                IO_U_F_VER_LIST | IO_U_F_PRIORITY);
+                                IO_U_F_VER_LIST | IO_U_F_HIGH_PRIO);
 
                io_u->error = 0;
                io_u->acct_ddir = -1;
@@ -1799,6 +1799,10 @@ struct io_u *get_io_u(struct thread_data *td)
        io_u->xfer_buf = io_u->buf;
        io_u->xfer_buflen = io_u->buflen;
 
+       /*
+        * Remember the issuing context priority. The IO engine may change this.
+        */
+       io_u->ioprio = td->ioprio;
 out:
        assert(io_u->file);
        if (!td_io_prep(td, io_u)) {
@@ -1884,7 +1888,8 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
                unsigned long long tnsec;
 
                tnsec = ntime_since(&io_u->start_time, &icd->time);
-               add_lat_sample(td, idx, tnsec, bytes, io_u->offset, io_u_is_prio(io_u));
+               add_lat_sample(td, idx, tnsec, bytes, io_u->offset,
+                              io_u->ioprio, io_u_is_high_prio(io_u));
 
                if (td->flags & TD_F_PROFILE_OPS) {
                        struct prof_io_ops *ops = &td->prof_io_ops;
@@ -1905,7 +1910,8 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 
        if (ddir_rw(idx)) {
                if (!td->o.disable_clat) {
-                       add_clat_sample(td, idx, llnsec, bytes, io_u->offset, io_u_is_prio(io_u));
+                       add_clat_sample(td, idx, llnsec, bytes, io_u->offset,
+                                       io_u->ioprio, io_u_is_high_prio(io_u));
                        io_u_mark_latency(td, llnsec);
                }
 
@@ -1998,7 +2004,7 @@ static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
                 * Make sure we notice short IO from here, and requeue them
                 * appropriately!
                 */
-               if (io_u->resid) {
+               if (bytes && io_u->resid) {
                        io_u->xfer_buflen = io_u->resid;
                        io_u->xfer_buf += bytes;
                        io_u->offset += bytes;
@@ -2162,7 +2168,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
                        td = td->parent;
 
                add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen,
-                               io_u->offset, io_u_is_prio(io_u));
+                               io_u->offset, io_u->ioprio);
        }
 }
 
@@ -2172,6 +2178,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
 static struct frand_state *get_buf_state(struct thread_data *td)
 {
        unsigned int v;
+       unsigned long long i;
 
        if (!td->o.dedupe_percentage)
                return &td->buf_state;
@@ -2183,7 +2190,24 @@ static struct frand_state *get_buf_state(struct thread_data *td)
        v = rand_between(&td->dedupe_state, 1, 100);
 
        if (v <= td->o.dedupe_percentage)
-               return &td->buf_state_prev;
+               switch (td->o.dedupe_mode) {
+               case DEDUPE_MODE_REPEAT:
+                       /*
+                       * The caller advances the returned frand_state.
+                       * A copy of prev should be returned instead since
+                       * a subsequent intention to generate a deduped buffer
+                       * might result in generating a unique one
+                       */
+                       frand_copy(&td->buf_state_ret, &td->buf_state_prev);
+                       return &td->buf_state_ret;
+               case DEDUPE_MODE_WORKING_SET:
+                       i = rand_between(&td->dedupe_working_set_index_state, 0, td->num_unique_pages - 1);
+                       frand_copy(&td->buf_state_ret, &td->dedupe_working_set_states[i]);
+                       return &td->buf_state_ret;
+               default:
+                       log_err("unexpected dedupe mode %u\n", td->o.dedupe_mode);
+                       assert(0);
+               }
 
        return &td->buf_state;
 }
@@ -2206,27 +2230,30 @@ void fill_io_buffer(struct thread_data *td, void *buf, unsigned long long min_wr
 
        if (o->compress_percentage || o->dedupe_percentage) {
                unsigned int perc = td->o.compress_percentage;
-               struct frand_state *rs;
+               struct frand_state *rs = NULL;
                unsigned long long left = max_bs;
                unsigned long long this_write;
 
                do {
-                       rs = get_buf_state(td);
+                       /*
+                        * Buffers are either entirely dedupe-able or not.
+                        * If we choose to dedup, the buffer should undergo
+                        * the same manipulation as the original write. Which
+                        * means we should retrace the steps we took for compression
+                        * as well.
+                        */
+                       if (!rs)
+                               rs = get_buf_state(td);
 
                        min_write = min(min_write, left);
 
-                       if (perc) {
-                               this_write = min_not_zero(min_write,
-                                                       (unsigned long long) td->o.compress_chunk);
+                       this_write = min_not_zero(min_write,
+                                               (unsigned long long) td->o.compress_chunk);
 
-                               fill_random_buf_percentage(rs, buf, perc,
-                                       this_write, this_write,
-                                       o->buffer_pattern,
-                                       o->buffer_pattern_bytes);
-                       } else {
-                               fill_random_buf(rs, buf, min_write);
-                               this_write = min_write;
-                       }
+                       fill_random_buf_percentage(rs, buf, perc,
+                               this_write, this_write,
+                               o->buffer_pattern,
+                               o->buffer_pattern_bytes);
 
                        buf += this_write;
                        left -= this_write;
@@ -2299,10 +2326,19 @@ int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
        struct fio_file *f = io_u->file;
        int ret;
 
+       if (td->o.zone_mode == ZONE_MODE_ZBD) {
+               ret = zbd_do_io_u_trim(td, io_u);
+               if (ret == io_u_completed)
+                       return io_u->xfer_buflen;
+               if (ret)
+                       goto err;
+       }
+
        ret = os_trim(f, io_u->offset, io_u->xfer_buflen);
        if (!ret)
                return io_u->xfer_buflen;
 
+err:
        io_u->error = ret;
        return 0;
 #endif
diff --git a/io_u.h b/io_u.h
index d4c5be4303b3dc4bb7ed9a3cb905b86852231401..bdbac52577afeff573e6895621dccc903acc1d0a 100644 (file)
--- a/io_u.h
+++ b/io_u.h
@@ -21,7 +21,7 @@ enum {
        IO_U_F_TRIMMED          = 1 << 5,
        IO_U_F_BARRIER          = 1 << 6,
        IO_U_F_VER_LIST         = 1 << 7,
-       IO_U_F_PRIORITY         = 1 << 8,
+       IO_U_F_HIGH_PRIO        = 1 << 8,
 };
 
 /*
@@ -46,6 +46,11 @@ struct io_u {
         */
        unsigned short numberio;
 
+       /*
+        * IO priority.
+        */
+       unsigned short ioprio;
+
        /*
         * Allocated/set buffer and length
         */
@@ -188,7 +193,6 @@ static inline enum fio_ddir acct_ddir(struct io_u *io_u)
        td_flags_clear((td), &(io_u->flags), (val))
 #define io_u_set(td, io_u, val)                \
        td_flags_set((td), &(io_u)->flags, (val))
-#define io_u_is_prio(io_u)     \
-       (io_u->flags & (unsigned int) IO_U_F_PRIORITY) != 0
+#define io_u_is_high_prio(io_u)        (io_u->flags & IO_U_F_HIGH_PRIO)
 
 #endif
index dd61af07a4432c802ba53d18535c2dc44ea2a8bf..d08a511a0635eccc090528dfa2ff242fd5719209 100644 (file)
@@ -692,17 +692,17 @@ int fio_show_ioengine_help(const char *engine)
        }
 
        td.o.ioengine = (char *)engine;
-       io_ops = load_ioengine(&td);
+       td.io_ops = load_ioengine(&td);
 
-       if (!io_ops) {
+       if (!td.io_ops) {
                log_info("IO engine %s not found\n", engine);
                return 1;
        }
 
-       if (io_ops->options)
-               ret = show_cmd_help(io_ops->options, sep);
+       if (td.io_ops->options)
+               ret = show_cmd_help(td.io_ops->options, sep);
        else
-               log_info("IO engine %s has no options\n", io_ops->name);
+               log_info("IO engine %s has no options\n", td.io_ops->name);
 
        free_ioengine(&td);
        return ret;
diff --git a/iolog.c b/iolog.c
index cf264916a9ecd83f7b3079bd186573441dba65fb..1aeb7a76b2b6ca21ab8bd3d5c18e5ec200669fcb 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -151,7 +151,8 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
 
        while (!flist_empty(&td->io_log_list)) {
                int ret;
-               if (td->o.read_iolog_chunked) {
+
+               if (!td->io_log_blktrace && td->o.read_iolog_chunked) {
                        if (td->io_log_checkmark == td->io_log_current) {
                                if (!read_iolog2(td))
                                        return 1;
@@ -706,10 +707,13 @@ bool init_iolog(struct thread_data *td)
                 * Check if it's a blktrace file and load that if possible.
                 * Otherwise assume it's a normal log file and load that.
                 */
-               if (is_blktrace(fname, &need_swap))
+               if (is_blktrace(fname, &need_swap)) {
+                       td->io_log_blktrace = 1;
                        ret = load_blktrace(td, fname, need_swap);
-               else
+               } else {
+                       td->io_log_blktrace = 0;
                        ret = init_iolog_read(td, fname);
+               }
        } else if (td->o.write_iolog_file)
                ret = init_iolog_write(td);
        else
@@ -733,6 +737,7 @@ void setup_log(struct io_log **log, struct log_params *p,
        INIT_FLIST_HEAD(&l->io_logs);
        l->log_type = p->log_type;
        l->log_offset = p->log_offset;
+       l->log_prio = p->log_prio;
        l->log_gz = p->log_gz;
        l->log_gz_store = p->log_gz_store;
        l->avg_msec = p->avg_msec;
@@ -765,6 +770,8 @@ void setup_log(struct io_log **log, struct log_params *p,
 
        if (l->log_offset)
                l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
+       if (l->log_prio)
+               l->log_ddir_mask |= LOG_PRIO_SAMPLE_BIT;
 
        INIT_FLIST_HEAD(&l->chunk_list);
 
@@ -891,33 +898,55 @@ static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
 void flush_samples(FILE *f, void *samples, uint64_t sample_size)
 {
        struct io_sample *s;
-       int log_offset;
+       int log_offset, log_prio;
        uint64_t i, nr_samples;
+       unsigned int prio_val;
+       const char *fmt;
 
        if (!sample_size)
                return;
 
        s = __get_sample(samples, 0, 0);
        log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+       log_prio = (s->__ddir & LOG_PRIO_SAMPLE_BIT) != 0;
+
+       if (log_offset) {
+               if (log_prio)
+                       fmt = "%lu, %" PRId64 ", %u, %llu, %llu, 0x%04x\n";
+               else
+                       fmt = "%lu, %" PRId64 ", %u, %llu, %llu, %u\n";
+       } else {
+               if (log_prio)
+                       fmt = "%lu, %" PRId64 ", %u, %llu, 0x%04x\n";
+               else
+                       fmt = "%lu, %" PRId64 ", %u, %llu, %u\n";
+       }
 
        nr_samples = sample_size / __log_entry_sz(log_offset);
 
        for (i = 0; i < nr_samples; i++) {
                s = __get_sample(samples, log_offset, i);
 
+               if (log_prio)
+                       prio_val = s->priority;
+               else
+                       prio_val = ioprio_value_is_class_rt(s->priority);
+
                if (!log_offset) {
-                       fprintf(f, "%lu, %" PRId64 ", %u, %llu, %u\n",
-                                       (unsigned long) s->time,
-                                       s->data.val,
-                                       io_sample_ddir(s), (unsigned long long) s->bs, s->priority_bit);
+                       fprintf(f, fmt,
+                               (unsigned long) s->time,
+                               s->data.val,
+                               io_sample_ddir(s), (unsigned long long) s->bs,
+                               prio_val);
                } else {
                        struct io_sample_offset *so = (void *) s;
 
-                       fprintf(f, "%lu, %" PRId64 ", %u, %llu, %llu, %u\n",
-                                       (unsigned long) s->time,
-                                       s->data.val,
-                                       io_sample_ddir(s), (unsigned long long) s->bs,
-                                       (unsigned long long) so->offset, s->priority_bit);
+                       fprintf(f, fmt,
+                               (unsigned long) s->time,
+                               s->data.val,
+                               io_sample_ddir(s), (unsigned long long) s->bs,
+                               (unsigned long long) so->offset,
+                               prio_val);
                }
        }
 }
diff --git a/iolog.h b/iolog.h
index 9e382cc0211748254f97dc8071176319d874a591..7d66b7c42fb6e4c0d455165d0ebf56467a7b7e0b 100644 (file)
--- a/iolog.h
+++ b/iolog.h
@@ -42,7 +42,7 @@ struct io_sample {
        uint64_t time;
        union io_sample_data data;
        uint32_t __ddir;
-       uint8_t priority_bit;
+       uint16_t priority;
        uint64_t bs;
 };
 
@@ -104,6 +104,11 @@ struct io_log {
         */
        unsigned int log_offset;
 
+       /*
+        * Log I/O priorities
+        */
+       unsigned int log_prio;
+
        /*
         * Max size of log entries before a chunk is compressed
         */
@@ -145,7 +150,13 @@ struct io_log {
  * If the upper bit is set, then we have the offset as well
  */
 #define LOG_OFFSET_SAMPLE_BIT  0x80000000U
-#define io_sample_ddir(io)     ((io)->__ddir & ~LOG_OFFSET_SAMPLE_BIT)
+/*
+ * If the bit following the upper bit is set, then we have the priority
+ */
+#define LOG_PRIO_SAMPLE_BIT    0x40000000U
+
+#define LOG_SAMPLE_BITS                (LOG_OFFSET_SAMPLE_BIT | LOG_PRIO_SAMPLE_BIT)
+#define io_sample_ddir(io)     ((io)->__ddir & ~LOG_SAMPLE_BITS)
 
 static inline void io_sample_set_ddir(struct io_log *log,
                                      struct io_sample *io,
@@ -262,6 +273,7 @@ struct log_params {
        int hist_coarseness;
        int log_type;
        int log_offset;
+       int log_prio;
        int log_gz;
        int log_gz_store;
        int log_compress;
index dc7ecd0d629c94afe22a599739e76d68cb39922c..99e1862a34c5047d487b6ad2f300f61969dd8c3c 100644 (file)
--- a/lib/fls.h
+++ b/lib/fls.h
@@ -32,7 +32,6 @@ static inline int __fls(int x)
                r -= 2;
        }
        if (!(x & 0x80000000u)) {
-               x <<= 1;
                r -= 1;
        }
        return r;
index 5eb6e60aeb6b651e88595b4ba9c2d875d1a5748d..6e893e80ba00a4939116cb59099e881300aaed3e 100644 (file)
@@ -59,7 +59,7 @@ static void __init_rand32(struct taus88_state *state, unsigned int seed)
                __rand32(state);
 }
 
-static void __init_rand64(struct taus258_state *state, uint64_t seed)
+void __init_rand64(struct taus258_state *state, uint64_t seed)
 {
        int cranks = 6;
 
@@ -125,10 +125,7 @@ void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
 uint64_t fill_random_buf(struct frand_state *fs, void *buf,
                         unsigned int len)
 {
-       uint64_t r = __rand(fs);
-
-       if (sizeof(int) != sizeof(long *))
-               r *= (unsigned long) __rand(fs);
+       uint64_t r = __get_next_seed(fs);
 
        __fill_random_buf(buf, len, r);
        return r;
@@ -188,10 +185,7 @@ uint64_t fill_random_buf_percentage(struct frand_state *fs, void *buf,
                                    unsigned int segment, unsigned int len,
                                    char *pattern, unsigned int pbytes)
 {
-       uint64_t r = __rand(fs);
-
-       if (sizeof(int) != sizeof(long *))
-               r *= (unsigned long) __rand(fs);
+       uint64_t r = __get_next_seed(fs);
 
        __fill_random_buf_percentage(r, buf, percentage, segment, len,
                                        pattern, pbytes);
index 46c1c5e023a132513ded05d885bd5769dbf7ceb9..2b4be78893a926d22a37006706a1caf4b0d3e431 100644 (file)
@@ -150,8 +150,19 @@ static inline uint64_t rand_between(struct frand_state *state, uint64_t start,
                return start + rand32_upto(state, end - start);
 }
 
+static inline uint64_t __get_next_seed(struct frand_state *fs)
+{
+       uint64_t r = __rand(fs);
+
+       if (sizeof(int) != sizeof(long *))
+               r *= (unsigned long) __rand(fs);
+
+       return r;
+}
+
 extern void init_rand(struct frand_state *, bool);
 extern void init_rand_seed(struct frand_state *, uint64_t seed, bool);
+void __init_rand64(struct taus258_state *state, uint64_t seed);
 extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed);
 extern uint64_t fill_random_buf(struct frand_state *, void *buf, unsigned int len);
 extern void __fill_random_buf_percentage(uint64_t, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
index 56f3e37dab5b35750a390714c3af2d9579de5628..ef3aa0918df461ba3f60cf99b9d2d1f83b17095b 100644 (file)
@@ -5,7 +5,11 @@
 #include "../arch/arch.h"
 
 struct seqlock {
+#ifdef __cplusplus
+       std::atomic<unsigned int> sequence;
+#else
        volatile unsigned int sequence;
+#endif
 };
 
 static inline void seqlock_init(struct seqlock *s)
index 6144a474738fe9a51a530ffad691aa234138d665..198eaf2eb723f744cbc279cd3dff51a1be64bd00 100644 (file)
--- a/libfio.c
+++ b/libfio.c
@@ -104,7 +104,7 @@ static void reset_io_counters(struct thread_data *td, int all)
        /*
         * reset file done count if we are to start over
         */
-       if (td->o.time_based || td->o.loops || td->o.do_verify)
+       if (td->o.time_based || td->loops > 1 || td->o.do_verify)
                td->nr_done_files = 0;
 }
 
@@ -140,7 +140,6 @@ void reset_all_stats(struct thread_data *td)
                td->io_issues[i] = 0;
                td->ts.total_io_u[i] = 0;
                td->ts.runtime[i] = 0;
-               td->rwmix_issues = 0;
        }
 
        set_epoch_time(td, td->o.log_unix_epoch);
diff --git a/log.c b/log.c
index 562a29aaddfe0e1d42de369cd8e5645afa96f7a3..237bac28899c9feaf603835040732d0901d1acbf 100644 (file)
--- a/log.c
+++ b/log.c
@@ -62,7 +62,7 @@ void log_prevalist(int type, const char *fmt, va_list args)
        free(buf1);
        if (len < 0)
                return;
-       len = log_info_buf(buf2, len);
+       log_info_buf(buf2, len);
        free(buf2);
 }
 #endif
index a8986d116716fb7f5df8e572caff0a6c8dfdc995..102bcf5661a0c0fbffd54a61ad85281809250d3a 100644 (file)
--- a/options.c
+++ b/options.c
@@ -73,13 +73,7 @@ static int bs_cmp(const void *p1, const void *p2)
        return (int) bsp1->perc - (int) bsp2->perc;
 }
 
-struct split {
-       unsigned int nr;
-       unsigned long long val1[ZONESPLIT_MAX];
-       unsigned long long val2[ZONESPLIT_MAX];
-};
-
-static int split_parse_ddir(struct thread_options *o, struct split *split,
+int split_parse_ddir(struct thread_options *o, struct split *split,
                            char *str, bool absolute, unsigned int max_splits)
 {
        unsigned long long perc;
@@ -138,8 +132,8 @@ static int split_parse_ddir(struct thread_options *o, struct split *split,
        return 0;
 }
 
-static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str,
-                       bool data)
+static int bssplit_ddir(struct thread_options *o, void *eo,
+                       enum fio_ddir ddir, char *str, bool data)
 {
        unsigned int i, perc, perc_missing;
        unsigned long long max_bs, min_bs;
@@ -211,10 +205,8 @@ static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str,
        return 0;
 }
 
-typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *, bool);
-
-static int str_split_parse(struct thread_data *td, char *str,
-                          split_parse_fn *fn, bool data)
+int str_split_parse(struct thread_data *td, char *str,
+                   split_parse_fn *fn, void *eo, bool data)
 {
        char *odir, *ddir;
        int ret = 0;
@@ -223,37 +215,37 @@ static int str_split_parse(struct thread_data *td, char *str,
        if (odir) {
                ddir = strchr(odir + 1, ',');
                if (ddir) {
-                       ret = fn(&td->o, DDIR_TRIM, ddir + 1, data);
+                       ret = fn(&td->o, eo, DDIR_TRIM, ddir + 1, data);
                        if (!ret)
                                *ddir = '\0';
                } else {
                        char *op;
 
                        op = strdup(odir + 1);
-                       ret = fn(&td->o, DDIR_TRIM, op, data);
+                       ret = fn(&td->o, eo, DDIR_TRIM, op, data);
 
                        free(op);
                }
                if (!ret)
-                       ret = fn(&td->o, DDIR_WRITE, odir + 1, data);
+                       ret = fn(&td->o, eo, DDIR_WRITE, odir + 1, data);
                if (!ret) {
                        *odir = '\0';
-                       ret = fn(&td->o, DDIR_READ, str, data);
+                       ret = fn(&td->o, eo, DDIR_READ, str, data);
                }
        } else {
                char *op;
 
                op = strdup(str);
-               ret = fn(&td->o, DDIR_WRITE, op, data);
+               ret = fn(&td->o, eo, DDIR_WRITE, op, data);
                free(op);
 
                if (!ret) {
                        op = strdup(str);
-                       ret = fn(&td->o, DDIR_TRIM, op, data);
+                       ret = fn(&td->o, eo, DDIR_TRIM, op, data);
                        free(op);
                }
                if (!ret)
-                       ret = fn(&td->o, DDIR_READ, str, data);
+                       ret = fn(&td->o, eo, DDIR_READ, str, data);
        }
 
        return ret;
@@ -270,7 +262,7 @@ static int str_bssplit_cb(void *data, const char *input)
        strip_blank_front(&str);
        strip_blank_end(str);
 
-       ret = str_split_parse(td, str, bssplit_ddir, false);
+       ret = str_split_parse(td, str, bssplit_ddir, NULL, false);
 
        if (parse_dryrun()) {
                int i;
@@ -906,8 +898,8 @@ static int str_sfr_cb(void *data, const char *str)
 }
 #endif
 
-static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir,
-                          char *str, bool absolute)
+static int zone_split_ddir(struct thread_options *o, void *eo,
+                          enum fio_ddir ddir, char *str, bool absolute)
 {
        unsigned int i, perc, perc_missing, sperc, sperc_missing;
        struct split split;
@@ -1012,7 +1004,7 @@ static int parse_zoned_distribution(struct thread_data *td, const char *input,
        }
        str += strlen(pre);
 
-       ret = str_split_parse(td, str, zone_split_ddir, absolute);
+       ret = str_split_parse(td, str, zone_split_ddir, NULL, absolute);
 
        free(p);
 
@@ -3688,6 +3680,20 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                },
                .parent = "thinktime",
        },
+       {
+               .name   = "thinktime_iotime",
+               .lname  = "Thinktime interval",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, thinktime_iotime),
+               .help   = "IO time interval between 'thinktime'",
+               .def    = "0",
+               .parent = "thinktime",
+               .hide   = 1,
+               .is_seconds = 1,
+               .is_time = 1,
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_THINKTIME,
+       },
        {
                .name   = "rate",
                .lname  = "I/O rate",
@@ -4238,6 +4244,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_LOG,
                .group  = FIO_OPT_G_INVALID,
        },
+       {
+               .name   = "log_entries",
+               .lname  = "Log entries",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, log_entries),
+               .help   = "Initial number of entries in a job IO log",
+               .def    = __fio_stringify(DEF_LOG_ENTRIES),
+               .minval = DEF_LOG_ENTRIES,
+               .maxval = MAX_LOG_ENTRIES,
+               .category = FIO_OPT_C_LOG,
+               .group  = FIO_OPT_G_INVALID,
+       },
        {
                .name   = "log_avg_msec",
                .lname  = "Log averaging (msec)",
@@ -4300,6 +4318,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_LOG,
                .group  = FIO_OPT_G_INVALID,
        },
+       {
+               .name   = "log_prio",
+               .lname  = "Log priority of IO",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct thread_options, log_prio),
+               .help   = "Include priority value of IO for each log entry",
+               .def    = "0",
+               .category = FIO_OPT_C_LOG,
+               .group  = FIO_OPT_G_INVALID,
+       },
 #ifdef CONFIG_ZLIB
        {
                .name   = "log_compression",
@@ -4497,6 +4525,40 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_IO_BUF,
        },
+       {
+               .name   = "dedupe_mode",
+               .lname  = "Dedupe mode",
+               .help   = "Mode for the deduplication buffer generation",
+               .type   = FIO_OPT_STR,
+               .off1   = offsetof(struct thread_options, dedupe_mode),
+               .parent = "dedupe_percentage",
+               .def    = "repeat",
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BUF,
+               .posval = {
+                          { .ival = "repeat",
+                            .oval = DEDUPE_MODE_REPEAT,
+                            .help = "repeat previous page",
+                          },
+                          { .ival = "working_set",
+                            .oval = DEDUPE_MODE_WORKING_SET,
+                            .help = "choose a page randomly from limited working set defined in dedupe_working_set_percentage",
+                          },
+               },
+       },
+       {
+               .name   = "dedupe_working_set_percentage",
+               .lname  = "Dedupe working set percentage",
+               .help   = "Dedupe working set size in percentages from file or device size used to generate dedupe patterns from",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct thread_options, dedupe_working_set_percentage),
+               .parent = "dedupe_percentage",
+               .def    = "5",
+               .maxval = 100,
+               .minval = 0,
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BUF,
+       },
        {
                .name   = "clat_percentiles",
                .lname  = "Completion latency percentiles",
index d39b45fddb2214293aee16c137023a5993680837..c45b5e9a93879a292b4456fd669dc33332d53769 100644 (file)
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
 /*
  * IO submission data structure (Submission Queue Entry)
  */
@@ -46,23 +42,25 @@ struct io_uring_sqe {
                __u32           statx_flags;
                __u32           fadvise_advice;
                __u32           splice_flags;
+               __u32           rename_flags;
+               __u32           unlink_flags;
+               __u32           hardlink_flags;
        };
        __u64   user_data;      /* data to be passed back at completion time */
+       /* pack this to avoid bogus arm OABI complaints */
        union {
-               struct {
-                       /* pack this to avoid bogus arm OABI complaints */
-                       union {
-                               /* index into fixed buffers, if used */
-                               __u16   buf_index;
-                               /* for grouped buffer selection */
-                               __u16   buf_group;
-                       } __attribute__((packed));
-                       /* personality to use, if used */
-                       __u16   personality;
-                       __s32   splice_fd_in;
-               };
-               __u64   __pad2[3];
+               /* index into fixed buffers, if used */
+               __u16   buf_index;
+               /* for grouped buffer selection */
+               __u16   buf_group;
+       } __attribute__((packed));
+       /* personality to use, if used */
+       __u16   personality;
+       union {
+               __s32   splice_fd_in;
+               __u32   file_index;
        };
+       __u64   __pad2[2];
 };
 
 enum {
@@ -99,6 +97,7 @@ enum {
 #define IORING_SETUP_CQSIZE    (1U << 3)       /* app defines CQ size */
 #define IORING_SETUP_CLAMP     (1U << 4)       /* clamp SQ/CQ ring sizes */
 #define IORING_SETUP_ATTACH_WQ (1U << 5)       /* attach to existing wq */
+#define IORING_SETUP_R_DISABLED        (1U << 6)       /* start with ring disabled */
 
 enum {
        IORING_OP_NOP,
@@ -135,6 +134,12 @@ enum {
        IORING_OP_PROVIDE_BUFFERS,
        IORING_OP_REMOVE_BUFFERS,
        IORING_OP_TEE,
+       IORING_OP_SHUTDOWN,
+       IORING_OP_RENAMEAT,
+       IORING_OP_UNLINKAT,
+       IORING_OP_MKDIRAT,
+       IORING_OP_SYMLINKAT,
+       IORING_OP_LINKAT,
 
        /* this goes last, obviously */
        IORING_OP_LAST,
@@ -148,14 +153,35 @@ enum {
 /*
  * sqe->timeout_flags
  */
-#define IORING_TIMEOUT_ABS     (1U << 0)
-
+#define IORING_TIMEOUT_ABS             (1U << 0)
+#define IORING_TIMEOUT_UPDATE          (1U << 1)
+#define IORING_TIMEOUT_BOOTTIME                (1U << 2)
+#define IORING_TIMEOUT_REALTIME                (1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE     (1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS   (1U << 5)
+#define IORING_TIMEOUT_CLOCK_MASK      (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK     (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
 /*
  * sqe->splice_flags
  * extends splice(2) flags
  */
 #define SPLICE_F_FD_IN_FIXED   (1U << 31) /* the last bit of __u32 */
 
+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI       Multishot poll. Sets IORING_CQE_F_MORE if
+ *                             the poll handler will continue to report
+ *                             CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE          Update existing poll request, matching
+ *                             sqe->addr as the old user_data field.
+ */
+#define IORING_POLL_ADD_MULTI  (1U << 0)
+#define IORING_POLL_UPDATE_EVENTS      (1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA   (1U << 2)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
@@ -169,8 +195,10 @@ struct io_uring_cqe {
  * cqe->flags
  *
  * IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE   If set, parent SQE will generate more CQE entries
  */
 #define IORING_CQE_F_BUFFER            (1U << 0)
+#define IORING_CQE_F_MORE              (1U << 1)
 
 enum {
        IORING_CQE_BUFFER_SHIFT         = 16,
@@ -228,6 +256,8 @@ struct io_cqring_offsets {
  */
 #define IORING_ENTER_GETEVENTS (1U << 0)
 #define IORING_ENTER_SQ_WAKEUP (1U << 1)
+#define IORING_ENTER_SQ_WAIT   (1U << 2)
+#define IORING_ENTER_EXT_ARG   (1U << 3)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -255,28 +285,85 @@ struct io_uring_params {
 #define IORING_FEAT_CUR_PERSONALITY    (1U << 4)
 #define IORING_FEAT_FAST_POLL          (1U << 5)
 #define IORING_FEAT_POLL_32BITS        (1U << 6)
+#define IORING_FEAT_SQPOLL_NONFIXED    (1U << 7)
+#define IORING_FEAT_EXT_ARG            (1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS     (1U << 9)
+#define IORING_FEAT_RSRC_TAGS          (1U << 10)
 
 /*
  * io_uring_register(2) opcodes and arguments
  */
-#define IORING_REGISTER_BUFFERS                0
-#define IORING_UNREGISTER_BUFFERS      1
-#define IORING_REGISTER_FILES          2
-#define IORING_UNREGISTER_FILES                3
-#define IORING_REGISTER_EVENTFD                4
-#define IORING_UNREGISTER_EVENTFD      5
-#define IORING_REGISTER_FILES_UPDATE   6
-#define IORING_REGISTER_EVENTFD_ASYNC  7
-#define IORING_REGISTER_PROBE          8
-#define IORING_REGISTER_PERSONALITY    9
-#define IORING_UNREGISTER_PERSONALITY  10
+enum {
+       IORING_REGISTER_BUFFERS                 = 0,
+       IORING_UNREGISTER_BUFFERS               = 1,
+       IORING_REGISTER_FILES                   = 2,
+       IORING_UNREGISTER_FILES                 = 3,
+       IORING_REGISTER_EVENTFD                 = 4,
+       IORING_UNREGISTER_EVENTFD               = 5,
+       IORING_REGISTER_FILES_UPDATE            = 6,
+       IORING_REGISTER_EVENTFD_ASYNC           = 7,
+       IORING_REGISTER_PROBE                   = 8,
+       IORING_REGISTER_PERSONALITY             = 9,
+       IORING_UNREGISTER_PERSONALITY           = 10,
+       IORING_REGISTER_RESTRICTIONS            = 11,
+       IORING_REGISTER_ENABLE_RINGS            = 12,
+
+       /* extended with tagging */
+       IORING_REGISTER_FILES2                  = 13,
+       IORING_REGISTER_FILES_UPDATE2           = 14,
+       IORING_REGISTER_BUFFERS2                = 15,
+       IORING_REGISTER_BUFFERS_UPDATE          = 16,
+
+       /* set/clear io-wq thread affinities */
+       IORING_REGISTER_IOWQ_AFF                = 17,
+       IORING_UNREGISTER_IOWQ_AFF              = 18,
+
+       /* set/get max number of io-wq workers */
+       IORING_REGISTER_IOWQ_MAX_WORKERS        = 19,
 
+       /* this goes last */
+       IORING_REGISTER_LAST
+};
+
+/* io-wq worker categories */
+enum {
+       IO_WQ_BOUND,
+       IO_WQ_UNBOUND,
+};
+
+/* deprecated, see struct io_uring_rsrc_update */
 struct io_uring_files_update {
        __u32 offset;
        __u32 resv;
        __aligned_u64 /* __s32 * */ fds;
 };
 
+struct io_uring_rsrc_register {
+       __u32 nr;
+       __u32 resv;
+       __u64 resv2;
+       __aligned_u64 data;
+       __aligned_u64 tags;
+};
+
+struct io_uring_rsrc_update {
+       __u32 offset;
+       __u32 resv;
+       __aligned_u64 data;
+};
+
+struct io_uring_rsrc_update2 {
+       __u32 offset;
+       __u32 resv;
+       __aligned_u64 data;
+       __aligned_u64 tags;
+       __u32 nr;
+       __u32 resv2;
+};
+
+/* Skip updating fd indexes set to this value in the fd table */
+#define IORING_REGISTER_FILES_SKIP     (-2)
+
 #define IO_URING_OP_SUPPORTED  (1U << 0)
 
 struct io_uring_probe_op {
@@ -294,8 +381,41 @@ struct io_uring_probe {
        struct io_uring_probe_op ops[0];
 };
 
-#ifdef __cplusplus
-}
-#endif
+struct io_uring_restriction {
+       __u16 opcode;
+       union {
+               __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+               __u8 sqe_op;      /* IORING_RESTRICTION_SQE_OP */
+               __u8 sqe_flags;   /* IORING_RESTRICTION_SQE_FLAGS_* */
+       };
+       __u8 resv;
+       __u32 resv2[3];
+};
+
+/*
+ * io_uring_restriction->opcode values
+ */
+enum {
+       /* Allow an io_uring_register(2) opcode */
+       IORING_RESTRICTION_REGISTER_OP          = 0,
+
+       /* Allow an sqe opcode */
+       IORING_RESTRICTION_SQE_OP               = 1,
+
+       /* Allow sqe flags */
+       IORING_RESTRICTION_SQE_FLAGS_ALLOWED    = 2,
+
+       /* Require sqe flags (these flags must be set on each submission) */
+       IORING_RESTRICTION_SQE_FLAGS_REQUIRED   = 3,
+
+       IORING_RESTRICTION_LAST
+};
+
+struct io_uring_getevents_arg {
+       __u64   sigmask;
+       __u32   sigmask_sz;
+       __u32   pad;
+       __u64   ts;
+};
 
 #endif
index a81cd815e1a7df5b0328fbda515edbfa10f40042..10c51b8318f87a4edb68f1fc4264ca5ea3835667 100644 (file)
@@ -173,16 +173,26 @@ enum {
 #define IOPRIO_MIN_PRIO_CLASS  0
 #define IOPRIO_MAX_PRIO_CLASS  3
 
-static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
+static inline int ioprio_value(int ioprio_class, int ioprio)
 {
        /*
         * If no class is set, assume BE
         */
-       if (!ioprio_class)
-               ioprio_class = IOPRIO_CLASS_BE;
+        if (!ioprio_class)
+                ioprio_class = IOPRIO_CLASS_BE;
+
+       return (ioprio_class << IOPRIO_CLASS_SHIFT) | ioprio;
+}
+
+static inline bool ioprio_value_is_class_rt(unsigned int priority)
+{
+       return (priority >> IOPRIO_CLASS_SHIFT) == IOPRIO_CLASS_RT;
+}
 
-       ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT;
-       return syscall(__NR_ioprio_set, which, who, ioprio);
+static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
+{
+       return syscall(__NR_ioprio_set, which, who,
+                      ioprio_value(ioprio_class, ioprio));
 }
 
 #ifndef BLKGETSIZE64
@@ -299,4 +309,8 @@ static inline int fio_set_sched_idle(void)
 }
 #endif
 
+#ifndef RWF_UNCACHED
+#define RWF_UNCACHED   0x00000040
+#endif
+
 #endif
index 6e46589450eaaf6b318724d61a157813e92f144d..5b37a37e19fde5f1b328612968cee9975dd79065 100644 (file)
@@ -171,6 +171,7 @@ static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask)
  * ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro.
  * Note that there is no idea of class within ioprio_set(2) unlike Linux.
  */
+#define ioprio_value(ioprio_class, ioprio)     (ioprio)
 #define ioprio_set(which, who, ioprio_class, ioprio)   \
        ioprio_set(which, who, ioprio)
 
index f7137abe1b872a9821cfa4ea73dd799627597f5c..3001140ca486630d6bb60a2958c01d824fbd7fc8 100644 (file)
 #include <errno.h>
 #include <sched.h>
 #include <linux/unistd.h>
-#include <linux/raw.h>
 #include <linux/major.h>
 #include <linux/fs.h>
 #include <scsi/sg.h>
 
 #ifdef ARCH_HAVE_CRC_CRYPTO
 #include <sys/auxv.h>
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL             (1 << 4)
+#endif /* HWCAP_PMULL */
 #ifndef HWCAP_CRC32
 #define HWCAP_CRC32             (1 << 7)
 #endif /* HWCAP_CRC32 */
@@ -41,7 +43,6 @@
 #define FIO_HAVE_IOSCHED_SWITCH
 #define FIO_HAVE_ODIRECT
 #define FIO_HAVE_HUGETLB
-#define FIO_HAVE_RAWBIND
 #define FIO_HAVE_BLKTRACE
 #define FIO_HAVE_CL_SIZE
 #define FIO_HAVE_CGROUPS
@@ -120,16 +121,26 @@ enum {
 #define IOPRIO_MIN_PRIO_CLASS  0
 #define IOPRIO_MAX_PRIO_CLASS  3
 
-static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
+static inline int ioprio_value(int ioprio_class, int ioprio)
 {
        /*
         * If no class is set, assume BE
         */
-       if (!ioprio_class)
-               ioprio_class = IOPRIO_CLASS_BE;
+        if (!ioprio_class)
+                ioprio_class = IOPRIO_CLASS_BE;
+
+       return (ioprio_class << IOPRIO_CLASS_SHIFT) | ioprio;
+}
 
-       ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT;
-       return syscall(__NR_ioprio_set, which, who, ioprio);
+static inline bool ioprio_value_is_class_rt(unsigned int priority)
+{
+       return (priority >> IOPRIO_CLASS_SHIFT) == IOPRIO_CLASS_RT;
+}
+
+static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
+{
+       return syscall(__NR_ioprio_set, which, who,
+                      ioprio_value(ioprio_class, ioprio));
 }
 
 #ifndef CONFIG_HAVE_GETTID
@@ -178,36 +189,6 @@ static inline unsigned long long os_phys_mem(void)
        return (unsigned long long) pages * (unsigned long long) pagesize;
 }
 
-static inline int fio_lookup_raw(dev_t dev, int *majdev, int *mindev)
-{
-       struct raw_config_request rq;
-       int fd;
-
-       if (major(dev) != RAW_MAJOR)
-               return 1;
-
-       /*
-        * we should be able to find /dev/rawctl or /dev/raw/rawctl
-        */
-       fd = open("/dev/rawctl", O_RDONLY);
-       if (fd < 0) {
-               fd = open("/dev/raw/rawctl", O_RDONLY);
-               if (fd < 0)
-                       return 1;
-       }
-
-       rq.raw_minor = minor(dev);
-       if (ioctl(fd, RAW_GETBIND, &rq) < 0) {
-               close(fd);
-               return 1;
-       }
-
-       close(fd);
-       *majdev = rq.block_major;
-       *mindev = rq.block_minor;
-       return 0;
-}
-
 #ifdef O_NOATIME
 #define FIO_O_NOATIME  O_NOATIME
 #else
@@ -427,7 +408,8 @@ static inline bool os_cpu_has(cpu_features feature)
 #ifdef ARCH_HAVE_CRC_CRYPTO
        case CPU_ARM64_CRC32C:
                hwcap = getauxval(AT_HWCAP);
-               have_feature = (hwcap & HWCAP_CRC32) != 0;
+               have_feature = (hwcap & (HWCAP_PMULL | HWCAP_CRC32)) ==
+                              (HWCAP_PMULL | HWCAP_CRC32);
                break;
 #endif
        default:
index ddfae41344cfe23c5bac09ef9dfd4f6ace0e95c1..59da9dba1a2c3fbf09200f313a539926e9de762a 100644 (file)
@@ -77,6 +77,7 @@
 #define SIGCONT        0
 #define SIGUSR1        1
 #define SIGUSR2 2
+#define SIGKILL 15 /* SIGKILL doesn't exist, let's use SIGTERM */
 
 typedef int sigset_t;
 typedef int siginfo_t;
diff --git a/os/os.h b/os/os.h
index e47d3d97064e2b897a9036f680470caf2b3b4f45..5965d7b806b055bf8f2c679da818516a503eb6db 100644 (file)
--- a/os/os.h
+++ b/os/os.h
@@ -117,7 +117,11 @@ static inline int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index)
 extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #endif
 
+#ifndef FIO_HAVE_IOPRIO_CLASS
+#define ioprio_value_is_class_rt(prio) (false)
+#endif
 #ifndef FIO_HAVE_IOPRIO
+#define ioprio_value(prioclass, prio)  (0)
 #define ioprio_set(which, who, prioclass, prio)        (0)
 #endif
 
@@ -157,10 +161,6 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #define OS_RAND_MAX                    RAND_MAX
 #endif
 
-#ifndef FIO_HAVE_RAWBIND
-#define fio_lookup_raw(dev, majdev, mindev)    1
-#endif
-
 #ifndef FIO_PREFERRED_ENGINE
 #define FIO_PREFERRED_ENGINE   "psync"
 #endif
@@ -412,4 +412,13 @@ static inline bool os_cpu_has(cpu_features feature)
 # define fio_mkdir(path, mode) mkdir(path, mode)
 #endif
 
+#ifdef _SC_CLK_TCK
+static inline void os_clk_tck(long *clk_tck)
+{
+       *clk_tck = sysconf(_SC_CLK_TCK);
+}
+#else
+extern void os_clk_tck(long *clk_tck);
+#endif
+
 #endif /* FIO_OS_H */
diff --git a/os/windows/dlls.c b/os/windows/dlls.c
new file mode 100644 (file)
index 0000000..774b1c6
--- /dev/null
@@ -0,0 +1,33 @@
+#include "os/os.h"
+
+#include <windows.h>
+
+void os_clk_tck(long *clk_tck)
+{
+       /*
+        * The timer resolution is variable on Windows. Try to query it 
+        * or use 64 Hz, the clock frequency lower bound. See also
+        * https://carpediemsystems.co.uk/2019/07/18/windows-system-timer-granularity/.
+        */
+       unsigned long minRes, maxRes, curRes;
+       HMODULE lib;
+       FARPROC queryTimer;
+       FARPROC setTimer;
+
+       if (!(lib = LoadLibrary(TEXT("ntdll.dll"))) ||
+               !(queryTimer = GetProcAddress(lib, "NtQueryTimerResolution")) ||
+               !(setTimer = GetProcAddress(lib, "NtSetTimerResolution"))) {
+               dprint(FD_HELPERTHREAD, 
+                       "Failed to load ntdll library, set to lower bound 64 Hz\n");
+               *clk_tck = 64;
+       } else {
+               queryTimer(&minRes, &maxRes, &curRes);
+               dprint(FD_HELPERTHREAD, 
+                       "minRes = %lu, maxRes = %lu, curRes = %lu\n",
+                       minRes, maxRes, curRes);
+
+               /* Use maximum resolution for most accurate timestamps */
+               setTimer(maxRes, 1, &curRes);
+               *clk_tck = (long) (10000000L / maxRes);
+       }
+}
\ No newline at end of file
index 4e441d29b8de2b4173684f8f5d79744192dd0bff..185bd5011bbc5105e7bf67a171b0d93a93727ae2 100644 (file)
@@ -169,8 +169,10 @@ int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
                return -EIO;
 
        max_open_str = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones");
-       if (!max_open_str)
+       if (!max_open_str) {
+               *max_open_zones = 0;
                return 0;
+       }
 
        dprint(FD_ZBD, "%s: max open zones supported by device: %s\n",
               f->file_name, max_open_str);
diff --git a/parse.c b/parse.c
index 45f4f2d3dd6db800f4f8e11a168c8ef36e788d3b..d086ee488f956048ff654082e8717ed49f5347df 100644 (file)
--- a/parse.c
+++ b/parse.c
@@ -477,13 +477,17 @@ static int check_int(const char *p, int *val)
 
 static size_t opt_len(const char *str)
 {
+       char delimiter[] = {',', ':'};
        char *postfix;
+       unsigned int i;
 
-       postfix = strchr(str, ':');
-       if (!postfix)
-               return strlen(str);
+       for (i = 0; i < FIO_ARRAY_SIZE(delimiter); i++) {
+               postfix = strchr(str, delimiter[i]);
+               if (postfix)
+                       return (int)(postfix - str);
+       }
 
-       return (int)(postfix - str);
+       return strlen(str);
 }
 
 static int str_match_len(const struct value_pair *vp, const char *str)
index 8daefbabfeae93f6c260c0b74eec6fedc7bbd973..90c52e01ac231f4972bbd94e91361d3fe61ad132 100644 (file)
--- a/server.c
+++ b/server.c
@@ -409,8 +409,9 @@ struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait)
                        if (cmdret->opcode == FIO_NET_CMD_TEXT) {
                                struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload;
                                char *buf = (char *) __pdu->buf;
+                               int len = le32_to_cpu(__pdu->buf_len);
 
-                               buf[__pdu->buf_len] = '\0';
+                               buf[len] = '\0';
                        } else if (cmdret->opcode == FIO_NET_CMD_JOB) {
                                struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload;
                                char *buf = (char *) __pdu->buf;
@@ -2456,6 +2457,11 @@ static void set_sig_handlers(void)
        };
 
        sigaction(SIGINT, &act, NULL);
+
+       /* Windows uses SIGBREAK as a quit signal from other applications */
+#ifdef WIN32
+       sigaction(SIGBREAK, &act, NULL);
+#endif
 }
 
 void fio_server_destroy_sk_key(void)
@@ -2564,6 +2570,7 @@ static int write_pid(pid_t pid, const char *pidfile)
  */
 int fio_start_server(char *pidfile)
 {
+       FILE *file;
        pid_t pid;
        int ret;
 
@@ -2596,14 +2603,28 @@ int fio_start_server(char *pidfile)
        setsid();
        openlog("fio", LOG_NDELAY|LOG_NOWAIT|LOG_PID, LOG_USER);
        log_syslog = true;
-       close(STDIN_FILENO);
-       close(STDOUT_FILENO);
-       close(STDERR_FILENO);
+
+       file = freopen("/dev/null", "r", stdin);
+       if (!file)
+               perror("freopen");
+
+       file = freopen("/dev/null", "w", stdout);
+       if (!file)
+               perror("freopen");
+
+       file = freopen("/dev/null", "w", stderr);
+       if (!file)
+               perror("freopen");
+
        f_out = NULL;
        f_err = NULL;
 
        ret = fio_server();
 
+       fclose(stdin);
+       fclose(stdout);
+       fclose(stderr);
+
        closelog();
        unlink(pidfile);
        free(pidfile);
index c128df28adda112d04ad67549e1d0fa4194f5b1c..25b6bbdc25dfb6f6f45e7ddb318d6a5cb9030156 100644 (file)
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-       FIO_SERVER_VER                  = 91,
+       FIO_SERVER_VER                  = 95,
 
        FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
        FIO_SERVER_MAX_CMD_MB           = 2048,
@@ -193,6 +193,7 @@ struct cmd_iolog_pdu {
        uint32_t log_type;
        uint32_t compressed;
        uint32_t log_offset;
+       uint32_t log_prio;
        uint32_t log_hist_coarseness;
        uint8_t name[FIO_NET_NAME_MAX];
        struct io_sample samples[0];
diff --git a/stat.c b/stat.c
index a8a96c85a4120b0d70dee939c9102fe340565aae..7e84058d9b911294956d0970e218cf97ea0a3344 100644 (file)
--- a/stat.c
+++ b/stat.c
 #include "zbd.h"
 #include "oslib/asprintf.h"
 
+#ifdef WIN32
+#define LOG_MSEC_SLACK 2
+#else
 #define LOG_MSEC_SLACK 1
+#endif
 
 struct fio_sem *stat_sem;
 
@@ -211,7 +215,7 @@ static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr,
 
        len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv);
        if (!len || !ovals)
-               goto out;
+               return;
 
        /*
         * We default to nsecs, but if the value range is such that we
@@ -258,7 +262,6 @@ static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr,
                        log_buf(out, "\n");
        }
 
-out:
        free(ovals);
 }
 
@@ -480,22 +483,13 @@ static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_sta
        struct thread_stat *ts_lcl;
 
        int i2p;
-       int ddir = 0, i;
+       int ddir = 0;
 
        /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
        ts_lcl = malloc(sizeof(struct thread_stat));
        memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
        ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-       }
-       ts_lcl->sync_stat.min_val = ULONG_MAX;
+       init_thread_stat_min_vals(ts_lcl);
 
        sum_thread_stats(ts_lcl, ts, 1);
 
@@ -1463,22 +1457,12 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
                                   int ver, struct buf_output *out)
 {
        struct thread_stat *ts_lcl;
-       int i;
 
        /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
        ts_lcl = malloc(sizeof(struct thread_stat));
        memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
        ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-       }
-       ts_lcl->sync_stat.min_val = ULONG_MAX;
+       init_thread_stat_min_vals(ts_lcl);
        ts_lcl->lat_percentiles = ts->lat_percentiles;
        ts_lcl->clat_percentiles = ts->clat_percentiles;
        ts_lcl->slat_percentiles = ts->slat_percentiles;
@@ -1665,22 +1649,12 @@ static void add_mixed_ddir_status_json(struct thread_stat *ts,
                struct group_run_stats *rs, struct json_object *parent)
 {
        struct thread_stat *ts_lcl;
-       int i;
 
        /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
        ts_lcl = malloc(sizeof(struct thread_stat));
        memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
        ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
-               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
-               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
-               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
-       }
-       ts_lcl->sync_stat.min_val = ULONG_MAX;
+       init_thread_stat_min_vals(ts_lcl);
        ts_lcl->lat_percentiles = ts->lat_percentiles;
        ts_lcl->clat_percentiles = ts->clat_percentiles;
        ts_lcl->slat_percentiles = ts->slat_percentiles;
@@ -2267,22 +2241,27 @@ void init_group_run_stat(struct group_run_stats *gs)
                gs->min_bw[i] = gs->min_run[i] = ~0UL;
 }
 
-void init_thread_stat(struct thread_stat *ts)
+void init_thread_stat_min_vals(struct thread_stat *ts)
 {
-       int j;
+       int i;
 
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               ts->clat_stat[i].min_val = ULONG_MAX;
+               ts->slat_stat[i].min_val = ULONG_MAX;
+               ts->lat_stat[i].min_val = ULONG_MAX;
+               ts->bw_stat[i].min_val = ULONG_MAX;
+               ts->iops_stat[i].min_val = ULONG_MAX;
+               ts->clat_high_prio_stat[i].min_val = ULONG_MAX;
+               ts->clat_low_prio_stat[i].min_val = ULONG_MAX;
+       }
+       ts->sync_stat.min_val = ULONG_MAX;
+}
+
+void init_thread_stat(struct thread_stat *ts)
+{
        memset(ts, 0, sizeof(*ts));
 
-       for (j = 0; j < DDIR_RWDIR_CNT; j++) {
-               ts->lat_stat[j].min_val = -1UL;
-               ts->clat_stat[j].min_val = -1UL;
-               ts->slat_stat[j].min_val = -1UL;
-               ts->bw_stat[j].min_val = -1UL;
-               ts->iops_stat[j].min_val = -1UL;
-               ts->clat_high_prio_stat[j].min_val = -1UL;
-               ts->clat_low_prio_stat[j].min_val = -1UL;
-       }
-       ts->sync_stat.min_val = -1UL;
+       init_thread_stat_min_vals(ts);
        ts->groupid = -1;
 }
 
@@ -2709,27 +2688,25 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
  */
 static struct io_logs *get_new_log(struct io_log *iolog)
 {
-       size_t new_size, new_samples;
+       size_t new_samples;
        struct io_logs *cur_log;
 
        /*
         * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling
         * forever
         */
-       if (!iolog->cur_log_max)
-               new_samples = DEF_LOG_ENTRIES;
-       else {
+       if (!iolog->cur_log_max) {
+               new_samples = iolog->td->o.log_entries;
+       } else {
                new_samples = iolog->cur_log_max * 2;
                if (new_samples > MAX_LOG_ENTRIES)
                        new_samples = MAX_LOG_ENTRIES;
        }
 
-       new_size = new_samples * log_entry_sz(iolog);
-
        cur_log = smalloc(sizeof(*cur_log));
        if (cur_log) {
                INIT_FLIST_HEAD(&cur_log->list);
-               cur_log->log = malloc(new_size);
+               cur_log->log = calloc(new_samples, log_entry_sz(iolog));
                if (cur_log->log) {
                        cur_log->nr_samples = 0;
                        cur_log->max_samples = new_samples;
@@ -2860,7 +2837,8 @@ static struct io_logs *get_cur_log(struct io_log *iolog)
 
 static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
                             enum fio_ddir ddir, unsigned long long bs,
-                            unsigned long t, uint64_t offset, uint8_t priority_bit)
+                            unsigned long t, uint64_t offset,
+                            unsigned int priority)
 {
        struct io_logs *cur_log;
 
@@ -2879,7 +2857,7 @@ static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
                s->time = t + (iolog->td ? iolog->td->unix_epoch : 0);
                io_sample_set_ddir(iolog, s, ddir);
                s->bs = bs;
-               s->priority_bit = priority_bit;
+               s->priority = priority;
 
                if (iolog->log_offset) {
                        struct io_sample_offset *so = (void *) s;
@@ -2956,7 +2934,7 @@ void reset_io_stats(struct thread_data *td)
 }
 
 static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
-                             unsigned long elapsed, bool log_max, uint8_t priority_bit)
+                             unsigned long elapsed, bool log_max)
 {
        /*
         * Note an entry in the log. Use the mean from the logged samples,
@@ -2971,26 +2949,26 @@ static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
                else
                        data.val = iolog->avg_window[ddir].mean.u.f + 0.50;
 
-               __add_log_sample(iolog, data, ddir, 0, elapsed, 0, priority_bit);
+               __add_log_sample(iolog, data, ddir, 0, elapsed, 0, 0);
        }
 
        reset_io_stat(&iolog->avg_window[ddir]);
 }
 
 static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
-                            bool log_max, uint8_t priority_bit)
+                            bool log_max)
 {
        int ddir;
 
        for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
-               __add_stat_to_log(iolog, ddir, elapsed, log_max, priority_bit);
+               __add_stat_to_log(iolog, ddir, elapsed, log_max);
 }
 
 static unsigned long add_log_sample(struct thread_data *td,
                                    struct io_log *iolog,
                                    union io_sample_data data,
                                    enum fio_ddir ddir, unsigned long long bs,
-                                   uint64_t offset, uint8_t priority_bit)
+                                   uint64_t offset, unsigned int ioprio)
 {
        unsigned long elapsed, this_window;
 
@@ -3003,7 +2981,8 @@ static unsigned long add_log_sample(struct thread_data *td,
         * If no time averaging, just add the log sample.
         */
        if (!iolog->avg_msec) {
-               __add_log_sample(iolog, data, ddir, bs, elapsed, offset, priority_bit);
+               __add_log_sample(iolog, data, ddir, bs, elapsed, offset,
+                                ioprio);
                return 0;
        }
 
@@ -3027,7 +3006,7 @@ static unsigned long add_log_sample(struct thread_data *td,
                        return diff;
        }
 
-       __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0, priority_bit);
+       __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0);
 
        iolog->avg_last[ddir] = elapsed - (elapsed % iolog->avg_msec);
 
@@ -3041,19 +3020,19 @@ void finalize_logs(struct thread_data *td, bool unit_logs)
        elapsed = mtime_since_now(&td->epoch);
 
        if (td->clat_log && unit_logs)
-               _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0, 0);
+               _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0);
        if (td->slat_log && unit_logs)
-               _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0, 0);
+               _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0);
        if (td->lat_log && unit_logs)
-               _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0, 0);
+               _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0);
        if (td->bw_log && (unit_logs == per_unit_log(td->bw_log)))
-               _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0, 0);
+               _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0);
        if (td->iops_log && (unit_logs == per_unit_log(td->iops_log)))
-               _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0, 0);
+               _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0);
 }
 
-void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned long long bs,
-                                       uint8_t priority_bit)
+void add_agg_sample(union io_sample_data data, enum fio_ddir ddir,
+                   unsigned long long bs)
 {
        struct io_log *iolog;
 
@@ -3061,7 +3040,7 @@ void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned long
                return;
 
        iolog = agg_io_log[ddir];
-       __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, priority_bit);
+       __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, 0);
 }
 
 void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec)
@@ -3073,8 +3052,10 @@ void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec)
        add_stat_sample(&ts->sync_stat, nsec);
 }
 
-static void add_lat_percentile_sample_noprio(struct thread_stat *ts,
-                               unsigned long long nsec, enum fio_ddir ddir, enum fio_lat lat)
+static inline void add_lat_percentile_sample(struct thread_stat *ts,
+                                            unsigned long long nsec,
+                                            enum fio_ddir ddir,
+                                            enum fio_lat lat)
 {
        unsigned int idx = plat_val_to_idx(nsec);
        assert(idx < FIO_IO_U_PLAT_NR);
@@ -3082,15 +3063,14 @@ static void add_lat_percentile_sample_noprio(struct thread_stat *ts,
        ts->io_u_plat[lat][ddir][idx]++;
 }
 
-static void add_lat_percentile_sample(struct thread_stat *ts,
-                               unsigned long long nsec, enum fio_ddir ddir, uint8_t priority_bit,
-                               enum fio_lat lat)
+static inline void add_lat_percentile_prio_sample(struct thread_stat *ts,
+                                                 unsigned long long nsec,
+                                                 enum fio_ddir ddir,
+                                                 bool high_prio)
 {
        unsigned int idx = plat_val_to_idx(nsec);
 
-       add_lat_percentile_sample_noprio(ts, nsec, ddir, lat);
-
-       if (!priority_bit)
+       if (!high_prio)
                ts->io_u_plat_low_prio[ddir][idx]++;
        else
                ts->io_u_plat_high_prio[ddir][idx]++;
@@ -3098,7 +3078,7 @@ static void add_lat_percentile_sample(struct thread_stat *ts,
 
 void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
                     unsigned long long nsec, unsigned long long bs,
-                    uint64_t offset, uint8_t priority_bit)
+                    uint64_t offset, unsigned int ioprio, bool high_prio)
 {
        const bool needs_lock = td_async_processing(td);
        unsigned long elapsed, this_window;
@@ -3110,8 +3090,17 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 
        add_stat_sample(&ts->clat_stat[ddir], nsec);
 
+       /*
+        * When lat_percentiles=1 (default 0), the reported high/low priority
+        * percentiles and stats are used for describing total latency values,
+        * even though the variable names themselves start with clat_.
+        *
+        * Because of the above definition, add a prio stat sample only when
+        * lat_percentiles=0. add_lat_sample() will add the prio stat sample
+        * when lat_percentiles=1.
+        */
        if (!ts->lat_percentiles) {
-               if (priority_bit)
+               if (high_prio)
                        add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
                else
                        add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
@@ -3119,13 +3108,18 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 
        if (td->clat_log)
                add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
-                              offset, priority_bit);
+                              offset, ioprio);
 
        if (ts->clat_percentiles) {
-               if (ts->lat_percentiles)
-                       add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_CLAT);
-               else
-                       add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_CLAT);
+               /*
+                * Because of the above definition, add a prio lat percentile
+                * sample only when lat_percentiles=0. add_lat_sample() will add
+                * the prio lat percentile sample when lat_percentiles=1.
+                */
+               add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT);
+               if (!ts->lat_percentiles)
+                       add_lat_percentile_prio_sample(ts, nsec, ddir,
+                                                      high_prio);
        }
 
        if (iolog && iolog->hist_msec) {
@@ -3154,7 +3148,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
                                FIO_IO_U_PLAT_NR * sizeof(uint64_t));
                        flist_add(&dst->list, &hw->list);
                        __add_log_sample(iolog, sample_plat(dst), ddir, bs,
-                                               elapsed, offset, priority_bit);
+                                        elapsed, offset, ioprio);
 
                        /*
                         * Update the last time we recorded as being now, minus
@@ -3171,8 +3165,8 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 }
 
 void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
-                       unsigned long long nsec, unsigned long long bs, uint64_t offset,
-                       uint8_t priority_bit)
+                    unsigned long long nsec, unsigned long long bs,
+                    uint64_t offset, unsigned int ioprio)
 {
        const bool needs_lock = td_async_processing(td);
        struct thread_stat *ts = &td->ts;
@@ -3186,11 +3180,11 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
        add_stat_sample(&ts->slat_stat[ddir], nsec);
 
        if (td->slat_log)
-               add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs, offset,
-                       priority_bit);
+               add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs,
+                              offset, ioprio);
 
        if (ts->slat_percentiles)
-               add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_SLAT);
+               add_lat_percentile_sample(ts, nsec, ddir, FIO_SLAT);
 
        if (needs_lock)
                __td_io_u_unlock(td);
@@ -3198,7 +3192,7 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
 
 void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
                    unsigned long long nsec, unsigned long long bs,
-                   uint64_t offset, uint8_t priority_bit)
+                   uint64_t offset, unsigned int ioprio, bool high_prio)
 {
        const bool needs_lock = td_async_processing(td);
        struct thread_stat *ts = &td->ts;
@@ -3213,11 +3207,22 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
 
        if (td->lat_log)
                add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs,
-                              offset, priority_bit);
+                              offset, ioprio);
 
+       /*
+        * When lat_percentiles=1 (default 0), the reported high/low priority
+        * percentiles and stats are used for describing total latency values,
+        * even though the variable names themselves start with clat_.
+        *
+        * Because of the above definition, add a prio stat and prio lat
+        * percentile sample only when lat_percentiles=1. add_clat_sample() will
+        * add the prio stat and prio lat percentile sample when
+        * lat_percentiles=0.
+        */
        if (ts->lat_percentiles) {
-               add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_LAT);
-               if (priority_bit)
+               add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT);
+               add_lat_percentile_prio_sample(ts, nsec, ddir, high_prio);
+               if (high_prio)
                        add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
                else
                        add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
@@ -3246,7 +3251,7 @@ void add_bw_sample(struct thread_data *td, struct io_u *io_u,
 
        if (td->bw_log)
                add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir,
-                              bytes, io_u->offset, io_u_is_prio(io_u));
+                              bytes, io_u->offset, io_u->ioprio);
 
        td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir];
 
@@ -3300,7 +3305,8 @@ static int __add_samples(struct thread_data *td, struct timespec *parent_tv,
                        if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
                                bs = td->o.min_bs[ddir];
 
-                       next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0, 0);
+                       next = add_log_sample(td, log, sample_val(rate), ddir,
+                                             bs, 0, 0);
                        next_log = min(next_log, next);
                }
 
@@ -3340,7 +3346,7 @@ void add_iops_sample(struct thread_data *td, struct io_u *io_u,
 
        if (td->iops_log)
                add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir,
-                              bytes, io_u->offset, io_u_is_prio(io_u));
+                              bytes, io_u->offset, io_u->ioprio);
 
        td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir];
 
diff --git a/stat.h b/stat.h
index d08d4dc09780720eea3f1fa5896d12a11bae5e40..9ef8caa438870a97eb6ff6fb7a4786d86d9db4af 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -327,6 +327,7 @@ extern void show_running_run_stats(void);
 extern void check_for_running_stats(void);
 extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
 extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
+extern void init_thread_stat_min_vals(struct thread_stat *ts);
 extern void init_thread_stat(struct thread_stat *ts);
 extern void init_group_run_stat(struct group_run_stats *gs);
 extern void eta_to_str(char *str, unsigned long eta_sec);
@@ -341,13 +342,12 @@ extern void update_rusage_stat(struct thread_data *);
 extern void clear_rusage_stat(struct thread_data *);
 
 extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-                               unsigned long long, uint64_t, uint8_t);
+                          unsigned long long, uint64_t, unsigned int, bool);
 extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-                               unsigned long long, uint64_t, uint8_t);
+                           unsigned long long, uint64_t, unsigned int, bool);
 extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-                               unsigned long long, uint64_t, uint8_t);
-extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long bs,
-                               uint8_t priority_bit);
+                               unsigned long long, uint64_t, unsigned int);
+extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long);
 extern void add_iops_sample(struct thread_data *, struct io_u *,
                                unsigned int);
 extern void add_bw_sample(struct thread_data *, struct io_u *,
index 68d31f19bd7b6a028975ece96918e6f9a014c8c0..109ea1af49a888b13ef6285fd450859446be22bf 100644 (file)
 
 #include "../lib/bloom.h"
 #include "debug.h"
+#include "zlib.h"
+
+struct zlib_ctrl {
+       z_stream stream;
+       unsigned char *buf_in;
+       unsigned char *buf_out;
+};
 
 struct worker_thread {
+       struct zlib_ctrl zc;
        pthread_t thread;
-
-       volatile int done;
-
-       int fd;
        uint64_t cur_offset;
        uint64_t size;
-
+       unsigned long long unique_capacity;
        unsigned long items;
        unsigned long dupes;
        int err;
+       int fd;
+       volatile int done;
 };
 
 struct extent {
@@ -68,6 +74,7 @@ static unsigned int odirect;
 static unsigned int collision_check;
 static unsigned int print_progress = 1;
 static unsigned int use_bloom = 1;
+static unsigned int compression = 0;
 
 static uint64_t total_size;
 static uint64_t cur_offset;
@@ -87,8 +94,9 @@ static uint64_t get_size(struct fio_file *f, struct stat *sb)
                        return 0;
                }
                ret = bytes;
-       } else
+       } else {
                ret = sb->st_size;
+       }
 
        return (ret & ~((uint64_t)blocksize - 1));
 }
@@ -120,9 +128,9 @@ static int __read_block(int fd, void *buf, off_t offset, size_t count)
        if (ret < 0) {
                perror("pread");
                return 1;
-       } else if (!ret)
+       } else if (!ret) {
                return 1;
-       else if (ret != count) {
+       } else if (ret != count) {
                log_err("dedupe: short read on block\n");
                return 1;
        }
@@ -135,6 +143,34 @@ static int read_block(int fd, void *buf, off_t offset)
        return __read_block(fd, buf, offset, blocksize);
 }
 
+static void account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
+                                   struct zlib_ctrl *zc)
+{
+       z_stream *stream = &zc->stream;
+       unsigned int compressed_len;
+       int ret;
+
+       if (read_block(file.fd, zc->buf_in, offset))
+               return;
+
+       stream->next_in = zc->buf_in;
+       stream->avail_in = blocksize;
+       stream->avail_out = deflateBound(stream, blocksize);
+       stream->next_out = zc->buf_out;
+
+       ret = deflate(stream, Z_FINISH);
+       assert(ret != Z_STREAM_ERROR);
+       compressed_len = blocksize - stream->avail_out;
+
+       if (dump_output)
+               printf("offset 0x%lx compressed to %d blocksize %d ratio %.2f \n",
+                               (unsigned long) offset, compressed_len, blocksize,
+                               (float)compressed_len / (float)blocksize);
+
+       *unique_capacity += compressed_len;
+       deflateReset(stream);
+}
+
 static void add_item(struct chunk *c, struct item *i)
 {
        /*      
@@ -182,13 +218,15 @@ static struct chunk *alloc_chunk(void)
        if (collision_check || dump_output) {
                c = malloc(sizeof(struct chunk) + sizeof(struct flist_head));
                INIT_FLIST_HEAD(&c->extent_list[0]);
-       } else
+       } else {
                c = malloc(sizeof(struct chunk));
+       }
 
        return c;
 }
 
-static void insert_chunk(struct item *i)
+static void insert_chunk(struct item *i, uint64_t *unique_capacity,
+                        struct zlib_ctrl *zc)
 {
        struct fio_rb_node **p, *parent;
        struct chunk *c;
@@ -201,11 +239,11 @@ static void insert_chunk(struct item *i)
 
                c = rb_entry(parent, struct chunk, rb_node);
                diff = memcmp(i->hash, c->hash, sizeof(i->hash));
-               if (diff < 0)
+               if (diff < 0) {
                        p = &(*p)->rb_left;
-               else if (diff > 0)
+               } else if (diff > 0) {
                        p = &(*p)->rb_right;
-               else {
+               } else {
                        int ret;
 
                        if (!collision_check)
@@ -228,12 +266,15 @@ static void insert_chunk(struct item *i)
        memcpy(c->hash, i->hash, sizeof(i->hash));
        rb_link_node(&c->rb_node, parent, p);
        rb_insert_color(&c->rb_node, &rb_root);
+       if (compression)
+               account_unique_capacity(i->offset, unique_capacity, zc);
 add:
        add_item(c, i);
 }
 
 static void insert_chunks(struct item *items, unsigned int nitems,
-                         uint64_t *ndupes)
+                         uint64_t *ndupes, uint64_t *unique_capacity,
+                         struct zlib_ctrl *zc)
 {
        int i;
 
@@ -248,7 +289,7 @@ static void insert_chunks(struct item *items, unsigned int nitems,
                        r = bloom_set(bloom, items[i].hash, s);
                        *ndupes += r;
                } else
-                       insert_chunk(&items[i]);
+                       insert_chunk(&items[i], unique_capacity, zc);
        }
 
        fio_sem_up(rb_lock);
@@ -277,11 +318,13 @@ static int do_work(struct worker_thread *thread, void *buf)
        off_t offset;
        int nitems = 0;
        uint64_t ndupes = 0;
+       uint64_t unique_capacity = 0;
        struct item *items;
 
        offset = thread->cur_offset;
 
-       nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size));
+       nblocks = read_blocks(thread->fd, buf, offset,
+                               min(thread->size, (uint64_t) chunk_size));
        if (!nblocks)
                return 1;
 
@@ -296,20 +339,39 @@ static int do_work(struct worker_thread *thread, void *buf)
                nitems++;
        }
 
-       insert_chunks(items, nitems, &ndupes);
+       insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
 
        free(items);
        thread->items += nitems;
        thread->dupes += ndupes;
+       thread->unique_capacity += unique_capacity;
        return 0;
 }
 
+static void thread_init_zlib_control(struct worker_thread *thread)
+{
+       size_t sz;
+
+       z_stream *stream = &thread->zc.stream;
+       stream->zalloc = Z_NULL;
+       stream->zfree = Z_NULL;
+       stream->opaque = Z_NULL;
+
+       if (deflateInit(stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+               return;
+
+       thread->zc.buf_in = fio_memalign(blocksize, blocksize, false);
+       sz = deflateBound(stream, blocksize);
+       thread->zc.buf_out = fio_memalign(blocksize, sz, false);
+}
+
 static void *thread_fn(void *data)
 {
        struct worker_thread *thread = data;
        void *buf;
 
        buf = fio_memalign(blocksize, chunk_size, false);
+       thread_init_zlib_control(thread);
 
        do {
                if (get_work(&thread->cur_offset, &thread->size)) {
@@ -362,15 +424,17 @@ static void show_progress(struct worker_thread *threads, unsigned long total)
                        printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
                        last_nitems = nitems;
                        fio_gettime(&last_tv, NULL);
-               } else
+               } else {
                        printf("%3.2f%% done\r", perc);
+               }
                fflush(stdout);
                usleep(250000);
        };
 }
 
 static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
-                             uint64_t *nextents, uint64_t *nchunks)
+                             uint64_t *nextents, uint64_t *nchunks,
+                             uint64_t *unique_capacity)
 {
        struct worker_thread *threads;
        unsigned long nitems, total_items;
@@ -398,11 +462,13 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
        nitems = 0;
        *nextents = 0;
        *nchunks = 1;
+       *unique_capacity = 0;
        for (i = 0; i < num_threads; i++) {
                void *ret;
                pthread_join(threads[i].thread, &ret);
                nitems += threads[i].items;
                *nchunks += threads[i].dupes;
+               *unique_capacity += threads[i].unique_capacity;
        }
 
        printf("Threads(%u): %lu items processed\n", num_threads, nitems);
@@ -416,7 +482,7 @@ static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
 }
 
 static int dedupe_check(const char *filename, uint64_t *nextents,
-                       uint64_t *nchunks)
+                       uint64_t *nchunks, uint64_t *unique_capacity)
 {
        uint64_t dev_size;
        struct stat sb;
@@ -451,9 +517,11 @@ static int dedupe_check(const char *filename, uint64_t *nextents,
                bloom = bloom_new(bloom_entries);
        }
 
-       printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
+       printf("Will check <%s>, size <%llu>, using %u threads\n", filename,
+                               (unsigned long long) dev_size, num_threads);
 
-       return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+       return run_dedupe_threads(&file, dev_size, nextents, nchunks,
+                                       unique_capacity);
 err:
        if (file.fd != -1)
                close(file.fd);
@@ -466,36 +534,69 @@ static void show_chunk(struct chunk *c)
        struct flist_head *n;
        struct extent *e;
 
-       printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count);
+       printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1],
+                       c->hash[2], c->hash[3], (unsigned long) c->count);
        flist_for_each(n, &c->extent_list[0]) {
                e = flist_entry(n, struct extent, list);
                printf("\toffset %llu\n", (unsigned long long) e->offset);
        }
 }
 
-static void show_stat(uint64_t nextents, uint64_t nchunks)
+static const char *capacity_unit[] = {"b","KB", "MB", "GB", "TB", "PB", "EB"};
+
+static uint64_t bytes_to_human_readable_unit(uint64_t n, const char **unit_out)
+{
+       uint8_t i = 0;
+
+       while (n >= 1024) {
+               i++;
+               n /= 1024;
+       }
+
+       *unit_out = capacity_unit[i];
+       return n;
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents,
+                     uint64_t unique_capacity)
 {
        double perc, ratio;
+       const char *unit;
+       uint64_t uc_human;
 
-       printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+       printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents,
+                                               (unsigned long) nchunks);
+       if (!bloom)
+               printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
+       printf("\n");
 
        if (nchunks) {
                ratio = (double) nextents / (double) nchunks;
                printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
-       } else
+       } else {
                printf("De-dupe ratio: 1:infinite\n");
+       }
+
+       if (ndupextents) {
+               printf("De-dupe working set at least: %3.2f%%\n",
+                       100.0 * (double) ndupextents / (double) nextents);
+       }
 
        perc = 1.00 - ((double) nchunks / (double) nextents);
        perc *= 100.0;
        printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
 
+
+       if (compression) {
+               uc_human = bytes_to_human_readable_unit(unique_capacity, &unit);
+               printf("Unique capacity %lu%s\n", (unsigned long) uc_human, unit);
+       }
 }
 
-static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
 {
        struct fio_rb_node *n;
-
-       *nchunks = *nextents = 0;
+       *nchunks = *nextents = *ndupextents = 0;
 
        n = rb_first(&rb_root);
        if (!n)
@@ -507,6 +608,7 @@ static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
                c = rb_entry(n, struct chunk, rb_node);
                (*nchunks)++;
                *nextents += c->count;
+               *ndupextents += (c->count > 1);
 
                if (dump_output)
                        show_chunk(c);
@@ -525,18 +627,19 @@ static int usage(char *argv[])
        log_err("\t-c\tFull collision check\n");
        log_err("\t-B\tUse probabilistic bloom filter\n");
        log_err("\t-p\tPrint progress indicator\n");
+       log_err("\t-C\tCalculate compressible size\n");
        return 1;
 }
 
 int main(int argc, char *argv[])
 {
-       uint64_t nextents = 0, nchunks = 0;
+       uint64_t nextents = 0, nchunks = 0, ndupextents = 0, unique_capacity;
        int c, ret;
 
        arch_init(argv);
        debug_init();
 
-       while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+       while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:C:")) != -1) {
                switch (c) {
                case 'b':
                        blocksize = atoi(optarg);
@@ -559,13 +662,16 @@ int main(int argc, char *argv[])
                case 'B':
                        use_bloom = atoi(optarg);
                        break;
+               case 'C':
+                       compression = atoi(optarg);
+                       break;
                case '?':
                default:
                        return usage(argv);
                }
        }
 
-       if (collision_check || dump_output)
+       if (collision_check || dump_output || compression)
                use_bloom = 0;
 
        if (!num_threads)
@@ -579,13 +685,13 @@ int main(int argc, char *argv[])
        rb_root = RB_ROOT;
        rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
 
-       ret = dedupe_check(argv[optind], &nextents, &nchunks);
+       ret = dedupe_check(argv[optind], &nextents, &nchunks, &unique_capacity);
 
        if (!ret) {
                if (!bloom)
-                       iter_rb_tree(&nextents, &nchunks);
+                       iter_rb_tree(&nextents, &nchunks, &ndupextents);
 
-               show_stat(nextents, nchunks);
+               show_stat(nextents, nchunks, ndupextents, unique_capacity);
        }
 
        fio_sem_remove(rb_lock);
index ff4c7a7c01807ed46bd73dca4da85de0e606158d..a98f78fd4a768af2736edb5c090282947b99e4f1 100644 (file)
@@ -5,6 +5,11 @@
 #include <stddef.h>
 #include <signal.h>
 #include <inttypes.h>
+#include <math.h>
+
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
 
 #include <sys/types.h>
 #include <sys/stat.h>
 
 #include "../arch/arch.h"
 #include "../lib/types.h"
+#include "../lib/roundup.h"
+#include "../lib/rand.h"
+#include "../minmax.h"
 #include "../os/linux/io_uring.h"
 
-#define min(a, b)              ((a < b) ? (a) : (b))
-
 struct io_sq_ring {
        unsigned *head;
        unsigned *tail;
@@ -54,18 +60,28 @@ static unsigned sq_ring_mask, cq_ring_mask;
 
 struct file {
        unsigned long max_blocks;
+       unsigned long max_size;
+       unsigned long cur_off;
        unsigned pending_ios;
        int real_fd;
        int fixed_fd;
+       int fileno;
 };
 
+#define PLAT_BITS              6
+#define PLAT_VAL               (1 << PLAT_BITS)
+#define PLAT_GROUP_NR          29
+#define PLAT_NR                        (PLAT_GROUP_NR * PLAT_VAL)
+
 struct submitter {
        pthread_t thread;
        int ring_fd;
+       int index;
        struct io_sq_ring sq_ring;
        struct io_uring_sqe *sqes;
        struct io_cq_ring cq_ring;
        int inflight;
+       int tid;
        unsigned long reaps;
        unsigned long done;
        unsigned long calls;
@@ -73,6 +89,16 @@ struct submitter {
 
        __s32 *fds;
 
+       struct taus258_state rand_state;
+
+       unsigned long *clock_batch;
+       int clock_index;
+       unsigned long *plat;
+
+#ifdef CONFIG_LIBAIO
+       io_context_t aio_ctx;
+#endif
+
        struct file files[MAX_FDS];
        unsigned nr_files;
        unsigned cur_file;
@@ -81,6 +107,8 @@ struct submitter {
 
 static struct submitter *submitter;
 static volatile int finish;
+static int stats_running;
+static unsigned long max_iops;
 
 static int depth = DEPTH;
 static int batch_submit = BATCH_SUBMIT;
@@ -88,14 +116,246 @@ static int batch_complete = BATCH_COMPLETE;
 static int bs = BS;
 static int polled = 1;         /* use IO polling */
 static int fixedbufs = 1;      /* use fixed user buffers */
+static int dma_map;            /* pre-map DMA buffers */
 static int register_files = 1; /* use fixed files */
 static int buffered = 0;       /* use buffered IO, not O_DIRECT */
 static int sq_thread_poll = 0; /* use kernel submission/poller thread */
 static int sq_thread_cpu = -1; /* pin above thread to this CPU */
 static int do_nop = 0;         /* no-op SQ ring commands */
+static int nthreads = 1;
+static int stats = 0;          /* generate IO stats */
+static int aio = 0;            /* use libaio */
+static int runtime = 0;                /* runtime */
+static int random_io = 1;      /* random or sequential IO */
+
+static unsigned long tsc_rate;
+
+#define TSC_RATE_FILE  "tsc-rate"
 
 static int vectored = 1;
 
+static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
+                       80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
+static int plist_len = 17;
+
+#ifndef IORING_REGISTER_MAP_BUFFERS
+#define IORING_REGISTER_MAP_BUFFERS    20
+struct io_uring_map_buffers {
+       __s32   fd;
+       __u32   buf_start;
+       __u32   buf_end;
+       __u32   flags;
+       __u64   rsvd[2];
+};
+#endif
+
+static unsigned long cycles_to_nsec(unsigned long cycles)
+{
+       uint64_t val;
+
+       if (!tsc_rate)
+               return cycles;
+
+       val = cycles * 1000000000ULL;
+       return val / tsc_rate;
+}
+
+static unsigned long plat_idx_to_val(unsigned int idx)
+{
+       unsigned int error_bits;
+       unsigned long k, base;
+
+       assert(idx < PLAT_NR);
+
+       /* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+        * all bits of the sample as index */
+       if (idx < (PLAT_VAL << 1))
+               return cycles_to_nsec(idx);
+
+       /* Find the group and compute the minimum value of that group */
+       error_bits = (idx >> PLAT_BITS) - 1;
+       base = ((unsigned long) 1) << (error_bits + PLAT_BITS);
+
+       /* Find its bucket number of the group */
+       k = idx % PLAT_VAL;
+
+       /* Return the mean of the range of the bucket */
+       return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
+}
+
+unsigned int calc_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
+                                  unsigned long **output,
+                                  unsigned long *maxv, unsigned long *minv)
+{
+       unsigned long sum = 0;
+       unsigned int len = plist_len, i, j = 0;
+       unsigned long *ovals = NULL;
+       bool is_last;
+
+       *minv = -1UL;
+       *maxv = 0;
+
+       ovals = malloc(len * sizeof(*ovals));
+       if (!ovals)
+               return 0;
+
+       /*
+        * Calculate bucket values, note down max and min values
+        */
+       is_last = false;
+       for (i = 0; i < PLAT_NR && !is_last; i++) {
+               sum += io_u_plat[i];
+               while (sum >= ((long double) plist[j] / 100.0 * nr)) {
+                       assert(plist[j] <= 100.0);
+
+                       ovals[j] = plat_idx_to_val(i);
+                       if (ovals[j] < *minv)
+                               *minv = ovals[j];
+                       if (ovals[j] > *maxv)
+                               *maxv = ovals[j];
+
+                       is_last = (j == len - 1) != 0;
+                       if (is_last)
+                               break;
+
+                       j++;
+               }
+       }
+
+       if (!is_last)
+               fprintf(stderr, "error calculating latency percentiles\n");
+
+       *output = ovals;
+       return len;
+}
+
+static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
+                                 unsigned int precision)
+{
+       unsigned int divisor, len, i, j = 0;
+       unsigned long minv, maxv;
+       unsigned long *ovals;
+       int per_line, scale_down, time_width;
+       bool is_last;
+       char fmt[32];
+
+       len = calc_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
+       if (!len || !ovals)
+               goto out;
+
+       if (!tsc_rate) {
+               scale_down = 0;
+               divisor = 1;
+               printf("    percentiles (tsc ticks):\n     |");
+       } else if (minv > 2000 && maxv > 99999) {
+               scale_down = 1;
+               divisor = 1000;
+               printf("    percentiles (usec):\n     |");
+       } else {
+               scale_down = 0;
+               divisor = 1;
+               printf("    percentiles (nsec):\n     |");
+       }
+
+       time_width = max(5, (int) (log10(maxv / divisor) + 1));
+       snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
+                       precision, time_width);
+       /* fmt will be something like " %5.2fth=[%4llu]%c" */
+       per_line = (80 - 7) / (precision + 10 + time_width);
+
+       for (j = 0; j < len; j++) {
+               /* for formatting */
+               if (j != 0 && (j % per_line) == 0)
+                       printf("     |");
+
+               /* end of the list */
+               is_last = (j == len - 1) != 0;
+
+               for (i = 0; i < scale_down; i++)
+                       ovals[j] = (ovals[j] + 999) / 1000;
+
+               printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');
+
+               if (is_last)
+                       break;
+
+               if ((j % per_line) == per_line - 1)     /* for formatting */
+                       printf("\n");
+       }
+
+out:
+       free(ovals);
+}
+
+static unsigned int plat_val_to_idx(unsigned long val)
+{
+       unsigned int msb, error_bits, base, offset, idx;
+
+       /* Find MSB starting from bit 0 */
+       if (val == 0)
+               msb = 0;
+       else
+               msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
+
+       /*
+        * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+        * all bits of the sample as index
+        */
+       if (msb <= PLAT_BITS)
+               return val;
+
+       /* Compute the number of error bits to discard*/
+       error_bits = msb - PLAT_BITS;
+
+       /* Compute the number of buckets before the group */
+       base = (error_bits + 1) << PLAT_BITS;
+
+       /*
+        * Discard the error bits and apply the mask to find the
+        * index for the buckets in the group
+        */
+       offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+       /* Make sure the index does not exceed (array size - 1) */
+       idx = (base + offset) < (PLAT_NR - 1) ?
+               (base + offset) : (PLAT_NR - 1);
+
+       return idx;
+}
+
+static void add_stat(struct submitter *s, int clock_index, int nr)
+{
+#ifdef ARCH_HAVE_CPU_CLOCK
+       unsigned long cycles;
+       unsigned int pidx;
+
+       if (!s->finish && clock_index) {
+               cycles = get_cpu_clock();
+               cycles -= s->clock_batch[clock_index];
+               pidx = plat_val_to_idx(cycles);
+               s->plat[pidx] += nr;
+       }
+#endif
+}
+
+static int io_uring_map_buffers(struct submitter *s)
+{
+       struct io_uring_map_buffers map = {
+               .fd             = s->files[0].real_fd,
+               .buf_end        = depth,
+       };
+
+       if (do_nop)
+               return 0;
+       if (s->nr_files > 1) {
+               fprintf(stderr, "Can't map buffers with multiple files\n");
+               return -1;
+       }
+
+       return syscall(__NR_io_uring_register, s->ring_fd,
+                       IORING_REGISTER_MAP_BUFFERS, &map, 1);
+}
+
 static int io_uring_register_buffers(struct submitter *s)
 {
        if (do_nop)
@@ -124,6 +384,13 @@ static int io_uring_register_files(struct submitter *s)
 
 static int io_uring_setup(unsigned entries, struct io_uring_params *p)
 {
+       /*
+        * Clamp CQ ring size at our SQ ring size, we don't need more entries
+        * than that.
+        */
+       p->flags |= IORING_SETUP_CQSIZE;
+       p->cq_entries = entries;
+
        return syscall(__NR_io_uring_setup, entries, p);
 }
 
@@ -194,8 +461,15 @@ static void init_io(struct submitter *s, unsigned index)
        }
        f->pending_ios++;
 
-       r = lrand48();
-       offset = (r % (f->max_blocks - 1)) * bs;
+       if (random_io) {
+               r = __rand64(&s->rand_state);
+               offset = (r % (f->max_blocks - 1)) * bs;
+       } else {
+               offset = f->cur_off;
+               f->cur_off += bs;
+               if (f->cur_off + bs > f->max_size)
+                       f->cur_off = 0;
+       }
 
        if (register_files) {
                sqe->flags = IOSQE_FIXED_FILE;
@@ -222,10 +496,12 @@ static void init_io(struct submitter *s, unsigned index)
        }
        sqe->ioprio = 0;
        sqe->off = offset;
-       sqe->user_data = (unsigned long) f;
+       sqe->user_data = (unsigned long) f->fileno;
+       if (stats && stats_running)
+               sqe->user_data |= ((uint64_t)s->clock_index << 32);
 }
 
-static int prep_more_ios(struct submitter *s, int max_ios)
+static int prep_more_ios_uring(struct submitter *s, int max_ios)
 {
        struct io_sq_ring *ring = &s->sq_ring;
        unsigned index, tail, next_tail, prepped = 0;
@@ -261,20 +537,23 @@ static int get_file_size(struct file *f)
                        return -1;
 
                f->max_blocks = bytes / bs;
+               f->max_size = bytes;
                return 0;
        } else if (S_ISREG(st.st_mode)) {
                f->max_blocks = st.st_size / bs;
+               f->max_size = st.st_size;
                return 0;
        }
 
        return -1;
 }
 
-static int reap_events(struct submitter *s)
+static int reap_events_uring(struct submitter *s)
 {
        struct io_cq_ring *ring = &s->cq_ring;
        struct io_uring_cqe *cqe;
        unsigned head, reaped = 0;
+       int last_idx = -1, stat_nr = 0;
 
        head = *ring->head;
        do {
@@ -285,7 +564,9 @@ static int reap_events(struct submitter *s)
                        break;
                cqe = &ring->cqes[head & cq_ring_mask];
                if (!do_nop) {
-                       f = (struct file *) (uintptr_t) cqe->user_data;
+                       int fileno = cqe->user_data & 0xffffffff;
+
+                       f = &s->files[fileno];
                        f->pending_ios--;
                        if (cqe->res != bs) {
                                printf("io: unexpected ret=%d\n", cqe->res);
@@ -294,10 +575,25 @@ static int reap_events(struct submitter *s)
                                return -1;
                        }
                }
+               if (stats) {
+                       int clock_index = cqe->user_data >> 32;
+
+                       if (last_idx != clock_index) {
+                               if (last_idx != -1) {
+                                       add_stat(s, last_idx, stat_nr);
+                                       stat_nr = 0;
+                               }
+                               last_idx = clock_index;
+                       }
+                       stat_nr++;
+               }
                reaped++;
                head++;
        } while (1);
 
+       if (stat_nr)
+               add_stat(s, last_idx, stat_nr);
+
        if (reaped) {
                s->inflight -= reaped;
                atomic_store_release(ring->head, head);
@@ -305,16 +601,197 @@ static int reap_events(struct submitter *s)
        return reaped;
 }
 
-static void *submitter_fn(void *data)
+/*
+ * Per-thread setup shared by the io_uring and aio submitter threads.
+ *
+ * Records the thread id in s->tid, seeds both random generators from
+ * pthread_self(), and numbers every s->files[] slot with its own index so
+ * the index can be carried in the per-IO user data.
+ *
+ * When 'stats' is set, allocates s->clock_batch (nr_batch entries) and
+ * s->plat (PLAT_NR entries); otherwise both are set to NULL.
+ *
+ * Returns nr_batch, the number of clock_batch slots, or 0 when stats are
+ * disabled. nr_batch is rounded up to a power of two (minimum 2) because
+ * the submitter loops wrap clock_index with '& (nr_batch - 1)'.
+ * NOTE(review): the calloc() results are not checked here.
+ */
+static int submitter_init(struct submitter *s)
 {
-       struct submitter *s = data;
-       struct io_sq_ring *ring = &s->sq_ring;
-       int ret, prepped;
+       int i, nr_batch;
 
-       printf("submitter=%d\n", gettid());
+       s->tid = gettid();
+       printf("submitter=%d, tid=%d\n", s->index, s->tid);
 
+       __init_rand64(&s->rand_state, pthread_self());
        srand48(pthread_self());
 
+       /* fileno is the slot index, not the OS file descriptor */
+       for (i = 0; i < MAX_FDS; i++)
+               s->files[i].fileno = i;
+
+       if (stats) {
+               nr_batch = roundup_pow2(depth / batch_submit);
+               if (nr_batch < 2)
+                       nr_batch = 2;
+               s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
+               s->clock_index = 1;
+
+               s->plat = calloc(PLAT_NR, sizeof(unsigned long));
+       } else {
+               s->clock_batch = NULL;
+               s->plat = NULL;
+               nr_batch = 0;
+       }
+
+       return nr_batch;
+}
+
+#ifdef CONFIG_LIBAIO
+/*
+ * Prepare up to max_ios random pread iocbs for the legacy aio engine.
+ *
+ * iocbs[i] is filled from s->iovecs[i]. The target file is s->files[0] when
+ * there is a single file; otherwise the current file is used and the cursor
+ * advances to the next file once the current one has file_depth(s) IOs
+ * pending. Each iocb's data field carries the file index in the low 32 bits
+ * and, while stats are running, s->clock_index in the high 32 bits.
+ *
+ * Returns the number of iocbs prepared (0 when max_ios <= 0).
+ */
+static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
+{
+       unsigned long offset, data;
+       struct file *f;
+       int index;
+       long r;
+
+       /* signed index: a negative max_ios must not wrap to a huge count */
+       for (index = 0; index < max_ios; index++) {
+               struct iocb *iocb = &iocbs[index];
+
+               if (s->nr_files == 1) {
+                       f = &s->files[0];
+               } else {
+                       f = &s->files[s->cur_file];
+                       if (f->pending_ios >= file_depth(s)) {
+                               s->cur_file++;
+                               if (s->cur_file == s->nr_files)
+                                       s->cur_file = 0;
+                               f = &s->files[s->cur_file];
+                       }
+               }
+               f->pending_ios++;
+
+               r = lrand48();
+               /*
+                * max_blocks can be 1 for a two-block file; the modulus
+                * would then be zero, so fall back to the first block.
+                */
+               if (f->max_blocks > 1)
+                       offset = (r % (f->max_blocks - 1)) * bs;
+               else
+                       offset = 0;
+               io_prep_pread(iocb, f->real_fd, s->iovecs[index].iov_base,
+                               s->iovecs[index].iov_len, offset);
+
+               data = f->fileno;
+               if (stats && stats_running)
+                       data |= ((unsigned long) s->clock_index << 32);
+               iocb->data = (void *) (uintptr_t) data;
+       }
+       return index;
+}
+
+/*
+ * Account for 'evs' completed aio events.
+ *
+ * For every event the owning file's pending_ios is decremented and the
+ * result is checked against the block size; a mismatch is reported and -1
+ * is returned at once. With stats enabled, consecutive events that share a
+ * clock index are handed to add_stat() as one batch.
+ *
+ * On success s->inflight and s->done are adjusted and the number of events
+ * reaped is returned.
+ */
+static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
+{
+       int prev_clock = -1, batched = 0;
+       int nr;
+
+       for (nr = 0; nr != evs; nr++) {
+               struct io_event *ev = &events[nr];
+               unsigned long tag = (uintptr_t) ev->data;
+
+               /* low 32 bits of the tag select the file slot */
+               s->files[tag & 0xffffffff].pending_ios--;
+               if (ev->res != bs) {
+                       printf("io: unexpected ret=%ld\n", ev->res);
+                       return -1;
+               }
+               if (stats) {
+                       /* high 32 bits of the tag hold the clock index */
+                       int cur_clock = tag >> 32;
+
+                       /* clock index changed: flush the previous batch */
+                       if (cur_clock != prev_clock && prev_clock != -1) {
+                               add_stat(s, prev_clock, batched);
+                               batched = 0;
+                       }
+                       prev_clock = cur_clock;
+                       batched++;
+               }
+       }
+
+       if (batched)
+               add_stat(s, prev_clock, batched);
+
+       s->inflight -= nr;
+       s->done += nr;
+       return nr;
+}
+
+/*
+ * Thread entry point for the legacy aio engine.
+ *
+ * 'data' is the struct submitter for this thread. The loop prepares up to
+ * batch_submit iocbs, submits them with io_submit(), and waits for
+ * completions with io_getevents() once the queue cannot take another batch.
+ * It runs until s->finish is set or submission fails.
+ *
+ * The global 'finish' flag is set on exit. Always returns NULL.
+ */
+static void *submitter_aio_fn(void *data)
+{
+       struct submitter *s = data;
+       int i, ret, prepped, nr_batch;
+       struct iocb **iocbsptr;
+       struct iocb *iocbs;
+       struct io_event *events;
+
+       nr_batch = submitter_init(s);
+
+       iocbsptr = calloc(depth, sizeof(struct iocb *));
+       iocbs = calloc(depth, sizeof(struct iocb));
+       events = calloc(depth, sizeof(struct io_event));
+       if (!iocbsptr || !iocbs || !events) {
+               printf("failed alloc\n");
+               goto out;
+       }
+
+       /* io_submit() takes an array of pointers; it never changes */
+       for (i = 0; i < depth; i++)
+               iocbsptr[i] = &iocbs[i];
+
+       prepped = 0;
+       do {
+               int to_wait, to_submit, to_prep;
+
+               if (!prepped && s->inflight < depth) {
+                       to_prep = min(depth - s->inflight, batch_submit);
+                       prepped = prep_more_ios_aio(s, to_prep, iocbs);
+#ifdef ARCH_HAVE_CPU_CLOCK
+                       if (prepped && stats) {
+                               s->clock_batch[s->clock_index] = get_cpu_clock();
+                               s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+                       }
+#endif
+               }
+               s->inflight += prepped;
+               to_submit = prepped;
+
+               /* only wait when another batch would not fit in the queue */
+               if (to_submit && (s->inflight + to_submit <= depth))
+                       to_wait = 0;
+               else
+                       to_wait = min(s->inflight + to_submit, batch_complete);
+
+               ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
+               s->calls++;
+               if (ret < 0) {
+                       perror("io_submit");
+                       break;
+               } else if (ret != to_submit) {
+                       printf("submitted %d, wanted %d\n", ret, to_submit);
+                       break;
+               }
+               prepped = 0;
+
+               while (to_wait) {
+                       int r;
+
+                       s->calls++;
+                       r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
+                       if (r < 0) {
+                               perror("io_getevents");
+                               break;
+                       } else if (r != to_wait) {
+                               printf("r=%d, wait=%d\n", r, to_wait);
+                               break;
+                       }
+                       r = reap_events_aio(s, events, r);
+                       if (r < 0) {
+                               /*
+                                * IO error: stop this submitter instead of
+                                * folding -1 into the reap/wait counters.
+                                */
+                               s->finish = 1;
+                               break;
+                       }
+                       s->reaps += r;
+                       to_wait -= r;
+               }
+       } while (!s->finish);
+
+out:
+       free(iocbsptr);
+       free(iocbs);
+       free(events);
+       finish = 1;
+       return NULL;
+}
+#endif
+
+static void *submitter_uring_fn(void *data)
+{
+       struct submitter *s = data;
+       struct io_sq_ring *ring = &s->sq_ring;
+       int ret, prepped, nr_batch;
+
+       nr_batch = submitter_init(s);
+
        prepped = 0;
        do {
                int to_wait, to_submit, this_reap, to_prep;
@@ -322,7 +799,13 @@ static void *submitter_fn(void *data)
 
                if (!prepped && s->inflight < depth) {
                        to_prep = min(depth - s->inflight, batch_submit);
-                       prepped = prep_more_ios(s, to_prep);
+                       prepped = prep_more_ios_uring(s, to_prep);
+#ifdef ARCH_HAVE_CPU_CLOCK
+                       if (prepped && stats) {
+                               s->clock_batch[s->clock_index] = get_cpu_clock();
+                               s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+                       }
+#endif
                }
                s->inflight += prepped;
 submit_more:
@@ -361,7 +844,8 @@ submit:
                this_reap = 0;
                do {
                        int r;
-                       r = reap_events(s);
+
+                       r = reap_events_uring(s);
                        if (r == -1) {
                                s->finish = 1;
                                break;
@@ -404,13 +888,36 @@ submit:
        return NULL;
 }
 
-static void sig_int(int sig)
+/*
+ * Return the submitter at position 'offset' in the global 'submitter' array.
+ *
+ * Each element is a struct submitter followed by 'depth' struct iovec
+ * entries, so the array cannot be indexed with plain submitter[offset].
+ */
+static struct submitter *get_submitter(int offset)
+{
+       size_t stride = sizeof(*submitter) + depth * sizeof(struct iovec);
+
+       /* step in bytes through char *: void * arithmetic is a GNU extension */
+       return (struct submitter *) ((char *) submitter + offset * stride);
+}
+
+/*
+ * Ask every submitter thread to stop and report the peak IOPS seen.
+ *
+ * 'reason' is printed as "Exiting on <reason>". Sets s->finish on all
+ * 'nthreads' submitters and then the global 'finish' flag. max_iops is
+ * printed in thousands once it exceeds 100000, and not at all when zero.
+ *
+ * NOTE(review): this is also reached from the sig_int() signal handler,
+ * and printf() is not async-signal-safe - confirm this is acceptable.
+ */
+static void do_finish(const char *reason)
 {
-       printf("Exiting on signal %d\n", sig);
-       submitter->finish = 1;
+       int j;
+       printf("Exiting on %s\n", reason);
+       for (j = 0; j < nthreads; j++) {
+               struct submitter *s = get_submitter(j);
+               s->finish = 1;
+       }
+       if (max_iops > 100000)
+               printf("Maximum IOPS=%luK\n", max_iops / 1000);
+       else if (max_iops)
+               printf("Maximum IOPS=%lu\n", max_iops);
        finish = 1;
 }
 
+/* Signal handler: the signal number is ignored, shutdown is delegated to do_finish(). */
+static void sig_int(int sig)
+{
+       do_finish("signal");
+}
+
 static void arm_sig_int(void)
 {
        struct sigaction act;
@@ -419,6 +926,39 @@ static void arm_sig_int(void)
        act.sa_handler = sig_int;
        act.sa_flags = SA_RESTART;
        sigaction(SIGINT, &act, NULL);
+
+       /* Windows uses SIGBREAK as a quit signal from other applications */
+#ifdef WIN32
+       sigaction(SIGBREAK, &act, NULL);
+#endif
+}
+
+/*
+ * Prepare submitter 's' for the legacy aio engine.
+ *
+ * Options that aio cannot honour (polled IO, SQPOLL, no-op requests,
+ * registered files/buffers) are reported on stderr and switched off.
+ *
+ * Returns the result of io_queue_init(). When built without CONFIG_LIBAIO
+ * it sets errno to EINVAL and returns -1.
+ */
+static int setup_aio(struct submitter *s)
+{
+#ifdef CONFIG_LIBAIO
+       if (polled) {
+               fprintf(stderr, "aio does not support polled IO\n");
+               polled = 0;
+       }
+       if (sq_thread_poll) {
+               fprintf(stderr, "aio does not support SQPOLL IO\n");
+               sq_thread_poll = 0;
+       }
+       if (do_nop) {
+               /* message used to repeat the "polled IO" text */
+               fprintf(stderr, "aio does not support no-op requests\n");
+               do_nop = 0;
+       }
+       if (fixedbufs || register_files) {
+               fprintf(stderr, "aio does not support registered files or buffers\n");
+               fixedbufs = register_files = 0;
+       }
+
+       return io_queue_init(depth, &s->aio_ctx);
+#else
+       fprintf(stderr, "Legacy AIO not available on this system/build\n");
+       errno = EINVAL;
+       return -1;
+#endif
 }
 
 static int setup_ring(struct submitter *s)
@@ -451,11 +991,26 @@ static int setup_ring(struct submitter *s)
        io_uring_probe(fd);
 
        if (fixedbufs) {
+               struct rlimit rlim;
+
+               rlim.rlim_cur = RLIM_INFINITY;
+               rlim.rlim_max = RLIM_INFINITY;
+               /* ignore potential error, not needed on newer kernels */
+               setrlimit(RLIMIT_MEMLOCK, &rlim);
+
                ret = io_uring_register_buffers(s);
                if (ret < 0) {
                        perror("io_uring_register_buffers");
                        return 1;
                }
+
+               if (dma_map) {
+                       ret = io_uring_map_buffers(s);
+                       if (ret < 0) {
+                               perror("io_uring_map_buffers");
+                               return 1;
+                       }
+               }
        }
 
        if (register_files) {
@@ -469,7 +1024,6 @@ static int setup_ring(struct submitter *s)
        ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_SQ_RING);
-       printf("sq_ring ptr = 0x%p\n", ptr);
        sring->head = ptr + p.sq_off.head;
        sring->tail = ptr + p.sq_off.tail;
        sring->ring_mask = ptr + p.sq_off.ring_mask;
@@ -481,12 +1035,10 @@ static int setup_ring(struct submitter *s)
        s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_SQES);
-       printf("sqes ptr    = 0x%p\n", s->sqes);
 
        ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
                        PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
                        IORING_OFF_CQ_RING);
-       printf("cq_ring ptr = 0x%p\n", ptr);
        cring->head = ptr + p.cq_off.head;
        cring->tail = ptr + p.cq_off.tail;
        cring->ring_mask = ptr + p.cq_off.ring_mask;
@@ -498,57 +1050,127 @@ static int setup_ring(struct submitter *s)
 
+/*
+ * Format the pending IO count of every file of every submitter into 'buf'
+ * as a space-separated list, e.g. "3 0 7". 'buf' becomes an empty string
+ * when no submitter has files.
+ *
+ * NOTE(review): output is written with unbounded sprintf(); the caller must
+ * size 'buf' for all files of all threads.
+ */
 static void file_depths(char *buf)
 {
-       struct submitter *s = submitter;
+       bool prev = false;
        char *p;
-       int i;
+       int i, j;
 
        buf[0] = '\0';
        p = buf;
-       for (i = 0; i < s->nr_files; i++) {
-               struct file *f = &s->files[i];
+       for (j = 0; j < nthreads; j++) {
+               struct submitter *s = get_submitter(j);
 
-               if (i + 1 == s->nr_files)
-                       p += sprintf(p, "%d", f->pending_ios);
-               else
-                       p += sprintf(p, "%d, ", f->pending_ios);
+               for (i = 0; i < s->nr_files; i++) {
+                       struct file *f = &s->files[i];
+
+                       /* 'prev' adds the separator before all but the first entry */
+                       if (prev)
+                               p += sprintf(p, " %d", f->pending_ios);
+                       else
+                               p += sprintf(p, "%d", f->pending_ios);
+                       prev = true;
+               }
        }
 }
 
-static void usage(char *argv)
+/*
+ * Print the option summary with the current defaults and exit.
+ *
+ * 'argv' is the program name shown in the first line; 'status' is the
+ * process exit code. A runtime of 0 is shown as "unlimited".
+ */
+static void usage(char *argv, int status)
 {
+       char runtime_str[16];
+       snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
        printf("%s [options] -- [filenames]\n"
-               " -d <int> : IO Depth, default %d\n"
-               " -s <int> : Batch submit, default %d\n"
-               " -c <int> : Batch complete, default %d\n"
-               " -b <int> : Block size, default %d\n"
-               " -p <bool> : Polled IO, default %d\n",
-               argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled);
-       exit(0);
+               " -d <int>  : IO Depth, default %d\n"
+               " -s <int>  : Batch submit, default %d\n"
+               " -c <int>  : Batch complete, default %d\n"
+               " -b <int>  : Block size, default %d\n"
+               " -p <bool> : Polled IO, default %d\n"
+               " -B <bool> : Fixed buffers, default %d\n"
+               " -D <bool> : DMA map fixed buffers, default %d\n"
+               " -F <bool> : Register files, default %d\n"
+               " -n <int>  : Number of threads, default %d\n"
+               " -O <bool> : Use O_DIRECT, default %d\n"
+               " -N <bool> : Perform just no-op requests, default %d\n"
+               " -t <bool> : Track IO latencies, default %d\n"
+               " -T <int>  : TSC rate in HZ\n"
+               " -r <int>  : Runtime in seconds, default %s\n"
+               " -R <bool> : Use random IO, default %d\n"
+               " -a <bool> : Use legacy aio, default %d\n",
+               argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
+               fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
+               /* argument order must follow the -R / -a lines above */
+               stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio);
+       exit(status);
+}
+
+/*
+ * Load a previously saved TSC rate from TSC_RATE_FILE into 'tsc_rate'.
+ *
+ * Does nothing when tsc_rate is already set, or when the file cannot be
+ * opened or read. On success the rate is also printed.
+ */
+static void read_tsc_rate(void)
+{
+       char buffer[32];
+       int fd, ret;
+
+       if (tsc_rate)
+               return;
+
+       fd = open(TSC_RATE_FILE, O_RDONLY);
+       if (fd < 0)
+               return;
+
+       /* read() does not terminate the data; keep one byte for the NUL */
+       ret = read(fd, buffer, sizeof(buffer) - 1);
+       close(fd);
+       if (ret < 0)
+               return;
+       buffer[ret] = '\0';
+
+       tsc_rate = strtoul(buffer, NULL, 10);
+       printf("Using TSC rate %luHz\n", tsc_rate);
+}
+
+/*
+ * Save 'tsc_rate' as decimal text in TSC_RATE_FILE.
+ *
+ * An existing file is never touched. Failure to create the file is silent;
+ * a failed write is reported through perror().
+ */
+static void write_tsc_rate(void)
+{
+       struct stat sb;
+       char text[32];
+       int fd;
+
+       /* stat() succeeding means the file is already there */
+       if (stat(TSC_RATE_FILE, &sb) == 0)
+               return;
+
+       fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
+       if (fd < 0)
+               return;
+
+       memset(text, 0, sizeof(text));
+       sprintf(text, "%lu", tsc_rate);
+       if (write(fd, text, strlen(text)) < 0)
+               perror("write");
+       close(fd);
 }
 
 int main(int argc, char *argv[])
 {
        struct submitter *s;
        unsigned long done, calls, reap;
-       int err, i, flags, fd, opt;
+       int err, i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
+       struct file f;
        char *fdepths;
        void *ret;
 
-       if (!do_nop && argc < 2) {
-               printf("%s: filename [options]\n", argv[0]);
-               return 1;
-       }
+       if (!do_nop && argc < 2)
+               usage(argv[0], 1);
 
-       while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:h?")) != -1) {
+       while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:h?")) != -1) {
                switch (opt) {
+               case 'a':
+                       aio = !!atoi(optarg);
+                       break;
                case 'd':
                        depth = atoi(optarg);
                        break;
                case 's':
                        batch_submit = atoi(optarg);
+                       if (!batch_submit)
+                               batch_submit = 1;
                        break;
                case 'c':
                        batch_complete = atoi(optarg);
+                       if (!batch_complete)
+                               batch_complete = 1;
                        break;
                case 'b':
                        bs = atoi(optarg);
@@ -562,115 +1184,244 @@ int main(int argc, char *argv[])
                case 'F':
                        register_files = !!atoi(optarg);
                        break;
+               case 'n':
+                       nthreads = atoi(optarg);
+                       if (!nthreads) {
+                               printf("Threads must be non-zero\n");
+                               usage(argv[0], 1);
+                       }
+                       break;
+               case 'N':
+                       do_nop = !!atoi(optarg);
+                       break;
+               case 'O':
+                       buffered = !atoi(optarg);
+                       break;
+               case 't':
+#ifndef ARCH_HAVE_CPU_CLOCK
+                       fprintf(stderr, "Stats not supported on this CPU\n");
+                       return 1;
+#endif
+                       stats = !!atoi(optarg);
+                       break;
+               case 'T':
+#ifndef ARCH_HAVE_CPU_CLOCK
+                       fprintf(stderr, "Stats not supported on this CPU\n");
+                       return 1;
+#endif
+                       tsc_rate = strtoul(optarg, NULL, 10);
+                       write_tsc_rate();
+                       break;
+               case 'r':
+                       runtime = atoi(optarg);
+                       break;
+               case 'D':
+                       dma_map = !!atoi(optarg);
+                       break;
+               case 'R':
+                       random_io = !!atoi(optarg);
+                       break;
                case 'h':
                case '?':
                default:
-                       usage(argv[0]);
+                       usage(argv[0], 0);
                        break;
                }
        }
 
-       submitter = malloc(sizeof(*submitter) + depth * sizeof(struct iovec));
-       memset(submitter, 0, sizeof(*submitter) + depth * sizeof(struct iovec));
-       s = submitter;
+       if (stats)
+               read_tsc_rate();
+
+       if (batch_complete > depth)
+               batch_complete = depth;
+       if (batch_submit > depth)
+               batch_submit = depth;
+       if (!fixedbufs && dma_map)
+               dma_map = 0;
+
+       submitter = calloc(nthreads, sizeof(*submitter) +
+                               depth * sizeof(struct iovec));
+       for (j = 0; j < nthreads; j++) {
+               s = get_submitter(j);
+               s->index = j;
+               s->done = s->calls = s->reaps = 0;
+       }
 
        flags = O_RDONLY | O_NOATIME;
        if (!buffered)
                flags |= O_DIRECT;
 
+       j = 0;
        i = optind;
+       nfiles = argc - i;
+       if (!do_nop) {
+               if (!nfiles) {
+                       printf("No files specified\n");
+                       usage(argv[0], 1);
+               }
+               threads_per_f = nthreads / nfiles;
+               /* make sure each thread gets assigned files */
+               if (threads_per_f == 0) {
+                       threads_per_f = 1;
+               } else {
+                       threads_rem = nthreads - threads_per_f * nfiles;
+               }
+       }
        while (!do_nop && i < argc) {
-               struct file *f;
+               int k, limit;
+
+               memset(&f, 0, sizeof(f));
 
-               if (s->nr_files == MAX_FDS) {
-                       printf("Max number of files (%d) reached\n", MAX_FDS);
-                       break;
-               }
                fd = open(argv[i], flags);
                if (fd < 0) {
                        perror("open");
                        return 1;
                }
-
-               f = &s->files[s->nr_files];
-               f->real_fd = fd;
-               if (get_file_size(f)) {
+               f.real_fd = fd;
+               if (get_file_size(&f)) {
                        printf("failed getting size of device/file\n");
                        return 1;
                }
-               if (f->max_blocks <= 1) {
+               if (f.max_blocks <= 1) {
                        printf("Zero file/device size?\n");
                        return 1;
                }
-               f->max_blocks--;
+               f.max_blocks--;
 
-               printf("Added file %s\n", argv[i]);
-               s->nr_files++;
-               i++;
-       }
+               limit = threads_per_f;
+               limit += threads_rem > 0 ? 1 : 0;
+               for (k = 0; k < limit; k++) {
+                       s = get_submitter((j + k) % nthreads);
 
-       if (fixedbufs) {
-               struct rlimit rlim;
+                       if (s->nr_files == MAX_FDS) {
+                               printf("Max number of files (%d) reached\n", MAX_FDS);
+                               break;
+                       }
 
-               rlim.rlim_cur = RLIM_INFINITY;
-               rlim.rlim_max = RLIM_INFINITY;
-               if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
-                       perror("setrlimit");
-                       return 1;
+                       memcpy(&s->files[s->nr_files], &f, sizeof(f));
+
+                       printf("Added file %s (submitter %d)\n", argv[i], s->index);
+                       s->nr_files++;
                }
+               threads_rem--;
+               i++;
+               j += limit;
        }
 
        arm_sig_int();
 
-       for (i = 0; i < depth; i++) {
-               void *buf;
+       for (j = 0; j < nthreads; j++) {
+               s = get_submitter(j);
+               for (i = 0; i < depth; i++) {
+                       void *buf;
 
-               if (posix_memalign(&buf, bs, bs)) {
-                       printf("failed alloc\n");
-                       return 1;
+                       if (posix_memalign(&buf, bs, bs)) {
+                               printf("failed alloc\n");
+                               return 1;
+                       }
+                       s->iovecs[i].iov_base = buf;
+                       s->iovecs[i].iov_len = bs;
                }
-               s->iovecs[i].iov_base = buf;
-               s->iovecs[i].iov_len = bs;
        }
 
-       err = setup_ring(s);
-       if (err) {
-               printf("ring setup failed: %s, %d\n", strerror(errno), err);
-               return 1;
-       }
-       printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d", polled, fixedbufs, register_files, buffered);
-       printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+       for (j = 0; j < nthreads; j++) {
+               s = get_submitter(j);
 
-       pthread_create(&s->thread, NULL, submitter_fn, s);
+               if (!aio)
+                       err = setup_ring(s);
+               else
+                       err = setup_aio(s);
+               if (err) {
+                       printf("ring setup failed: %s, %d\n", strerror(errno), err);
+                       return 1;
+               }
+       }
+       s = get_submitter(0);
+       printf("polled=%d, fixedbufs=%d/%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, dma_map, register_files, buffered, depth);
+       if (!aio)
+               printf("Engine=io_uring, sq_ring=%d, cq_ring=%d\n", *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+       else
+               printf("Engine=aio\n");
+
+       for (j = 0; j < nthreads; j++) {
+               s = get_submitter(j);
+               if (!aio)
+                       pthread_create(&s->thread, NULL, submitter_uring_fn, s);
+#ifdef CONFIG_LIBAIO
+               else
+                       pthread_create(&s->thread, NULL, submitter_aio_fn, s);
+#endif
+       }
 
-       fdepths = malloc(8 * s->nr_files);
+       fdepths = malloc(8 * s->nr_files * nthreads);
        reap = calls = done = 0;
        do {
                unsigned long this_done = 0;
                unsigned long this_reap = 0;
                unsigned long this_call = 0;
                unsigned long rpc = 0, ipc = 0;
+               unsigned long iops, bw;
 
                sleep(1);
-               this_done += s->done;
-               this_call += s->calls;
-               this_reap += s->reaps;
+               if (runtime && !--runtime)
+                       do_finish("timeout");
+
+               /* don't print partial run, if interrupted by signal */
+               if (finish)
+                       break;
+
+               /* one second in to the run, enable stats */
+               if (stats)
+                       stats_running = 1;
+
+               for (j = 0; j < nthreads; j++) {
+                       s = get_submitter(j);
+                       this_done += s->done;
+                       this_call += s->calls;
+                       this_reap += s->reaps;
+               }
                if (this_call - calls) {
                        rpc = (this_done - done) / (this_call - calls);
                        ipc = (this_reap - reap) / (this_call - calls);
                } else
                        rpc = ipc = -1;
                file_depths(fdepths);
-               printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
-                               this_done - done, rpc, ipc, s->inflight,
-                               fdepths);
+               iops = this_done - done;
+               if (bs > 1048576)
+                       bw = iops * (bs / 1048576);
+               else
+                       bw = iops / (1048576 / bs);
+               if (iops > 100000)
+                       printf("IOPS=%luK, ", iops / 1000);
+               else
+                       printf("IOPS=%lu, ", iops);
+               max_iops = max(max_iops, iops);
+               if (!do_nop)
+                       printf("BW=%luMiB/s, ", bw);
+               printf("IOS/call=%ld/%ld, inflight=(%s)\n", rpc, ipc, fdepths);
                done = this_done;
                calls = this_call;
                reap = this_reap;
        } while (!finish);
 
-       pthread_join(s->thread, &ret);
-       close(s->ring_fd);
+       for (j = 0; j < nthreads; j++) {
+               s = get_submitter(j);
+               pthread_join(s->thread, &ret);
+               close(s->ring_fd);
+
+               if (stats) {
+                       unsigned long nr;
+
+                       printf("%d: Latency percentiles:\n", s->tid);
+                       for (i = 0, nr = 0; i < PLAT_NR; i++)
+                               nr += s->plat[i];
+                       show_clat_percentiles(s->plat, nr, 4);
+                       free(s->clock_batch);
+                       free(s->plat);
+               }
+       }
+
        free(fdepths);
+       free(submitter);
        return 0;
 }
diff --git a/t/one-core-peak.sh b/t/one-core-peak.sh
new file mode 100755 (executable)
index 0000000..9da8304
--- /dev/null
@@ -0,0 +1,287 @@
+#!/bin/bash
+
+# All command-line arguments, joined into one string.
+args=$*
+# Space-separated logical CPU ids found by detect_first_core.
+first_cores=""
+# Same CPU ids, comma-separated (built by detect_first_core).
+taskset_cores=""
+# Number of logical CPUs recorded in first_cores.
+first_cores_count=0
+# Thread count; raised by compute_nb_threads when needed.
+nb_threads=1
+# Block devices to test, set by check_args.
+drives=""
+
+# Default options
+# Set to "1" by the -l option (latency reporting).
+latency_cmdline=""
+
+# Print an error message and terminate with exit status 1.
+# The message goes to stderr so it stays visible when the caller runs inside
+# a command substitution, where stdout would be captured into a variable.
+fatal() {
+  echo "$@" >&2
+  exit 1
+}
+
+# Print a non-fatal warning built from all arguments.
+hint() {
+  printf '%s\n' "Warning: $*"
+}
+
+# Print "<item>: <message>", where item is the first argument and the
+# message is every remaining argument.
+info() {
+  item=$1
+  shift
+  printf '%s\n' "${item}: $*"
+}
+
+# Abort unless the script runs with an effective uid of 0.
+check_root() {
+  if [[ ${EUID} -ne 0 ]]; then
+    fatal "You should be root to run this tool"
+  fi
+}
+
+# Abort unless every argument is either an executable path or a command
+# that can be resolved through PATH.
+check_binary() {
+  local bin
+  for bin in "$@"; do
+    # Quoted so that paths containing spaces are tested as one word
+    [ -x "${bin}" ] && continue
+    # 'command -v' is a shell builtin; 'which' may not be installed
+    command -v "${bin}" >/dev/null 2>&1 || fatal "${bin} doesn't exists or is not executable"
+  done
+}
+
+# Find the logical CPUs of the physical core the benchmark should run on.
+#
+# With exactly one drive, the core is the one holding the first CPU listed
+# in the device's PCI local_cpulist; with several drives, CPU 0 is used and
+# a warning is printed.
+# Sets first_cores (space-separated), first_cores_count and taskset_cores
+# (comma-separated). Aborts when no CPU could be found.
+detect_first_core() {
+  local -a drive_list
+  local line cpu_id
+
+  # 'drives' holds a space-separated string: split it to count the devices
+  # shellcheck disable=SC2206
+  drive_list=(${drives[*]})
+
+  cpu_to_search="0"
+  if [ "${#drive_list[@]}" -eq 1 ]; then
+    device_name=$(block_dev_name "${drive_list[0]}")
+    device_dir="/sys/block/${device_name}/device/"
+    pci_addr=$(cat ${device_dir}/address)
+    pci_dir="/sys/bus/pci/devices/${pci_addr}/"
+    cpu_to_search=$(cat ${pci_dir}/local_cpulist | cut -d"," -f 1 | cut -d"-" -f 1)
+  else
+    hint 'Passed multiple devices. Running on the first core.'
+  fi
+  core_to_run=$(lscpu  --all -pSOCKET,CORE,CPU | grep ",$cpu_to_search\$" | cut -d"," -f1-2)
+
+  # Detect which logical cpus belongs to the first physical core
+  # If Hyperthreading is enabled, two cores are returned
+  # Anchored match: "0,1" must not also select "0,10,..." or "1,0,1"
+  cpus=$(lscpu  --all -pSOCKET,CORE,CPU | grep "^${core_to_run},")
+  for line in ${cpus}; do
+    # Each line is SOCKET,CORE,CPU; only the CPU field is needed
+    IFS=',' read -r _ _ cpu_id <<< "${line}"
+    if [ ${first_cores_count} -eq 0 ]; then
+      first_cores="${cpu_id}"
+    else
+      first_cores="${first_cores} ${cpu_id}"
+    fi
+
+    first_cores_count=$((first_cores_count + 1))
+  done
+  [ ${first_cores_count} -eq 0 ] && fatal "Cannot detect first core"
+  taskset_cores=$(echo "${first_cores}" | tr ' ' ',')
+}
+
+# Show the command-line help and exit successfully.
+usage() {
+  printf '%s\n' "usage: [options] block_device [other_block_devices]
+
+   -h         : print help
+   -l         : enable latency reporting
+
+   example:
+      t/one-core-peak.sh /dev/nvme0n1
+      t/one-core-peak.sh -l /dev/nvme0n1 /dev/nvme1n1
+  "
+  exit 0
+}
+
+# Parse the options (-h, -l) and store the remaining arguments, joined into
+# one string, in 'drives'. Aborts on an unknown option or when no drive is
+# given.
+check_args() {
+  local OPTIND option
+  while getopts "hl" option; do
+    if [ "${option}" = "h" ]; then
+      # Show help
+      usage
+    elif [ "${option}" = "l" ]; then
+      # Report latency
+      latency_cmdline="1"
+    else
+      fatal "Unsupported ${option} option"
+    fi
+  done
+  shift $((OPTIND - 1))
+  if [ $# -eq 0 ]; then
+    fatal "Missing drive(s) as argument"
+  fi
+  drives="$*"
+}
+
+# Abort unless $1 names an existing block device.
+check_drive_exists() {
+  # Quoted: with an empty argument, unquoted '[ -b ]' would evaluate to true
+  [ -b "$1" ] || fatal "$1 is not a valid block device"
+}
+
+# Succeed when the arguments, taken together, contain the text "nvme".
+is_nvme() {
+  case "$*" in
+    *nvme*) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+# Print a warning if the nvme poll queues aren't enabled.
+# Does nothing for non-nvme drives, or when the nvme module parameter
+# cannot be read.
+check_poll_queue() {
+  local param_file="/sys/module/nvme/parameters/poll_queues"
+
+  is_nvme ${drives} || return
+  # Without this guard an unreadable file leaves poll_queue empty and the
+  # numeric test below fails with a syntax error
+  [ -r "${param_file}" ] || return 0
+  poll_queue=$(cat "${param_file}")
+  if [ "${poll_queue}" = "0" ]; then
+    hint "For better performance, you should enable nvme poll queues by setting nvme.poll_queues=32 on the kernel commande line"
+  fi
+}
+
+# Print $1 with a leading "/dev/" removed.
+block_dev_name() {
+  local path=$1
+  echo ${path#"/dev/"}
+}
+
+# Print the /sys/block/ directory of the block device named $1.
+# Calls fatal when that directory does not exist.
+get_sys_block_dir() {
+  device_name=$1
+  sys_block_dir="/sys/block/${device_name}"
+  if [ ! -d "${sys_block_dir}" ]; then
+    fatal "Cannot find ${sys_block_dir} directory"
+  fi
+  echo ${sys_block_dir}
+}
+
+# Make sure the IO scheduler of device $1 is "none", switching it if not.
+# Aborts when the scheduler file is missing.
+check_io_scheduler() {
+  device_name=$(block_dev_name $1)
+  sys_block_dir=$(get_sys_block_dir ${device_name})
+  sched_file="${sys_block_dir}/queue/scheduler"
+  if [ ! -f "${sched_file}" ]; then
+    fatal "Cannot find IO scheduler for ${device_name}"
+  fi
+  # The active scheduler is the one shown in square brackets
+  if ! grep -q '\[none\]' ${sched_file}; then
+    info "${device_name}" "set none as io scheduler"
+    echo "none" > ${sched_file}
+  fi
+}
+
+# Set the sysfs attribute $2 of block device $1 to the value $3.
+# Nothing happens when the attribute file is missing or already holds the
+# value; otherwise the outcome of the write is reported.
+check_sysblock_value() {
+  device_name=$(block_dev_name $1)
+  sys_block_dir=$(get_sys_block_dir ${device_name})
+  target_file="${sys_block_dir}/$2"
+  value=$3
+  [ -f "${target_file}" ] || return
+  content=$(cat ${target_file} 2>/dev/null)
+  # Already at the requested value: nothing to do
+  [ "${content}" = "${value}" ] && return
+  echo ${value} > ${target_file} 2>/dev/null \
+    && info "${device_name}" "${target_file} set to ${value}." \
+    || hint "${device_name}: Cannot set ${value} on ${target_file}"
+}
+
+compute_nb_threads() {
+  # Increase the number of threads if there are more devices or cores than the default value.
+  # The arguments are the tested drives, so $# is the number of devices.
+  [ $# -gt ${nb_threads} ] && nb_threads=$#
+  [ ${first_cores_count} -gt ${nb_threads} ] && nb_threads=${first_cores_count}
+}
+
+check_scaling_governor() {
+  driver=$(LC_ALL=C cpupower frequency-info |grep "driver:" |awk '{print $2}')
+  if [ -z "${driver}" ]; then
+    hint "Cannot detect processor scaling driver"
+    return
+  fi
+  cpupower frequency-set -g performance >/dev/null 2>&1 || fatal "Cannot set scaling processor governor"
+}
+
+check_idle_governor() {
+  filename="/sys/devices/system/cpu/cpuidle/current_governor"
+  if [ ! -f "${filename}" ]; then
+    hint "Cannot detect cpu idle governor"
+    return
+  fi
+  echo "menu" > ${filename} 2>/dev/null || fatal "Cannot set cpu idle governor to menu"
+}
+
+show_nvme() {
+  device="$1"
+  device_name=$(block_dev_name $1)
+  device_dir="/sys/block/${device_name}/device/"
+  pci_addr=$(cat ${device_dir}/address)
+  pci_dir="/sys/bus/pci/devices/${pci_addr}/"
+  link_speed=$(cat ${pci_dir}/current_link_speed)
+  irq=$(cat ${pci_dir}/irq)
+  numa=$([ -f ${pci_dir}/numa_node ] && cat ${pci_dir}/numa_node || echo "off")
+  cpus=$(cat ${pci_dir}/local_cpulist)
+  model=$(cat ${device_dir}/model | xargs) #xargs for trimming spaces
+  fw=$(cat ${device_dir}/firmware_rev | xargs) #xargs for trimming spaces
+  serial=$(cat ${device_dir}/serial | xargs) #xargs for trimming spaces
+  info ${device_name} "MODEL=${model} FW=${fw} serial=${serial} PCI=${pci_addr}@${link_speed} IRQ=${irq} NUMA=${numa} CPUS=${cpus} "
+  which nvme &> /dev/null
+  if [ $? -eq 0 ]; then
+    status=""
+    NCQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NCQA |cut -d ':' -f 2 | xargs)
+    [ -n "${NCQA}" ] && status="${status}Completion Queues:${NCQA}, "
+    NSQA=$(nvme get-feature -H -f 0x7 ${device} 2>&1 |grep NSQA |cut -d ':' -f 2 | xargs)
+    [ -n "${NSQA}" ] && status="${status}Submission Queues:${NSQA}, "
+    power_state=$(nvme get-feature -H -f 0x2 ${device} 2>&1 | grep PS |cut -d ":" -f 2 | xargs)
+    [ -n "${power_state}" ] && status="${status}PowerState:${power_state}, "
+    apste=$(nvme get-feature -H -f 0xc ${device} 2>&1 | grep APSTE |cut -d ":" -f 2 | xargs)
+    [ -n "${apste}" ] && status="${status} Autonomous Power State Transition:${apste}, "
+    temp=$(nvme smart-log ${device} 2>&1 |grep 'temperature' |cut -d ':' -f 2 |xargs)
+    [ -n "${temp}" ] && status="${status}Temp:${temp}"
+    info ${device_name} "${status}"
+  fi
+}
+
+show_device() {
+  # Print the details of device $1; only nvme devices are reported
+  device_name=$(block_dev_name $1)
+  is_nvme $1 && show_nvme $1
+}
+
+show_kernel_config_item() {
+  config_item="CONFIG_$1"
+  config_file="/boot/config-$(uname -r)"
+  if [ ! -f "${config_file}" ]; then
+    config_file='/proc/config.gz'
+    if [ ! -f "${config_file}" ]; then
+      return
+    fi
+  fi
+  status=$(zgrep ${config_item}= ${config_file})
+  if [ -z "${status}" ]; then
+    echo "${config_item}=N"
+  else
+    echo "${config_item}=$(echo ${status} | cut -d '=' -f 2)"
+  fi
+}
+
+show_system() {
+  CPU_MODEL=$(grep -m1 "model name" /proc/cpuinfo | awk '{print substr($0, index($0,$4))}')
+  MEMORY_SPEED=$(dmidecode -t 17 -q | grep -m 1 "Configured Memory Speed: [0-9]" | awk '{print substr($0, index($0,$4))}')
+  KERNEL=$(uname -r)
+  info "system" "CPU: ${CPU_MODEL}"
+  info "system" "MEMORY: ${MEMORY_SPEED}"
+  info "system" "KERNEL: ${KERNEL}"
+  for config_item in BLK_CGROUP BLK_WBT_MQ HZ RETPOLINE PAGE_TABLE_ISOLATION; do
+    info "system" "KERNEL: $(show_kernel_config_item ${config_item})"
+  done
+  info "system" "KERNEL: $(cat /proc/cmdline)"
+  info "system" "SElinux: $(getenforce)"
+  tsc=$(journalctl -k | grep 'tsc: Refined TSC clocksource calibration:' | awk '{print $11}')
+  if [ -n "${tsc}" ]; then
+    info "system" "TSC: ${tsc} Mhz"
+    tsc=$(echo ${tsc} | tr -d '.')
+    [ -n "${latency_cmdline}" ] && latency_cmdline="-t1 -T${tsc}000"
+  fi
+}
+
+### MAIN
+check_args ${args}
+check_root
+check_binary t/io_uring lscpu grep taskset cpupower awk tr xargs dmidecode
+detect_first_core
+
+info "##################################################"
+show_system
+for drive in ${drives}; do
+  check_drive_exists ${drive}
+  check_io_scheduler ${drive}
+  check_sysblock_value ${drive} "queue/iostats" 0 # Ensure iostats are disabled
+  check_sysblock_value ${drive} "queue/nomerges" 2 # Ensure merge are disabled
+  check_sysblock_value ${drive} "queue/io_poll" 1 # Ensure io_poll is enabled
+  check_sysblock_value ${drive} "queue/wbt_lat_usec" 0 # Disabling wbt lat
+  show_device ${drive}
+done
+
+check_poll_queue
+compute_nb_threads ${drives}
+check_scaling_governor
+check_idle_governor
+
+info "##################################################"
+echo
+
+cmdline="taskset -c ${taskset_cores} t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B1 -n${nb_threads} ${latency_cmdline} ${drives}"
+info "io_uring" "Running ${cmdline}"
+${cmdline}
index a59cdfe054ee5ab60258930ef558b5d3f1396f12..612e50ca6ae83d6785f1be3726ce3689755110f8 100755 (executable)
@@ -49,6 +49,7 @@ import shutil
 import logging
 import argparse
 import platform
+import traceback
 import subprocess
 import multiprocessing
 from pathlib import Path
@@ -1057,9 +1058,16 @@ def main():
                 skipped = skipped + 1
                 continue
 
-        test.setup(artifact_root, config['test_id'])
-        test.run()
-        test.check_result()
+        try:
+            test.setup(artifact_root, config['test_id'])
+            test.run()
+            test.check_result()
+        except KeyboardInterrupt:
+            break
+        except Exception as e:
+            test.passed = False
+            test.failure_reason += str(e)
+            logging.debug("Test %d exception:\n%s\n", config['test_id'], traceback.format_exc())
         if test.passed:
             result = "PASSED"
             passed = passed + 1
index 08a2c629e842cc26e3666f992396ae23ecfdff37..e4e248b9ff26a0c99fe564598455ab4a86d5ac5f 100644 (file)
@@ -64,6 +64,32 @@ check_blkzone() {
        fi
 }
 
+# Check zone capacity of each zone and report block size aligned to the zone
+# capacities. If zone capacity is same as zone size for zones, report zone size.
+# $1 is the device and $2 the zone size; the result is printed on stdout.
+zone_cap_bs() {
+       local dev="${1}"
+       local zone_size="${2}"
+       # Prints "<len> <cap>" for every blkzone report line holding both fields
+       local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p'
+       local cap bs="$zone_size"
+
+       # When blkzone is not available or blkzone does not report capacity,
+       # assume that zone capacity is same as zone size for all zones.
+       if [ -z "${blkzone}" ] || ! blkzone_reports_capacity "${dev}"; then
+               echo "$zone_size"
+               return
+       fi
+
+       # line[0] is the "len" value and line[1] the "cap" value of one zone
+       while read -r -a line; do
+               # A zone whose capacity equals its length adds no constraint
+               ((line[0] == line[1])) && continue
+               # NOTE(review): the factor 512 presumably converts sectors to
+               # bytes - confirm against the blkzone report units
+               cap=$((line[1] * 512))
+               # Halve bs until it divides cap, without going below 512
+               while ((bs > 512 && cap % bs)); do
+                       bs=$((bs / 2))
+               done
+       done < <(blkzone report "${dev}" | sed -n "${sed_str}")
+
+       echo "$bs"
+}
+
 # Reports the starting sector and length of the first sequential zone of device
 # $1.
 first_sequential_zone() {
index db901179493d242a9478e6a6af6a1a7a92b78bf5..7d2c7fa8fc2e5110bfc00d63ce3e7bcc83b4c88c 100755 (executable)
@@ -19,7 +19,6 @@ usage()
        echo -e "\t-L List the device layouts for every section without running"
        echo -e "\t   tests."
        echo -e "\t-s <#section> Only run the section with the given number."
-       echo -e "\t-l Use libzbc ioengine to run the tests."
        echo -e "\t-t <#test> Only run the test with the given number in every section."
        echo -e "\t-o <max_open_zones> Specify MaxOpen value, (${set_max_open} by default)."
        echo -e "\t-n <#number of runs> Set the number of times to run the entire suite "
@@ -239,7 +238,6 @@ dev_size=1024
 dev_blocksize=4096
 set_max_open=8
 zbd_test_opts=()
-libzbc=0
 num_of_runs=1
 test_case=0
 quit_on_err=0
@@ -250,7 +248,6 @@ while (($#)); do
                -o) set_max_open="${2}"; shift; shift;;
                -L) list_only=1; shift;;
                -r) cleanup_nullb; exit 0;;
-               -l) libzbc=1; shift;;
                -n) num_of_runs="${2}"; shift; shift;;
                -t) test_case="${2}"; shift; shift;;
                -q) quit_on_err=1; shift;;
@@ -311,17 +308,6 @@ while ((run_nr <= $num_of_runs)); do
                        exit 1
                fi
                show_nullb_config
-               if ((libzbc)); then
-                       if ((zone_capacity < zone_size)); then
-                               echo "libzbc doesn't support zone capacity, skipping section $(printf "%02d" $section_number)"
-                               continue
-                       fi
-                       if ((conv_pcnt == 100)); then
-                               echo "libzbc only supports zoned devices, skipping section $(printf "%02d" $section_number)"
-                               continue
-                       fi
-                       zbd_test_opts+=("-l")
-               fi
                cd "${scriptdir}"
                ((intr)) && exit 1
                ((list_only)) && continue
index 57e6d05ea73744858d56ee86341963460c7c0cf6..7e2fff00dac6adc4cc8bc08b7423ab9b7081ab3f 100755 (executable)
@@ -12,6 +12,7 @@ usage() {
        echo -e "\t-v Run fio with valgrind --read-var-info option"
        echo -e "\t-l Test with libzbc ioengine"
        echo -e "\t-r Reset all zones before test start"
+       echo -e "\t-w Reset all zones before executing each write test case"
        echo -e "\t-o <max_open_zones> Run fio with max_open_zones limit"
        echo -e "\t-t <test #> Run only a single test case with specified number"
        echo -e "\t-q Quit the test run after any failed test"
@@ -182,13 +183,14 @@ run_fio_on_seq() {
     run_one_fio_job "${opts[@]}" "$@"
 }
 
-# Prepare for write test by resetting zones. When max_open_zones option is
-# specified, reset all zones of the test target to ensure that zones out of the
-# test target range do not have open zones. This allows the write test to the
-# target range to be able to open zones up to max_open_zones.
+# Prepare for write test by resetting zones. When reset_before_write or
+# max_open_zones option is specified, reset all zones of the test target to
+# ensure that zones out of the test target range do not have open zones. This
+# allows the write test to the target range to be able to open zones up to
+# max_open_zones limit specified as the option or obtained from sysfs.
 prep_write() {
-       [[ -n "${max_open_zones_opt}" && -n "${is_zbd}" ]] &&
-               reset_zone "${dev}" -1
+       [[ -n "${reset_before_write}" || -n "${max_open_zones_opt}" ]] &&
+               [[ -n "${is_zbd}" ]] && reset_zone "${dev}" -1
 }
 
 SKIP_TESTCASE=255
@@ -310,7 +312,8 @@ test4() {
     off=$((first_sequential_zone_sector * 512 + 129 * zone_size))
     size=$((zone_size))
     [ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
-    opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=$size")
+    opts+=("--name=$dev" "--filename=$dev" "--offset=$off")
+    opts+=(--bs="$(min $((logical_block_size * 256)) $size)")
     opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1")
     opts+=("$(ioengine "psync")" "--rw=read" "--direct=1" "--disable_lat=1")
     opts+=("--zonemode=zbd" "--zonesize=${zone_size}")
@@ -320,15 +323,15 @@ test4() {
 
 # Sequential write to sequential zones.
 test5() {
-    local size off capacity
+    local size off capacity bs
 
     prep_write
     off=$((first_sequential_zone_sector * 512))
     capacity=$(total_zone_capacity 4 $off $dev)
     size=$((4 * zone_size))
+    bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs")
     run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write        \
-                  --bs="$(max $((zone_size / 64)) "$logical_block_size")"\
-                  --do_verify=1 --verify=md5                           \
+                  --bs="$bs" --do_verify=1 --verify=md5 \
                   >>"${logfile}.${test_number}" 2>&1 || return $?
     check_written $capacity || return $?
     check_read $capacity || return $?
@@ -336,18 +339,18 @@ test5() {
 
 # Sequential read from sequential zones.
 test6() {
-    local size off capacity
+    local size off capacity bs
 
     prep_write
     off=$((first_sequential_zone_sector * 512))
     capacity=$(total_zone_capacity 4 $off $dev)
     size=$((4 * zone_size))
+    bs=$(min "$(max $((zone_size / 64)) "$logical_block_size")" "$zone_cap_bs")
     write_and_run_one_fio_job \
            $((first_sequential_zone_sector * 512)) "${size}" \
            --offset="${off}" \
            --size="${size}" --zonemode=zbd --zonesize="${zone_size}" \
-           "$(ioengine "psync")" --iodepth=1 --rw=read \
-           --bs="$(max $((zone_size / 64)) "$logical_block_size")" \
+           "$(ioengine "psync")" --iodepth=1 --rw=read --bs="$bs" \
            >>"${logfile}.${test_number}" 2>&1 || return $?
     check_read $capacity || return $?
 }
@@ -485,7 +488,7 @@ test14() {
 
 # Sequential read on a mix of empty and full zones.
 test15() {
-    local i off size
+    local i off size bs
     local w_off w_size w_capacity
 
     for ((i=0;i<4;i++)); do
@@ -499,8 +502,9 @@ test15() {
     w_capacity=$(total_zone_capacity 2 $w_off $dev)
     off=$((first_sequential_zone_sector * 512))
     size=$((4 * zone_size))
+    bs=$(min $((zone_size / 16)) "$zone_cap_bs")
     write_and_run_one_fio_job "${w_off}" "${w_size}" \
-                   "$(ioengine "psync")" --rw=read --bs=$((zone_size / 16)) \
+                   "$(ioengine "psync")" --rw=read --bs="$bs" \
                    --zonemode=zbd --zonesize="${zone_size}" --offset=$off \
                    --size=$((size)) >>"${logfile}.${test_number}" 2>&1 ||
        return $?
@@ -852,7 +856,7 @@ test37() {
        off=$(((first_sequential_zone_sector - 1) * 512))
     fi
     size=$((zone_size + 2 * 512))
-    bs=$((zone_size / 4))
+    bs=$(min $((zone_size / 4)) "$zone_cap_bs")
     run_one_fio_job --offset=$off --size=$size "$(ioengine "psync")"   \
                    --iodepth=1 --rw=write --do_verify=1 --verify=md5   \
                    --bs=$bs --zonemode=zbd --zonesize="${zone_size}"   \
@@ -1215,10 +1219,37 @@ test57() {
                >> "${logfile}.${test_number}" 2>&1 || return $?
 }
 
+# Random writes and random trims to sequential write required zones for 30s.
+# Skipped unless require_seq_zones 128 succeeds.
+test58() {
+    local off size bs
+
+    require_seq_zones 128 || return $SKIP_TESTCASE
+
+    size=$((zone_size * 128))
+    bs="$(max $((zone_size / 128)) "$logical_block_size")"
+    prep_write
+    off=$((first_sequential_zone_sector * 512))
+    # One fio run with three jobs: "precondition" writes the first 16 zones
+    # sequentially, then "wjob" (random writes, libaio, iodepth 8) and
+    # "trimjob" (random trims of zone_size blocks) both wait for it and run
+    # time based for 30s over the same 128 zones.
+    run_fio --zonemode=zbd --direct=1 --zonesize="${zone_size}" --thread=1 \
+           --filename="${dev}" --norandommap=1 \
+            --name="precondition"  --rw=write "$(ioengine "psync")" \
+            --offset="${off}" --size=$((zone_size * 16)) --bs="${bs}" \
+           "${job_var_opts[@]}" \
+           --name=wjob --wait_for="precondition" --rw=randwrite \
+           "$(ioengine "libaio")" --iodepth=8 \
+           --offset="${off}" --size="${size}" --bs="${bs}" \
+           --time_based --runtime=30s --flow=128 "${job_var_opts[@]}" \
+           --name=trimjob --wait_for="precondition" --rw=randtrim \
+           "$(ioengine "psync")" \
+           --offset="${off}" --size="${size}" --bs="${zone_size}" \
+           --time_based --runtime=30s --flow=1 "${job_var_opts[@]}" \
+           >>"${logfile}.${test_number}" 2>&1
+}
+
 SECONDS=0
 tests=()
 dynamic_analyzer=()
 reset_all_zones=
+reset_before_write=
 use_libzbc=
 zbd_debug=
 max_open_zones_opt=
@@ -1233,6 +1264,7 @@ while [ "${1#-}" != "$1" ]; do
        shift;;
     -l) use_libzbc=1; shift;;
     -r) reset_all_zones=1; shift;;
+    -w) reset_before_write=1; shift;;
     -t) tests+=("$2"); shift; shift;;
     -o) max_open_zones_opt="${2}"; shift; shift;;
     -v) dynamic_analyzer=(valgrind "--read-var-info=yes");
@@ -1351,6 +1383,8 @@ fi
 echo -n "First sequential zone starts at sector $first_sequential_zone_sector;"
 echo " zone size: $((zone_size >> 20)) MB"
 
+zone_cap_bs=$(zone_cap_bs "$dev" "$zone_size")
+
 if [ "${#tests[@]}" = 0 ]; then
     readarray -t tests < <(declare -F | grep "test[0-9]*" | \
                                   tr -c -d "[:digit:]\n" | sort -n)
index 05c2d1383e68403190d175608b995ece11d958fa..8f4c8a5996aefa912d13f6ec263d82d9d05db033 100644 (file)
@@ -31,11 +31,25 @@ enum fio_memtype {
        MEM_CUDA_MALLOC,/* use GPU memory */
 };
 
+/*
+ * What mode to use for deduped data generation
+ */
+enum dedupe_mode {
+       DEDUPE_MODE_REPEAT = 0,
+       DEDUPE_MODE_WORKING_SET = 1,
+};
+
 #define ERROR_STR_MAX  128
 
 #define BSSPLIT_MAX    64
 #define ZONESPLIT_MAX  256
 
+/*
+ * Two parallel arrays of split values, each holding up to ZONESPLIT_MAX
+ * entries.
+ * NOTE(review): nr is presumably the number of valid entries in val1[] and
+ * val2[]; confirm against split_parse_ddir().
+ */
+struct split {
+       unsigned int nr;
+       unsigned long long val1[ZONESPLIT_MAX];
+       unsigned long long val2[ZONESPLIT_MAX];
+};
+
 struct bssplit {
        uint64_t bs;
        uint32_t perc;
@@ -177,10 +191,6 @@ struct thread_options {
 
        unsigned int hugepage_size;
        unsigned long long rw_min_bs;
-       unsigned int thinktime;
-       unsigned int thinktime_spin;
-       unsigned int thinktime_blocks;
-       unsigned int thinktime_blocks_type;
        unsigned int fsync_blocks;
        unsigned int fdatasync_blocks;
        unsigned int barrier_blocks;
@@ -243,6 +253,8 @@ struct thread_options {
        unsigned int compress_percentage;
        unsigned int compress_chunk;
        unsigned int dedupe_percentage;
+       unsigned int dedupe_mode;
+       unsigned int dedupe_working_set_percentage;
        unsigned int time_based;
        unsigned int disable_lat;
        unsigned int disable_clat;
@@ -287,6 +299,12 @@ struct thread_options {
        char *exec_prerun;
        char *exec_postrun;
 
+       unsigned int thinktime;
+       unsigned int thinktime_spin;
+       unsigned int thinktime_blocks;
+       unsigned int thinktime_blocks_type;
+       unsigned int thinktime_iotime;
+
        uint64_t rate[DDIR_RWDIR_CNT];
        uint64_t ratemin[DDIR_RWDIR_CNT];
        unsigned int ratecycle;
@@ -358,6 +376,9 @@ struct thread_options {
        unsigned int ignore_zone_limits;
        fio_fp64_t zrt;
        fio_fp64_t zrf;
+
+       unsigned int log_entries;
+       unsigned int log_prio;
 };
 
 #define FIO_TOP_STR_MAX                256
@@ -486,10 +507,6 @@ struct thread_options_pack {
 
        uint32_t hugepage_size;
        uint64_t rw_min_bs;
-       uint32_t thinktime;
-       uint32_t thinktime_spin;
-       uint32_t thinktime_blocks;
-       uint32_t thinktime_blocks_type;
        uint32_t fsync_blocks;
        uint32_t fdatasync_blocks;
        uint32_t barrier_blocks;
@@ -549,6 +566,8 @@ struct thread_options_pack {
        uint32_t compress_percentage;
        uint32_t compress_chunk;
        uint32_t dedupe_percentage;
+       uint32_t dedupe_mode;
+       uint32_t dedupe_working_set_percentage;
        uint32_t time_based;
        uint32_t disable_lat;
        uint32_t disable_clat;
@@ -592,6 +611,12 @@ struct thread_options_pack {
        uint8_t exec_prerun[FIO_TOP_STR_MAX];
        uint8_t exec_postrun[FIO_TOP_STR_MAX];
 
+       uint32_t thinktime;
+       uint32_t thinktime_spin;
+       uint32_t thinktime_blocks;
+       uint32_t thinktime_blocks_type;
+       uint32_t thinktime_iotime;
+
        uint64_t rate[DDIR_RWDIR_CNT];
        uint64_t ratemin[DDIR_RWDIR_CNT];
        uint32_t ratecycle;
@@ -631,7 +656,6 @@ struct thread_options_pack {
        uint64_t latency_target;
        uint64_t latency_window;
        uint64_t max_latency[DDIR_RWDIR_CNT];
-       uint32_t pad5;
        fio_fp64_t latency_percentile;
        uint32_t latency_run;
 
@@ -659,6 +683,9 @@ struct thread_options_pack {
        uint32_t zone_mode;
        int32_t max_open_zones;
        uint32_t ignore_zone_limits;
+
+       uint32_t log_entries;
+       uint32_t log_prio;
 } __attribute__((packed));
 
 extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
@@ -666,4 +693,13 @@ extern void convert_thread_options_to_net(struct thread_options_pack *top, struc
 extern int fio_test_cconv(struct thread_options *);
 extern void options_default_fill(struct thread_options *o);
 
+typedef int (split_parse_fn)(struct thread_options *, void *,
+                            enum fio_ddir, char *, bool);
+
+extern int str_split_parse(struct thread_data *td, char *str,
+                          split_parse_fn *fn, void *eo, bool data);
+
+extern int split_parse_ddir(struct thread_options *o, struct split *split,
+                           char *str, bool absolute, unsigned int max_splits);
+
 #endif
diff --git a/tools/fiograph/fiograph.conf b/tools/fiograph/fiograph.conf
new file mode 100644 (file)
index 0000000..cfd2fd8
--- /dev/null
@@ -0,0 +1,105 @@
+[fio_jobs]
+header=<<B><font color="{}"> {} </font></B> >
+header_color=black
+text_color=darkgreen
+shape=box
+shape_color=blue
+style=rounded
+title_style=<<table border='0' cellborder='0' cellspacing='1'> <tr> <td align='center'> <b> {} </b> </td> </tr>
+item_style=<tr> <td align = "left"> <font color="{}" > {} </font> </td> </tr>
+cluster_style=filled
+cluster_color=gainsboro
+
+[exec_prerun]
+text_color=red
+
+[exec_postrun]
+text_color=red
+
+[numjobs]
+text_color=red
+style=<font color="{}" > x {} </font>
+
+[ioengine]
+text_color=darkblue
+specific_options_color=darkblue
+
+# definitions of engine's specific options
+
+[ioengine_cpuio]
+specific_options=cpuload cpumode cpuchunks exit_on_io_done
+
+[ioengine_dfs]
+specific_options=pool  cont  chunk_size  object_class  svcl
+
+[ioengine_e4defrag]
+specific_options=donorname  inplace
+
+[ioengine_exec]
+specific_options=program arguments grace_time std_redirect
+
+[ioengine_filestat]
+specific_options=stat_type
+
+[ioengine_single-instance]
+specific_options=volume  brick
+
+[ioengine_http]
+specific_options=https  http_host  http_user  http_pass  http_s3_key  http_s3_keyid  http_swift_auth_token  http_s3_region  http_mode  http_verbose
+
+[ioengine_ime_aio]
+specific_options=ime_psync  ime_psyncv
+
+[ioengine_io_uring]
+specific_options=hipri  cmdprio_percentage  cmdprio_class  cmdprio  cmdprio_bssplit  fixedbufs  registerfiles  sqthread_poll  sqthread_poll_cpu  nonvectored  uncached  nowait  force_async
+
+[ioengine_libaio]
+specific_options=userspace_reap  cmdprio_percentage  cmdprio_class  cmdprio  cmdprio_bssplit  nowait
+
+[ioengine_libcufile]
+specific_options=gpu_dev_ids  cuda_io
+
+[ioengine_libhdfs]
+specific_options=namenode  hostname  port  hdfsdirectory  chunk_size  single_instance  hdfs_use_direct
+
+[ioengine_libiscsi]
+specific_options=initiator
+
+[ioengine_librpma_apm_server]
+specific_options=librpma_apm_client
+
+[ioengine_busy_wait_polling]
+specific_options=serverip  port  direct_write_to_pmem
+
+[ioengine_librpma_gpspm_server]
+specific_options=librpma_gpspm_client
+
+[ioengine_mmap]
+specific_options=thp
+
+[ioengine_mtd]
+specific_options=skip_bad
+
+[ioengine_nbd]
+specific_options=uri
+
+[ioengine_net]
+specific_options=hostname  port  protocol  nodelay  listen  pingpong  interface  ttl  window_size  mss  netsplice
+
+[ioengine_nfs]
+specific_options=nfs_url
+
+[ioengine_rados]
+specific_options=clustername  pool  clientname  busy_poll  touch_objects
+
+[ioengine_rbd]
+specific_options=clustername  rbdname  pool  clientname  busy_poll
+
+[ioengine_rdma]
+specific_options=hostname  bindname  port  verb
+
+[ioengine_sg]
+specific_options=hipri  readfua  writefua  sg_write_mode  sg
+
+[ioengine_pvsync2]
+specific_options=hipri  hipri_percentage  uncached  nowait  sync  psync  vsync  pvsync
diff --git a/tools/fiograph/fiograph.py b/tools/fiograph/fiograph.py
new file mode 100755 (executable)
index 0000000..b5669a2
--- /dev/null
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+from graphviz import Digraph
+import argparse
+import configparser
+import os
+
+config_file = None
+fio_file = None
+
+
+def get_section_option(section_name, option_name, default=None):
+    global fio_file
+    if fio_file.has_option(section_name, option_name):
+        return fio_file[section_name][option_name]
+    return default
+
+
+def get_config_option(section_name, option_name, default=None):
+    global config_file
+    if config_file.has_option(section_name, option_name):
+        return config_file[section_name][option_name]
+    return default
+
+
+def get_header_color(keyword='fio_jobs', default_color='black'):
+    return get_config_option(keyword, 'header_color', default_color)
+
+
+def get_shape_color(keyword='fio_jobs', default_color='black'):
+    return get_config_option(keyword, 'shape_color', default_color)
+
+
+def get_text_color(keyword='fio_jobs', default_color='black'):
+    return get_config_option(keyword, 'text_color', default_color)
+
+
+def get_cluster_color(keyword='fio_jobs', default_color='gray92'):
+    return get_config_option(keyword, 'cluster_color', default_color)
+
+
+def get_header(keyword='fio_jobs'):
+    return get_config_option(keyword, 'header')
+
+
+def get_shape(keyword='fio_jobs'):
+    return get_config_option(keyword, 'shape', 'box')
+
+
+def get_style(keyword='fio_jobs'):
+    return get_config_option(keyword, 'style', 'rounded')
+
+
+def get_cluster_style(keyword='fio_jobs'):
+    return get_config_option(keyword, 'cluster_style', 'filled')
+
+
+def get_specific_options(engine):
+    if not engine:
+        return ''
+    return get_config_option('ioengine_{}'.format(engine), 'specific_options', '').split(' ')
+
+
+def render_option(section, label, display, option, color_override=None):
+    # These options are already shown with graphical helpers, no need to report them directly
+    skip_list = ['size', 'stonewall', 'runtime', 'time_based',
+                 'numjobs', 'wait_for', 'wait_for_previous']
+    # If the option doesn't exist or if a special handling is already done
+    # don't render it, just return the current state
+    if option in skip_list or option not in section:
+        return label, display
+    display = option
+    if section[option]:
+        display = '{} = {}'.format(display, section[option])
+
+    # Adding jobs's options into the box, darkgreen is the default color
+    if color_override:
+        color = color_override
+    else:
+        color = get_text_color(option, get_text_color('fio_jobs', 'darkgreen'))
+    label += get_config_option('fio_jobs',
+                               'item_style').format(color, display)
+    return label, display
+
+
+def render_options(fio_file, section_name):
+    """Render all options of a section."""
+    display = section_name
+    section = fio_file[section_name]
+
+    # Add a multiplier to the section_name if numjobs is set
+    numjobs = int(get_section_option(section_name, 'numjobs', '1'))
+    if numjobs > 1:
+        display = display + \
+            get_style('numjobs').format(
+                get_text_color('numjobs'), numjobs)
+
+    # Header of the box
+    label = get_config_option('fio_jobs', 'title_style').format(display)
+
+    # Let's parse all the options of the current fio thread
+    # Some needs to be printed on top or bottom of the job to ease the read
+    to_early_print = ['exec_prerun', 'ioengine']
+    to_late_print = ['exec_postrun']
+
+    # Let's print the options on top of the box
+    for early_print in to_early_print:
+        label, display = render_option(
+            section, label, display, early_print)
+
+    current_io_engine = get_section_option(
+        section_name, 'ioengine', None)
+    if current_io_engine:
+        # Let's print all specifics options for this engine
+        for specific_option in sorted(get_specific_options(current_io_engine)):
+            label, display = render_option(
+                section, label, display, specific_option, get_config_option('ioengine', 'specific_options_color'))
+
+    # Let's print generic options sorted by name
+    for option in sorted(section):
+        if option in to_early_print or option in to_late_print or option in get_specific_options(current_io_engine):
+            continue
+        label, display = render_option(section, label, display, option)
+
+    # let's print options on the bottom of the box
+    for late_print in to_late_print:
+        label, display = render_option(
+            section, label, display, late_print)
+
+    # End of the box content
+    label += '</table>>'
+    return label
+
+
+def render_section(current_graph, fio_file, section_name, label):
+    """Render the section."""
+    attr = None
+    section = fio_file[section_name]
+
+    # Let's render the box associated to a job
+    current_graph.node(section_name, label,
+                       shape=get_shape(),
+                       color=get_shape_color(),
+                       style=get_style())
+
+    # Let's report the duration of the jobs with a self-loop arrow
+    if 'runtime' in section and 'time_based' in section:
+        attr = 'runtime={}'.format(section['runtime'])
+    elif 'size' in section:
+        attr = 'size={}'.format(section['size'])
+    if attr:
+        current_graph.edge(section_name, section_name, attr)
+
+
+def create_sub_graph(name):
+    """Return a new graph."""
+    # We need to put 'cluster' in the name to ensure graphviz consider it as a cluster
+    cluster_name = 'cluster_' + name
+    # Unset the main graph labels to avoid a recopy in each subgraph
+    attr = {}
+    attr['label'] = ''
+    new_graph = Digraph(name=cluster_name, graph_attr=attr)
+    new_graph.attr(style=get_cluster_style(),
+                   color=get_cluster_color())
+    return new_graph
+
+
+def create_legend():
+    """Return a legend."""
+    html_table = "<<table border='0' cellborder='1' cellspacing='0' cellpadding='4'>"
+    html_table += '<tr><td COLSPAN="2"><b>Legend</b></td></tr>'
+    legend_item = '<tr> <td>{}</td> <td><font color="{}">{}</font></td></tr>"'
+    legend_bgcolor_item = '<tr><td>{}</td><td BGCOLOR="{}"></td></tr>'
+    html_table += legend_item.format('numjobs',
+                                     get_text_color('numjobs'), 'x numjobs')
+    html_table += legend_item.format('generic option',
+                                     get_text_color(), 'generic option')
+    html_table += legend_item.format('ioengine option',
+                                     get_text_color('ioengine'), 'ioengine option')
+    html_table += legend_bgcolor_item.format('job', get_shape_color())
+    html_table += legend_bgcolor_item.format(
+        'execution group', get_cluster_color())
+    html_table += '</table>>'
+    legend = Digraph('html_table')
+    legend.node('legend', shape='none', label=html_table)
+    return legend
+
+
+def fio_to_graphviz(filename, format):
+    """Compute the graphviz graph from the fio file."""
+
+    # Let's read the fio file
+    global fio_file
+    fio_file = configparser.RawConfigParser(
+        allow_no_value=True,
+        default_section="global",
+        inline_comment_prefixes="'#', ';'")
+    fio_file.read(filename)
+
+    # Prepare the main graph object
+    # Let's define the header of the document
+    attrs = {}
+    attrs['labelloc'] = 't'
+    attrs['label'] = get_header().format(
+        get_header_color(), os.path.basename(filename))
+    main_graph = Digraph(engine='dot', graph_attr=attrs, format=format)
+
+    # Let's add a legend
+    main_graph.subgraph(create_legend())
+
+    # By default all jobs are run in parallel and depends on "global"
+    depends_on = fio_file.default_section
+
+    # The previous section is by default the global section
+    previous_section = fio_file.default_section
+
+    current_graph = main_graph
+
+    # The first job will be a new execution group
+    new_execution_group = True
+
+    # Let's interate on all sections to create links between them
+    for section_name in fio_file.sections():
+        # The current section
+        section = fio_file[section_name]
+
+        # If the current section is waiting the previous job
+        if ('stonewall' or 'wait_for_previous') in section:
+            # let's remember what was the previous job we depend on
+            depends_on = previous_section
+            new_execution_group = True
+        elif 'wait_for' in section:
+            # This sections depends on a named section pointed by wait_for
+            depends_on = section['wait_for']
+            new_execution_group = True
+
+        if new_execution_group:
+            # Let's link the current graph with the main one
+            main_graph.subgraph(current_graph)
+            # Let's create a new graph to represent all the incoming jobs running at the same time
+            current_graph = create_sub_graph(section_name)
+
+        # Let's render the current section in its execution group
+        render_section(current_graph, fio_file, section_name,
+                       render_options(fio_file, section_name))
+
+        # Let's trace the link between this job and the one it depends on
+        # If we depend on 'global', we can avoid doing adding an arrow as we don't want to see 'global'
+        if depends_on != fio_file.default_section:
+            current_graph.edge(depends_on, section_name)
+
+        # The current section become the parent of the next one
+        previous_section = section_name
+
+        # We are by default in the same execution group
+        new_execution_group = False
+
+    # The last subgraph isn't rendered yet
+    main_graph.subgraph(current_graph)
+
+    # Let's return the main graphviz object
+    return main_graph
+
+
+def setup_commandline():
+    "Prepare the command line."
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--file', action='store',
+                        type=str,
+                        required=True,
+                        help='the fio file to graph')
+    parser.add_argument('--output', action='store',
+                        type=str,
+                        help='the output filename')
+    parser.add_argument('--format', action='store',
+                        type=str,
+                        default='png',
+                        help='the output format')
+    parser.add_argument('--view', action='store_true',
+                        default=False,
+                        help='view the graph')
+    parser.add_argument('--keep', action='store_true',
+                        default=False,
+                        help='keep the graphviz script file')
+    parser.add_argument('--config', action='store',
+                        type=str,
+                        default='fiograph.conf',
+                        help='the configuration filename')
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    global config_file
+    args = setup_commandline()
+    if args.output is None:
+        output_file = args.file
+        output_file = output_file.replace('.fio', '')
+    else:
+        output_file = args.output
+    config_file = configparser.RawConfigParser(allow_no_value=True)
+    config_file.read(args.config)
+    fio_to_graphviz(args.file, args.format).render(output_file, view=args.view)
+    if not args.keep:
+        os.remove(output_file)
+
+
+main()
index a418c05413426905e4772cb62855dad83e273197..0e1e4639348547b86319c67a05349e161f17b7e0 100644 (file)
--- a/verify.c
+++ b/verify.c
@@ -1411,7 +1411,6 @@ static void *verify_async_thread(void *data)
                        ret = pthread_cond_wait(&td->verify_cond,
                                                        &td->io_u_lock);
                        if (ret) {
-                               pthread_mutex_unlock(&td->io_u_lock);
                                break;
                        }
                }
diff --git a/zbd.c b/zbd.c
index 8e99eb95dc6261101c48e6ea84ab5edf1601c316..c18998c46f5428c106f0f910e3901e279ca63dd0 100644 (file)
--- a/zbd.c
+++ b/zbd.c
@@ -83,12 +83,12 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f,
                ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
        if (ret < 0) {
                td_verror(td, errno, "report zones failed");
-               log_err("%s: report zones from sector %llu failed (%d).\n",
-                       f->file_name, (unsigned long long)offset >> 9, errno);
+               log_err("%s: report zones from sector %"PRIu64" failed (%d).\n",
+                       f->file_name, offset >> 9, errno);
        } else if (ret == 0) {
                td_verror(td, errno, "Empty zone report");
-               log_err("%s: report zones from sector %llu is empty.\n",
-                       f->file_name, (unsigned long long)offset >> 9);
+               log_err("%s: report zones from sector %"PRIu64" is empty.\n",
+                       f->file_name, offset >> 9);
                ret = -EIO;
        }
 
@@ -116,9 +116,8 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
                ret = blkzoned_reset_wp(td, f, offset, length);
        if (ret < 0) {
                td_verror(td, errno, "resetting wp failed");
-               log_err("%s: resetting wp for %llu sectors at sector %llu failed (%d).\n",
-                       f->file_name, (unsigned long long)length >> 9,
-                       (unsigned long long)offset >> 9, errno);
+               log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
+                       f->file_name, length >> 9, offset >> 9, errno);
        }
 
        return ret;
@@ -318,16 +317,16 @@ static bool zbd_verify_sizes(void)
                                        return false;
                                }
                        } else if (td->o.zone_size != f->zbd_info->zone_size) {
-                               log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n",
-                                       f->file_name, (unsigned long long) td->o.zone_size,
-                                       (unsigned long long) f->zbd_info->zone_size);
+                               log_err("%s: job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
+                                       f->file_name, td->o.zone_size,
+                                       f->zbd_info->zone_size);
                                return false;
                        }
 
                        if (td->o.zone_skip % td->o.zone_size) {
                                log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
-                                       f->file_name, (unsigned long long) td->o.zone_skip,
-                                       (unsigned long long) td->o.zone_size);
+                                       f->file_name, td->o.zone_skip,
+                                       td->o.zone_size);
                                return false;
                        }
 
@@ -341,9 +340,9 @@ static bool zbd_verify_sizes(void)
                                                 f->file_name);
                                        return false;
                                }
-                               log_info("%s: rounded up offset from %llu to %llu\n",
-                                        f->file_name, (unsigned long long) f->file_offset,
-                                        (unsigned long long) new_offset);
+                               log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
+                                        f->file_name, f->file_offset,
+                                        new_offset);
                                f->io_size -= (new_offset - f->file_offset);
                                f->file_offset = new_offset;
                        }
@@ -357,9 +356,9 @@ static bool zbd_verify_sizes(void)
                                                 f->file_name);
                                        return false;
                                }
-                               log_info("%s: rounded down io_size from %llu to %llu\n",
-                                        f->file_name, (unsigned long long) f->io_size,
-                                        (unsigned long long) new_end - f->file_offset);
+                               log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
+                                        f->file_name, f->io_size,
+                                        new_end - f->file_offset);
                                f->io_size = new_end - f->file_offset;
                        }
                }
@@ -375,18 +374,30 @@ static bool zbd_verify_bs(void)
        int i, j, k;
 
        for_each_td(td, i) {
+               if (td_trim(td) &&
+                   (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] ||
+                    td->o.bssplit_nr[DDIR_TRIM])) {
+                       log_info("bsrange and bssplit are not allowed for trim with zonemode=zbd\n");
+                       return false;
+               }
                for_each_file(td, f, j) {
                        uint64_t zone_size;
 
                        if (!f->zbd_info)
                                continue;
                        zone_size = f->zbd_info->zone_size;
+                       if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
+                               log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
+                                        f->file_name, td->o.bs[DDIR_TRIM],
+                                        zone_size);
+                               return false;
+                       }
                        for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) {
                                if (td->o.verify != VERIFY_NONE &&
                                    zone_size % td->o.bs[k] != 0) {
-                                       log_info("%s: block size %llu is not a divisor of the zone size %llu\n",
+                                       log_info("%s: block size %llu is not a divisor of the zone size %"PRIu64"\n",
                                                 f->file_name, td->o.bs[k],
-                                                (unsigned long long)zone_size);
+                                                zone_size);
                                        return false;
                                }
                        }
@@ -436,8 +447,7 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 
        if (zone_capacity > zone_size) {
                log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
-                       f->file_name, (unsigned long long) td->o.zone_capacity,
-                       (unsigned long long) td->o.zone_size);
+                       f->file_name, td->o.zone_capacity, td->o.zone_size);
                return 1;
        }
 
@@ -513,15 +523,14 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
        if (td->o.zone_size == 0) {
                td->o.zone_size = zone_size;
        } else if (td->o.zone_size != zone_size) {
-               log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n",
-                       f->file_name, (unsigned long long) td->o.zone_size,
-                       (unsigned long long) zone_size);
+               log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
+                       f->file_name, td->o.zone_size, zone_size);
                ret = -EINVAL;
                goto out;
        }
 
-       dprint(FD_ZBD, "Device %s has %d zones of size %llu KB\n", f->file_name,
-              nr_zones, (unsigned long long) zone_size / 1024);
+       dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", f->file_name,
+              nr_zones, zone_size / 1024);
 
        zbd_info = scalloc(1, sizeof(*zbd_info) +
                           (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
@@ -575,9 +584,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
                                           ZBD_REPORT_MAX_ZONES));
                if (nrz < 0) {
                        ret = nrz;
-                       log_info("fio: report zones (offset %llu) failed for %s (%d).\n",
-                                (unsigned long long)offset,
-                                f->file_name, -ret);
+                       log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
+                                offset, f->file_name, -ret);
                        goto out;
                }
        }
@@ -636,8 +644,12 @@ static int zbd_set_max_open_zones(struct thread_data *td, struct fio_file *f)
 
 out:
        /* Ensure that the limit is not larger than FIO's internal limit */
-       zbd->max_open_zones = min_not_zero(zbd->max_open_zones,
-                                          (uint32_t) ZBD_MAX_OPEN_ZONES);
+       if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
+               td_verror(td, EINVAL, "'max_open_zones' value is too large");
+               log_err("'max_open_zones' value is larger than %u\n", ZBD_MAX_OPEN_ZONES);
+               return -EINVAL;
+       }
+
        dprint(FD_ZBD, "%s: using max open zones limit: %"PRIu32"\n",
               f->file_name, zbd->max_open_zones);
 
@@ -827,11 +839,25 @@ int zbd_setup_files(struct thread_data *td)
                        log_err("Different 'max_open_zones' values\n");
                        return 1;
                }
-               if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
-                       log_err("'max_open_zones' value is limited by %u\n", ZBD_MAX_OPEN_ZONES);
+
+               /*
+                * The per job max open zones limit cannot be used without a
+                * global max open zones limit. (As the tracking of open zones
+                * is disabled when there is no global max open zones limit.)
+                */
+               if (td->o.job_max_open_zones && !zbd->max_open_zones) {
+                       log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
                        return 1;
                }
 
+               /*
+                * zbd->max_open_zones is the global limit shared for all jobs
+                * that target the same zoned block device. Force sync the per
+                * thread global limit with the actual global limit. (The real
+                * per thread/job limit is stored in td->o.job_max_open_zones).
+                */
+               td->o.max_open_zones = zbd->max_open_zones;
+
                for (zi = f->min_zone; zi < f->max_zone; zi++) {
                        z = &zbd->zone_info[zi];
                        if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
@@ -942,7 +968,7 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
                           struct fio_zone_info *const ze)
 {
        struct fio_zone_info *z;
-       const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
        int res = 0;
 
        assert(min_bs);
@@ -1093,6 +1119,8 @@ static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
        struct zoned_block_device_info *zbdi = f->zbd_info;
        int i;
 
+       /* This function should never be called when zbdi->max_open_zones == 0 */
+       assert(zbdi->max_open_zones);
        assert(td->o.job_max_open_zones == 0 || td->num_open_zones <= td->o.job_max_open_zones);
        assert(td->o.job_max_open_zones <= zbdi->max_open_zones);
        assert(zbdi->num_open_zones <= zbdi->max_open_zones);
@@ -1113,7 +1141,8 @@ static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
 static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
                          uint32_t zone_idx)
 {
-       const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+       struct zoned_block_device_info *zbdi = f->zbd_info;
        struct fio_zone_info *z = get_zone(f, zone_idx);
        bool res = true;
 
@@ -1127,7 +1156,15 @@ static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
        if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
                return false;
 
-       pthread_mutex_lock(&f->zbd_info->mutex);
+       /*
+        * zbdi->max_open_zones == 0 means that there is no limit on the maximum
+        * number of open zones. In this case, do not track open zones in
+        * zbdi->open_zones array.
+        */
+       if (!zbdi->max_open_zones)
+               return true;
+
+       pthread_mutex_lock(&zbdi->mutex);
        if (is_zone_open(td, f, zone_idx)) {
                /*
                 * If the zone is already open and going to be full by writes
@@ -1142,24 +1179,38 @@ static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
        if (td->o.job_max_open_zones > 0 &&
            td->num_open_zones >= td->o.job_max_open_zones)
                goto out;
-       if (f->zbd_info->num_open_zones >= f->zbd_info->max_open_zones)
+       if (zbdi->num_open_zones >= zbdi->max_open_zones)
                goto out;
        dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
-       f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx;
+       zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
        td->num_open_zones++;
        z->open = 1;
        res = true;
 
 out:
-       pthread_mutex_unlock(&f->zbd_info->mutex);
+       pthread_mutex_unlock(&zbdi->mutex);
        return res;
 }
 
-/* Anything goes as long as it is not a constant. */
+/* Return random zone index for one of the open zones. */
 static uint32_t pick_random_zone_idx(const struct fio_file *f,
                                     const struct io_u *io_u)
 {
-       return io_u->offset * f->zbd_info->num_open_zones / f->real_file_size;
+       return (io_u->offset - f->file_offset) * f->zbd_info->num_open_zones /
+               f->io_size;
+}
+
+/*
+ * Report whether any thread visited by for_each_td() has a non-zero
+ * io_u_in_flight. Returns true at the first such thread and false when
+ * none is found.
+ */
+static bool any_io_in_flight(void)
+{
+       struct thread_data *td;
+       int i;
+
+       for_each_td(td, i) {
+               if (td->io_u_in_flight)
+                       return true;
+       }
+
+       return false;
 }
 
 /*
@@ -1173,22 +1224,25 @@ static uint32_t pick_random_zone_idx(const struct fio_file *f,
 static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                                                      struct io_u *io_u)
 {
-       const uint32_t min_bs = td->o.min_bs[io_u->ddir];
+       const uint64_t min_bs = td->o.min_bs[io_u->ddir];
        struct fio_file *f = io_u->file;
+       struct zoned_block_device_info *zbdi = f->zbd_info;
        struct fio_zone_info *z;
        unsigned int open_zone_idx = -1;
        uint32_t zone_idx, new_zone_idx;
        int i;
        bool wait_zone_close;
+       bool in_flight;
+       bool should_retry = true;
 
        assert(is_valid_offset(f, io_u->offset));
 
-       if (td->o.max_open_zones || td->o.job_max_open_zones) {
+       if (zbdi->max_open_zones || td->o.job_max_open_zones) {
                /*
-                * This statement accesses f->zbd_info->open_zones[] on purpose
+                * This statement accesses zbdi->open_zones[] on purpose
                 * without locking.
                 */
-               zone_idx = f->zbd_info->open_zones[pick_random_zone_idx(f, io_u)];
+               zone_idx = zbdi->open_zones[pick_random_zone_idx(f, io_u)];
        } else {
                zone_idx = zbd_zone_idx(f, io_u->offset);
        }
@@ -1200,9 +1254,9 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
               __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
 
        /*
-        * Since z->mutex is the outer lock and f->zbd_info->mutex the inner
+        * Since z->mutex is the outer lock and zbdi->mutex the inner
         * lock it can happen that the state of the zone with index zone_idx
-        * has changed after 'z' has been assigned and before f->zbd_info->mutex
+        * has changed after 'z' has been assigned and before zbdi->mutex
         * has been obtained. Hence the loop.
         */
        for (;;) {
@@ -1211,12 +1265,12 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                z = get_zone(f, zone_idx);
                if (z->has_wp)
                        zone_lock(td, f, z);
-               pthread_mutex_lock(&f->zbd_info->mutex);
+               pthread_mutex_lock(&zbdi->mutex);
                if (z->has_wp) {
                        if (z->cond != ZBD_ZONE_COND_OFFLINE &&
-                           td->o.max_open_zones == 0 && td->o.job_max_open_zones == 0)
+                           zbdi->max_open_zones == 0 && td->o.job_max_open_zones == 0)
                                goto examine_zone;
-                       if (f->zbd_info->num_open_zones == 0) {
+                       if (zbdi->num_open_zones == 0) {
                                dprint(FD_ZBD, "%s(%s): no zones are open\n",
                                       __func__, f->file_name);
                                goto open_other_zone;
@@ -1230,14 +1284,14 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                 */
                open_zone_idx = pick_random_zone_idx(f, io_u);
                assert(!open_zone_idx ||
-                      open_zone_idx < f->zbd_info->num_open_zones);
+                      open_zone_idx < zbdi->num_open_zones);
                tmp_idx = open_zone_idx;
-               for (i = 0; i < f->zbd_info->num_open_zones; i++) {
+               for (i = 0; i < zbdi->num_open_zones; i++) {
                        uint32_t tmpz;
 
-                       if (tmp_idx >= f->zbd_info->num_open_zones)
+                       if (tmp_idx >= zbdi->num_open_zones)
                                tmp_idx = 0;
-                       tmpz = f->zbd_info->open_zones[tmp_idx];
+                       tmpz = zbdi->open_zones[tmp_idx];
                        if (f->min_zone <= tmpz && tmpz < f->max_zone) {
                                open_zone_idx = tmp_idx;
                                goto found_candidate_zone;
@@ -1248,39 +1302,39 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 
                dprint(FD_ZBD, "%s(%s): no candidate zone\n",
                        __func__, f->file_name);
-               pthread_mutex_unlock(&f->zbd_info->mutex);
+               pthread_mutex_unlock(&zbdi->mutex);
                if (z->has_wp)
                        zone_unlock(z);
                return NULL;
 
 found_candidate_zone:
-               new_zone_idx = f->zbd_info->open_zones[open_zone_idx];
+               new_zone_idx = zbdi->open_zones[open_zone_idx];
                if (new_zone_idx == zone_idx)
                        break;
                zone_idx = new_zone_idx;
-               pthread_mutex_unlock(&f->zbd_info->mutex);
+               pthread_mutex_unlock(&zbdi->mutex);
                if (z->has_wp)
                        zone_unlock(z);
        }
 
-       /* Both z->mutex and f->zbd_info->mutex are held. */
+       /* Both z->mutex and zbdi->mutex are held. */
 
 examine_zone:
        if (z->wp + min_bs <= zbd_zone_capacity_end(z)) {
-               pthread_mutex_unlock(&f->zbd_info->mutex);
+               pthread_mutex_unlock(&zbdi->mutex);
                goto out;
        }
 
 open_other_zone:
        /* Check if number of open zones reaches one of limits. */
        wait_zone_close =
-               f->zbd_info->num_open_zones == f->max_zone - f->min_zone ||
-               (td->o.max_open_zones &&
-                f->zbd_info->num_open_zones == td->o.max_open_zones) ||
+               zbdi->num_open_zones == f->max_zone - f->min_zone ||
+               (zbdi->max_open_zones &&
+                zbdi->num_open_zones == zbdi->max_open_zones) ||
                (td->o.job_max_open_zones &&
                 td->num_open_zones == td->o.job_max_open_zones);
 
-       pthread_mutex_unlock(&f->zbd_info->mutex);
+       pthread_mutex_unlock(&zbdi->mutex);
 
        /* Only z->mutex is held. */
 
@@ -1294,8 +1348,9 @@ open_other_zone:
                io_u_quiesce(td);
        }
 
+retry:
        /* Zone 'z' is full, so try to open a new zone. */
-       for (i = f->io_size / f->zbd_info->zone_size; i > 0; i--) {
+       for (i = f->io_size / zbdi->zone_size; i > 0; i--) {
                zone_idx++;
                if (z->has_wp)
                        zone_unlock(z);
@@ -1318,12 +1373,12 @@ open_other_zone:
        /* Only z->mutex is held. */
 
        /* Check whether the write fits in any of the already opened zones. */
-       pthread_mutex_lock(&f->zbd_info->mutex);
-       for (i = 0; i < f->zbd_info->num_open_zones; i++) {
-               zone_idx = f->zbd_info->open_zones[i];
+       pthread_mutex_lock(&zbdi->mutex);
+       for (i = 0; i < zbdi->num_open_zones; i++) {
+               zone_idx = zbdi->open_zones[i];
                if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
                        continue;
-               pthread_mutex_unlock(&f->zbd_info->mutex);
+               pthread_mutex_unlock(&zbdi->mutex);
                zone_unlock(z);
 
                z = get_zone(f, zone_idx);
@@ -1331,9 +1386,27 @@ open_other_zone:
                zone_lock(td, f, z);
                if (z->wp + min_bs <= zbd_zone_capacity_end(z))
                        goto out;
-               pthread_mutex_lock(&f->zbd_info->mutex);
+               pthread_mutex_lock(&zbdi->mutex);
        }
-       pthread_mutex_unlock(&f->zbd_info->mutex);
+
+       /*
+        * If any I/O is in flight, or if the in-flight I/Os have just completed,
+        * those I/Os might have closed zones, so retry the steps to open a zone.
+        * Before retrying, call io_u_quiesce() to complete in-flight writes.
+        */
+       in_flight = any_io_in_flight();
+       if (in_flight || should_retry) {
+               dprint(FD_ZBD, "%s(%s): wait zone close and retry open zones\n",
+                      __func__, f->file_name);
+               pthread_mutex_unlock(&zbdi->mutex);
+               zone_unlock(z);
+               io_u_quiesce(td);
+               zone_lock(td, f, z);
+               should_retry = in_flight;
+               goto retry;
+       }
+
+       pthread_mutex_unlock(&zbdi->mutex);
        zone_unlock(z);
        dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__,
               f->file_name);
@@ -1354,7 +1427,7 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
                                                    struct fio_zone_info *z)
 {
        const struct fio_file *f = io_u->file;
-       const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
 
        if (!zbd_open_zone(td, f, zbd_zone_nr(f, z))) {
                zone_unlock(z);
@@ -1363,8 +1436,8 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
        }
 
        if (z->verify_block * min_bs >= z->capacity) {
-               log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block,
-                       min_bs, (unsigned long long)z->capacity);
+               log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n", f->file_name, z->verify_block,
+                       min_bs, z->capacity);
                /*
                 * If the assertion below fails during a test run, adding
                 * "--experimental_verify=1" to the command line may help.
@@ -1373,8 +1446,8 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
        }
        io_u->offset = z->start + z->verify_block * min_bs;
        if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
-               log_err("%s: %llu + %llu >= %llu\n", f->file_name, io_u->offset,
-                       io_u->buflen, (unsigned long long) zbd_zone_capacity_end(z));
+               log_err("%s: %llu + %llu >= %"PRIu64"\n", f->file_name, io_u->offset,
+                       io_u->buflen, zbd_zone_capacity_end(z));
                assert(false);
        }
        z->verify_block += io_u->buflen / min_bs;
@@ -1383,18 +1456,16 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
 }
 
 /*
- * Find another zone for which @io_u fits in the readable data in the zone.
- * Search in zones @zb + 1 .. @zl. For random workload, also search in zones
- * @zb - 1 .. @zf.
+ * Find another zone which has @min_bytes of readable data. Search in zones
+ * @zb + 1 .. @zl. For random workload, also search in zones @zb - 1 .. @zf.
  *
  * Either returns NULL or returns a zone pointer. When the zone has write
  * pointer, hold the mutex for the zone.
  */
 static struct fio_zone_info *
-zbd_find_zone(struct thread_data *td, struct io_u *io_u,
+zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
              struct fio_zone_info *zb, struct fio_zone_info *zl)
 {
-       const uint32_t min_bs = td->o.min_bs[io_u->ddir];
        struct fio_file *f = io_u->file;
        struct fio_zone_info *z1, *z2;
        const struct fio_zone_info *const zf = get_zone(f, f->min_zone);
@@ -1407,7 +1478,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u,
                if (z1 < zl && z1->cond != ZBD_ZONE_COND_OFFLINE) {
                        if (z1->has_wp)
                                zone_lock(td, f, z1);
-                       if (z1->start + min_bs <= z1->wp)
+                       if (z1->start + min_bytes <= z1->wp)
                                return z1;
                        if (z1->has_wp)
                                zone_unlock(z1);
@@ -1418,14 +1489,14 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u,
                    z2->cond != ZBD_ZONE_COND_OFFLINE) {
                        if (z2->has_wp)
                                zone_lock(td, f, z2);
-                       if (z2->start + min_bs <= z2->wp)
+                       if (z2->start + min_bytes <= z2->wp)
                                return z2;
                        if (z2->has_wp)
                                zone_unlock(z2);
                }
        }
-       dprint(FD_ZBD, "%s: adjusting random read offset failed\n",
-              f->file_name);
+       dprint(FD_ZBD, "%s: no zone has %"PRIu64" bytes of readable data\n",
+              f->file_name, min_bytes);
        return NULL;
 }
 
@@ -1500,9 +1571,6 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
                pthread_mutex_unlock(&zbd_info->mutex);
                z->wp = zone_end;
                break;
-       case DDIR_TRIM:
-               assert(z->wp == z->start);
-               break;
        default:
                break;
        }
@@ -1600,10 +1668,9 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
            f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
                dprint(FD_ZBD,
                       "%s: Jump from zone capacity limit to zone end:"
-                      " (%llu -> %llu) for zone %u (%llu)\n",
-                      f->file_name, (unsigned long long) f->last_pos[ddir],
-                      (unsigned long long) zbd_zone_end(z), zone_idx,
-                      (unsigned long long) z->capacity);
+                      " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
+                      f->file_name, f->last_pos[ddir],
+                      zbd_zone_end(z), zone_idx, z->capacity);
                td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
                f->last_pos[ddir] = zbd_zone_end(z);
        }
@@ -1683,14 +1750,15 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
 enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 {
        struct fio_file *f = io_u->file;
+       struct zoned_block_device_info *zbdi = f->zbd_info;
        uint32_t zone_idx_b;
        struct fio_zone_info *zb, *zl, *orig_zb;
        uint32_t orig_len = io_u->buflen;
-       uint32_t min_bs = td->o.min_bs[io_u->ddir];
+       uint64_t min_bs = td->o.min_bs[io_u->ddir];
        uint64_t new_len;
        int64_t range;
 
-       assert(f->zbd_info);
+       assert(zbdi);
        assert(min_bs);
        assert(is_valid_offset(f, io_u->offset));
        assert(io_u->buflen);
@@ -1712,9 +1780,9 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 
                if (io_u->offset + min_bs > (zb + 1)->start) {
                        dprint(FD_IO,
-                              "%s: off=%llu + min_bs=%u > next zone %llu\n",
+                              "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
                               f->file_name, io_u->offset,
-                              min_bs, (unsigned long long) (zb + 1)->start);
+                              min_bs, (zb + 1)->start);
                        io_u->offset = zb->start + (zb + 1)->start - io_u->offset;
                        new_len = min(io_u->buflen, (zb + 1)->start - io_u->offset);
                } else {
@@ -1753,7 +1821,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                    ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
                        zone_unlock(zb);
                        zl = get_zone(f, f->max_zone);
-                       zb = zbd_find_zone(td, io_u, zb, zl);
+                       zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
                        if (!zb) {
                                dprint(FD_ZBD,
                                       "%s: zbd_find_zone(%lld, %llu) failed\n",
@@ -1802,12 +1870,11 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                assert(io_u->offset + io_u->buflen <= zb->wp);
                goto accept;
        case DDIR_WRITE:
-               if (io_u->buflen > f->zbd_info->zone_size) {
+               if (io_u->buflen > zbdi->zone_size) {
                        td_verror(td, EINVAL, "I/O buflen exceeds zone size");
                        dprint(FD_IO,
-                              "%s: I/O buflen %llu exceeds zone size %llu\n",
-                              f->file_name, io_u->buflen,
-                              (unsigned long long) f->zbd_info->zone_size);
+                              "%s: I/O buflen %llu exceeds zone size %"PRIu64"\n",
+                              f->file_name, io_u->buflen, zbdi->zone_size);
                        goto eof;
                }
                if (!zbd_open_zone(td, f, zone_idx_b)) {
@@ -1818,11 +1885,10 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                       f->file_name);
                                goto eof;
                        }
-                       zone_idx_b = zbd_zone_nr(f, zb);
                }
                /* Check whether the zone reset threshold has been exceeded */
                if (td->o.zrf.u.f) {
-                       if (f->zbd_info->wp_sectors_with_data >=
+                       if (zbdi->wp_sectors_with_data >=
                            f->io_size * td->o.zrt.u.f &&
                            zbd_dec_and_reset_write_cnt(td, f)) {
                                zb->reset_zone = 1;
@@ -1845,9 +1911,8 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 
                        if (zb->capacity < min_bs) {
                                td_verror(td, EINVAL, "ZCAP is less min_bs");
-                               log_err("zone capacity %llu smaller than minimum block size %d\n",
-                                       (unsigned long long)zb->capacity,
-                                       min_bs);
+                               log_err("zone capacity %"PRIu64" smaller than minimum block size %"PRIu64"\n",
+                                       zb->capacity, min_bs);
                                goto eof;
                        }
                }
@@ -1877,12 +1942,27 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                        goto accept;
                }
                td_verror(td, EIO, "zone remainder too small");
-               log_err("zone remainder %lld smaller than min block size %d\n",
+               log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
                        (zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
                goto eof;
        case DDIR_TRIM:
-               /* fall-through */
+               /* Check random trim targets a non-empty zone */
+               if (!td_random(td) || zb->wp > zb->start)
+                       goto accept;
+
+               /* Find out a non-empty zone to trim */
+               zone_unlock(zb);
+               zl = get_zone(f, f->max_zone);
+               zb = zbd_find_zone(td, io_u, 1, zb, zl);
+               if (zb) {
+                       io_u->offset = zb->start;
+                       dprint(FD_ZBD, "%s: found new zone(%lld) for trim\n",
+                              f->file_name, io_u->offset);
+                       goto accept;
+               }
+               goto eof;
        case DDIR_SYNC:
+               /* fall-through */
        case DDIR_DATASYNC:
        case DDIR_SYNC_FILE_RANGE:
        case DDIR_WAIT:
@@ -1919,7 +1999,42 @@ char *zbd_write_status(const struct thread_stat *ts)
 {
        char *res;
 
-       if (asprintf(&res, "; %llu zone resets", (unsigned long long) ts->nr_zone_resets) < 0)
+       if (asprintf(&res, "; %"PRIu64" zone resets", ts->nr_zone_resets) < 0)
                return NULL;
        return res;
 }
+
+/**
+ * zbd_do_io_u_trim - If reset zone is applicable, do reset zone instead of trim
+ * @td: FIO thread data; cast to non-const for the zbd_reset_zone() call.
+ * @io_u: FIO I/O unit; its file and offset select the zone to act on.
+ *
+ * It is assumed that z->mutex is already locked.
+ * Return: io_u_completed when the zone reset succeeds; 0 when the target zone
+ * has no write pointer (no reset is attempted); -EINVAL when the offset is not
+ * the zone start; otherwise the negative value returned by zbd_reset_zone().
+ */
+int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
+{
+       struct fio_file *f = io_u->file;
+       struct fio_zone_info *z;
+       uint32_t zone_idx;
+       int ret;
+
+       zone_idx = zbd_zone_idx(f, io_u->offset);
+       z = get_zone(f, zone_idx);
+       /* A zone without a write pointer is not reset; report 0 to the caller */
+       if (!z->has_wp)
+               return 0;
+       /* The reset is only performed for a trim that begins at the zone start */
+       if (io_u->offset != z->start) {
+               log_err("Trim offset not at zone start (%lld)\n", io_u->offset);
+               return -EINVAL;
+       }
+       /* Only a negative result from zbd_reset_zone() is treated as a failure */
+       ret = zbd_reset_zone((struct thread_data *)td, f, z);
+       if (ret < 0)
+               return ret;
+
+       return io_u_completed;
+}
diff --git a/zbd.h b/zbd.h
index 6453439313f8de4d5c049c371237762f05d8b7d6..0a73b41dd9ec5ee1223b9ceb7f41fd211dac8cb0 100644 (file)
--- a/zbd.h
+++ b/zbd.h
@@ -17,6 +17,7 @@ struct fio_file;
 enum io_u_action {
        io_u_accept     = 0,
        io_u_eof        = 1,
+       io_u_completed  = 2,    /* zbd_do_io_u_trim(): zone reset succeeded */
 };
 
 /**
@@ -50,7 +51,8 @@ struct fio_zone_info {
  * zoned_block_device_info - zoned block device characteristics
  * @model: Device model.
  * @max_open_zones: global limit on the number of simultaneously opened
- *     sequential write zones.
+ *     sequential write zones. A zero value means unlimited open zones,
+ *     and that open zones will not be tracked in the open_zones array.
  * @mutex: Protects the modifiable members in this structure (refcount and
  *             num_open_zones).
  * @zone_size: size of a single zone in bytes.
@@ -98,6 +100,7 @@ enum fio_ddir zbd_adjust_ddir(struct thread_data *td, struct io_u *io_u,
                              enum fio_ddir ddir);
 enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u);
 char *zbd_write_status(const struct thread_stat *ts);
+int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u);
 
 static inline void zbd_close_file(struct fio_file *f)
 {