Merge branch 'fio_pr_alternate_epoch' of https://github.com/PCPartPicker/fio

author Jens Axboe <axboe@kernel.dk>

Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)

committer Jens Axboe <axboe@kernel.dk>

Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)
author Jens Axboe <axboe@kernel.dk>
Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)
committer Jens Axboe <axboe@kernel.dk>
Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml

index a766cfa8e2e055022d83964c6d395337347761bd..cd8ce1429ceca8517f6a9bfd4bc7f50f942d5af5 100644 (file)
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -23,7 +23,7 @@ jobs:
            os: ubuntu-20.04
            cc: clang
          - build: macos
-          os: macos-10.15
+          os: macos-11
          - build: linux-i686-gcc
            os: ubuntu-20.04
            arch: i686
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml

new file mode 100644 (file)

index 0000000..acc8d48
--- /dev/null
+++ b/.github/workflows/cifuzz.yml
@@ -0,0 +1,24 @@
+name: CIFuzz
+on: [pull_request]
+jobs:
+  Fuzzing:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Build Fuzzers
+      id: build
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'fio'
+        dry-run: false
+    - name: Run Fuzzers
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      with:
+        oss-fuzz-project-name: 'fio'
+        fuzz-seconds: 600
+        dry-run: false
+    - name: Upload Crash
+      uses: actions/upload-artifact@v1
+      if: failure() && steps.build.outcome == 'success'
+      with:
+        name: artifacts
+        path: ./out/artifacts
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN

index e9d563c124a419dda8d013211fcd7154ab249d45..60f7bb21c09b8a613cf3dfd14ae9acd686308436 100755 (executable)
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
  #!/bin/sh
  
  GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.28
+DEF_VER=fio-3.29
  
  LF='
  '
diff --git a/HOWTO b/HOWTO

index 99fb575126483aa050c1d86abbc4f41c56ede13e..74ba7216e132c358309ffd0961c2cb73ea7494be 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1063,6 +1063,12 @@ Target file/device
         Limit on the number of simultaneously opened zones per single
         thread/process.
  
+.. option:: ignore_zone_limits=bool
+       If this option is used, fio will ignore the maximum number of open
+       zones limit of the zoned block device in use, thus allowing the
+       option :option:`max_open_zones` value to be larger than the device
+       reported limit. Default: false.
+
  .. option:: zone_reset_threshold=float
  
         A number between zero and one that indicates the ratio of logical
@@ -1338,7 +1344,7 @@ I/O type
  .. option:: fdatasync=int
  
         Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
-       not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
+       not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
         :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
         Defaults to 0, which means fio does not periodically issue and wait for a
         data-only sync to complete.
@@ -1880,11 +1886,12 @@ I/O size
  
  .. option:: filesize=irange(int)
  
-       Individual file sizes. May be a range, in which case fio will select sizes
-       for files at random within the given range and limited to :option:`size` in
-       total (if that is given). If not given, each created file is the same size.
-       This option overrides :option:`size` in terms of file size, which means
-       this value is used as a fixed size or possible range of each file.
+       Individual file sizes. May be a range, in which case fio will select sizes for
+       files at random within the given range. If not given, each created file is the
+       same size. This option overrides :option:`size` in terms of file size, i.e. if
+       :option:`filesize` is specified then :option:`size` becomes merely the default
+       for :option:`io_size` and has no effect at all if :option:`io_size` is set
+       explicitly.
  
  .. option:: file_append=bool
  
@@ -2205,10 +2212,28 @@ with the caveat that when used on the command line, they must come after the
         depending on the block size of the IO. This option is useful only
         when used together with the :option:`bssplit` option, that is,
         multiple different block sizes are used for reads and writes.
-       The format for this option is the same as the format of the
-       :option:`bssplit` option, with the exception that values for
-       trim IOs are ignored. This option is mutually exclusive with the
-       :option:`cmdprio_percentage` option.
+
+       The first accepted format for this option is the same as the format of
+       the :option:`bssplit` option:
+
+               cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+
+       In this case, each entry will use the priority class and priority
+       level defined by the options :option:`cmdprio_class` and
+       :option:`cmdprio` respectively.
+
+       The second accepted format for this option is:
+
+               cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+
+       In this case, the priority class and priority level are defined inside
+       each entry. In comparison with the first accepted format, the second
+       accepted format does not restrict all entries to have the same priority
+       class and priority level.
+
+       For both formats, only the read and write data directions are supported,
+       values for trim IOs are ignored. This option is mutually exclusive with
+       the :option:`cmdprio_percentage` option.
  
  .. option:: fixedbufs : [io_uring]
  
@@ -2490,11 +2515,13 @@ with the caveat that when used on the command line, they must come after the
  
         **write**
                 This is the default where write opcodes are issued as usual.
-       **verify**
+       **write_and_verify**
                 Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
                 directs the device to carry out a medium verification with no data
                 comparison. The writefua option is ignored with this selection.
-       **same**
+       **verify**
+               This option is deprecated. Use write_and_verify instead.
+       **write_same**
                 Issue WRITE SAME commands. This transfers a single block to the device
                 and writes this same block of data to a contiguous sequence of LBAs
                 beginning at the specified offset. fio's block size parameter specifies
@@ -2505,6 +2532,36 @@ with the caveat that when used on the command line, they must come after the
                 for each command but only the first 512 bytes will be used and
                 transferred to the device. The writefua option is ignored with this
                 selection.
+       **same**
+               This option is deprecated. Use write_same instead.
+       **write_same_ndob**
+               Issue WRITE SAME(16) commands as above but with the No Data Output
+               Buffer (NDOB) bit set. No data will be transferred to the device with
+               this bit set. Data written will be a pre-determined pattern such as
+               all zeroes.
+       **write_stream**
+               Issue WRITE STREAM(16) commands. Use the **stream_id** option to specify
+               the stream identifier.
+       **verify_bytchk_00**
+               Issue VERIFY commands with BYTCHK set to 00. This directs the
+               device to carry out a medium verification with no data comparison.
+       **verify_bytchk_01**
+               Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+               compare the data on the device with the data transferred to the device.
+       **verify_bytchk_11**
+               Issue VERIFY commands with BYTCHK set to 11. This transfers a
+               single block to the device and compares the contents of this block with the
+               data on the device beginning at the specified offset. fio's block size
+               parameter specifies the total amount of data compared with this command.
+               However, only one block (sector) worth of data is transferred to the device.
+               This is similar to the WRITE SAME command except that data is compared instead
+               of written.
+
+.. option:: stream_id=int : [sg]
+
+       Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+       a valid stream identifier) fio will open a stream and then close it when done. Default
+       is 0.
  
  .. option:: hipri : [sg]
  
diff --git a/Makefile b/Makefile

index 5d17bcab906591ff121d3675d795ecd2be4695e6..2432f5192f8eb63295a382108f6f1be3f29cdbb2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -99,6 +99,7 @@ endif
  ifdef CONFIG_LIBAIO
    libaio_SRCS = engines/libaio.c
    cmdprio_SRCS = engines/cmdprio.c
+  LIBS += -laio
    libaio_LIBS = -laio
    ENGINES += libaio
  endif
@@ -294,7 +295,7 @@ define engine_template =
  $(1)_OBJS := $$($(1)_SRCS:.c=.o)
  $$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS)
  engines/fio-$(1).so: $$($(1)_OBJS)
-       $$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
+       $$(QUIET_LINK)$(CC) $(DYNAMIC) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
  ENGS_OBJS += engines/fio-$(1).so
  endef
  else # !CONFIG_DYNAMIC_ENGINES
@@ -429,7 +430,9 @@ T_TEST_PROGS += $(T_AXMAP_PROGS)
  T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
  T_TEST_PROGS += $(T_GEN_RAND_PROGS)
  T_PROGS += $(T_BTRACE_FIO_PROGS)
+ifdef CONFIG_ZLIB
  T_PROGS += $(T_DEDUPE_PROGS)
+endif
  T_PROGS += $(T_VS_PROGS)
  T_TEST_PROGS += $(T_MEMLOCK_PROGS)
  ifdef CONFIG_PREAD
@@ -617,8 +620,10 @@ t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
         $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
  endif
  
+ifdef CONFIG_ZLIB
  t/fio-dedupe: $(T_DEDUPE_OBJS)
         $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
+endif
  
  t/fio-verify-state: $(T_VS_OBJS)
         $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
diff --git a/README b/README

deleted file mode 100644 (file)

index d566fae..0000000
--- a/README
+++ /dev/null
@@ -1,286 +0,0 @@
-Overview and history
---------------------
-
-Fio was originally written to save me the hassle of writing special test case
-programs when I wanted to test a specific workload, either for performance
-reasons or to find/reproduce a bug. The process of writing such a test app can
-be tiresome, especially if you have to do it often.  Hence I needed a tool that
-would be able to simulate a given I/O workload without resorting to writing a
-tailored test case again and again.
-
-A test work load is difficult to define, though. There can be any number of
-processes or threads involved, and they can each be using their own way of
-generating I/O. You could have someone dirtying large amounts of memory in a
-memory mapped file, or maybe several threads issuing reads using asynchronous
-I/O. fio needed to be flexible enough to simulate both of these cases, and many
-more.
-
-Fio spawns a number of threads or processes doing a particular type of I/O
-action as specified by the user. fio takes a number of global parameters, each
-inherited by the thread unless otherwise parameters given to them overriding
-that setting is given.  The typical use of fio is to write a job file matching
-the I/O load one wants to simulate.
-
-
-Source
-------
-
-Fio resides in a git repo, the canonical place is:
-
-       git://git.kernel.dk/fio.git
-
-When inside a corporate firewall, git:// URL sometimes does not work.
-If git:// does not work, use the http protocol instead:
-
-       http://git.kernel.dk/fio.git
-
-Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git
-meta data as well. Other tarballs are archives of official fio releases.
-Snapshots can download from:
-
-       http://brick.kernel.dk/snaps/
-
-There are also two official mirrors. Both of these are automatically synced with
-the main repository, when changes are pushed. If the main repo is down for some
-reason, either one of these is safe to use as a backup:
-
-       git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
-
-       https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
-
-or
-
-       git://github.com/axboe/fio.git
-
-       https://github.com/axboe/fio.git
-
-
-Mailing list
-------------
-
-The fio project mailing list is meant for anything related to fio including
-general discussion, bug reporting, questions, and development. For bug reporting,
-see REPORTING-BUGS.
-
-An automated mail detailing recent commits is automatically sent to the list at
-most daily. The list address is fio@vger.kernel.org, subscribe by sending an
-email to majordomo@vger.kernel.org with
-
-       subscribe fio
-
-in the body of the email. Archives can be found here:
-
-       http://www.spinics.net/lists/fio/
-
-or here:
-
-       https://lore.kernel.org/fio/
-
-and archives for the old list can be found here:
-
-       http://maillist.kernel.dk/fio-devel/
-
-
-Author
-------
-
-Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
-the Linux I/O subsystem and schedulers. He got tired of writing specific test
-applications to simulate a given workload, and found that the existing I/O
-benchmark/test tools out there weren't flexible enough to do what he wanted.
-
-Jens Axboe <axboe@kernel.dk> 20060905
-
-
-Binary packages
----------------
-
-Debian:
-       Starting with Debian "Squeeze", fio packages are part of the official
-       Debian repository. http://packages.debian.org/search?keywords=fio .
-
-Ubuntu:
-       Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
-       of the Ubuntu "universe" repository.
-       http://packages.ubuntu.com/search?keywords=fio .
-
-Red Hat, Fedora, CentOS & Co:
-       Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
-       packages are part of the Fedora/EPEL repositories.
-       https://apps.fedoraproject.org/packages/fio .
-
-Mandriva:
-       Mandriva has integrated fio into their package repository, so installing
-       on that distro should be as easy as typing ``urpmi fio``.
-
-Arch Linux:
-        An Arch Linux package is provided under the Community sub-repository:
-        https://www.archlinux.org/packages/?sort=&q=fio
-
-Solaris:
-       Packages for Solaris are available from OpenCSW. Install their pkgutil
-       tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
-       ``pkgutil -i fio``.
-
-Windows:
-       Rebecca Cran <rebecca@bsdio.com> has fio packages for Windows at
-       https://bsdio.com/fio/ . The latest builds for Windows can also
-       be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking
-       the latest x86 or x64 build, then selecting the ARTIFACTS tab.
-
-BSDs:
-       Packages for BSDs may be available from their binary package repositories.
-       Look for a package "fio" using their binary package managers.
-
-
-Building
---------
-
-Just type::
-
- $ ./configure
- $ make
- $ make install
-
-Note that GNU make is required. On BSDs it's available from devel/gmake within
-ports directory; on Solaris it's in the SUNWgmake package.  On platforms where
-GNU make isn't the default, type ``gmake`` instead of ``make``.
-
-Configure will print the enabled options. Note that on Linux based platforms,
-the libaio development packages must be installed to use the libaio
-engine. Depending on distro, it is usually called libaio-devel or libaio-dev.
-
-For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
-to be installed.  gfio isn't built automatically and can be enabled with a
-``--enable-gfio`` option to configure.
-
-To build fio with a cross-compiler::
-
- $ make clean
- $ make CROSS_COMPILE=/path/to/toolchain/prefix
-
-Configure will attempt to determine the target platform automatically.
-
-It's possible to build fio for ESX as well, use the ``--esx`` switch to
-configure.
-
-
-Windows
-~~~~~~~
-
-The minimum versions of Windows for building/runing fio are Windows 7/Windows
-Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
-order to build fio. To create an MSI installer package install WiX from
-https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
-directory.
-
-How to compile fio on 64-bit Windows:
-
- 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
-    packages starting with **mingw64-x86_64**. Ensure
-    **mingw64-x86_64-zlib** are installed if you wish
-    to enable fio's log compression functionality.
- 2. Open the Cygwin Terminal.
- 3. Go to the fio directory (source files).
- 4. Run ``make clean && make -j``.
-
-To build fio for 32-bit Windows, ensure the -i686 versions of the previously
-mentioned -x86_64 packages are installed and run ``./configure
---build-32bit-win`` before ``make``.
-
-It's recommended that once built or installed, fio be run in a Command Prompt or
-other 'native' console such as console2, since there are known to be display and
-signal issues when running it under a Cygwin shell (see
-https://github.com/mintty/mintty/issues/56 and
-https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
-for details).
-
-
-Documentation
-~~~~~~~~~~~~~
-
-Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
-To build HTML formatted documentation run ``make -C doc html`` and direct your
-browser to :file:`./doc/output/html/index.html`.  To build manual page run
-``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
-output formats are supported run ``make -C doc help``.
-
-.. _reStructuredText: http://www.sphinx-doc.org/rest.html
-.. _Sphinx: http://www.sphinx-doc.org
-
-
-Platforms
----------
-
-Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
-Windows, FreeBSD, and DragonFly. Some features and/or options may only be
-available on some of the platforms, typically because those features only apply
-to that platform (like the solarisaio engine, or the splice engine on Linux).
-
-Some features are not available on FreeBSD/Solaris even if they could be
-implemented, I'd be happy to take patches for that. An example of that is disk
-utility statistics and (I think) huge page support, support for that does exist
-in FreeBSD/Solaris.
-
-Fio uses pthread mutexes for signalling and locking and some platforms do not
-support process shared pthread mutexes. As a result, on such platforms only
-threads are supported. This could be fixed with sysv ipc locking or other
-locking alternatives.
-
-Other \*BSD platforms are untested, but fio should work there almost out of the
-box. Since I don't do test runs or even compiles on those platforms, your
-mileage may vary. Sending me patches for other platforms is greatly
-appreciated. There's a lot of value in having the same test/benchmark tool
-available on all platforms.
-
-Note that POSIX aio is not enabled by default on AIX. Messages like these::
-
-    Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
-        Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
-
-indicate one needs to enable POSIX aio. Run the following commands as root::
-
-    # lsdev -C -l posix_aio0
-        posix_aio0 Defined  Posix Asynchronous I/O
-    # cfgmgr -l posix_aio0
-    # lsdev -C -l posix_aio0
-        posix_aio0 Available  Posix Asynchronous I/O
-
-POSIX aio should work now. To make the change permanent::
-
-    # chdev -l posix_aio0 -P -a autoconfig='available'
-        posix_aio0 changed
-
-
-Running fio
------------
-
-Running fio is normally the easiest part - you just give it the job file
-(or job files) as parameters::
-
-       $ fio [options] [jobfile] ...
-
-and it will start doing what the *jobfile* tells it to do. You can give more
-than one job file on the command line, fio will serialize the running of those
-files. Internally that is the same as using the :option:`stonewall` parameter
-described in the parameter section.
-
-If the job file contains only one job, you may as well just give the parameters
-on the command line. The command line parameters are identical to the job
-parameters, with a few extra that control global parameters.  For example, for
-the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
-option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
-<iodepth>`. You can also use the command line for giving more than one job
-entry. For each :option:`--name <name>` option that fio sees, it will start a
-new job with that name.  Command line entries following a
-:option:`--name <name>` entry will apply to that job, until there are no more
-entries or a new :option:`--name <name>` entry is seen. This is similar to the
-job file options, where each option applies to the current job until a new []
-job entry is seen.
-
-fio does not need to run as root, except if the files or devices specified in
-the job section requires that. Some other options may also be restricted, such
-as memory locking, I/O scheduler switching, and decreasing the nice value.
-
-If *jobfile* is specified as ``-``, the job file will be read from standard
-input.
diff --git a/README.rst b/README.rst

new file mode 100644 (file)

index 0000000..d566fae
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,286 @@
+Overview and history
+--------------------
+
+Fio was originally written to save me the hassle of writing special test case
+programs when I wanted to test a specific workload, either for performance
+reasons or to find/reproduce a bug. The process of writing such a test app can
+be tiresome, especially if you have to do it often.  Hence I needed a tool that
+would be able to simulate a given I/O workload without resorting to writing a
+tailored test case again and again.
+
+A test work load is difficult to define, though. There can be any number of
+processes or threads involved, and they can each be using their own way of
+generating I/O. You could have someone dirtying large amounts of memory in a
+memory mapped file, or maybe several threads issuing reads using asynchronous
+I/O. fio needed to be flexible enough to simulate both of these cases, and many
+more.
+
+Fio spawns a number of threads or processes doing a particular type of I/O
+action as specified by the user. fio takes a number of global parameters, each
+inherited by each thread unless overridden by parameters given specifically to
+that thread.  The typical use of fio is to write a job file matching
+the I/O load one wants to simulate.
+
+
+Source
+------
+
+Fio resides in a git repo, the canonical place is:
+
+       git://git.kernel.dk/fio.git
+
+When inside a corporate firewall, git:// URL sometimes does not work.
+If git:// does not work, use the http protocol instead:
+
+       http://git.kernel.dk/fio.git
+
+Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git
+meta data as well. Other tarballs are archives of official fio releases.
+Snapshots can be downloaded from:
+
+       http://brick.kernel.dk/snaps/
+
+There are also two official mirrors. Both of these are automatically synced with
+the main repository, when changes are pushed. If the main repo is down for some
+reason, either one of these is safe to use as a backup:
+
+       git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+       https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+or
+
+       git://github.com/axboe/fio.git
+
+       https://github.com/axboe/fio.git
+
+
+Mailing list
+------------
+
+The fio project mailing list is meant for anything related to fio including
+general discussion, bug reporting, questions, and development. For bug reporting,
+see REPORTING-BUGS.
+
+An automated mail detailing recent commits is automatically sent to the list at
+most daily. The list address is fio@vger.kernel.org, subscribe by sending an
+email to majordomo@vger.kernel.org with
+
+       subscribe fio
+
+in the body of the email. Archives can be found here:
+
+       http://www.spinics.net/lists/fio/
+
+or here:
+
+       https://lore.kernel.org/fio/
+
+and archives for the old list can be found here:
+
+       http://maillist.kernel.dk/fio-devel/
+
+
+Author
+------
+
+Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
+the Linux I/O subsystem and schedulers. He got tired of writing specific test
+applications to simulate a given workload, and found that the existing I/O
+benchmark/test tools out there weren't flexible enough to do what he wanted.
+
+Jens Axboe <axboe@kernel.dk> 20060905
+
+
+Binary packages
+---------------
+
+Debian:
+       Starting with Debian "Squeeze", fio packages are part of the official
+       Debian repository. http://packages.debian.org/search?keywords=fio .
+
+Ubuntu:
+       Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
+       of the Ubuntu "universe" repository.
+       http://packages.ubuntu.com/search?keywords=fio .
+
+Red Hat, Fedora, CentOS & Co:
+       Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
+       packages are part of the Fedora/EPEL repositories.
+       https://apps.fedoraproject.org/packages/fio .
+
+Mandriva:
+       Mandriva has integrated fio into their package repository, so installing
+       on that distro should be as easy as typing ``urpmi fio``.
+
+Arch Linux:
+        An Arch Linux package is provided under the Community sub-repository:
+        https://www.archlinux.org/packages/?sort=&q=fio
+
+Solaris:
+       Packages for Solaris are available from OpenCSW. Install their pkgutil
+       tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
+       ``pkgutil -i fio``.
+
+Windows:
+       Rebecca Cran <rebecca@bsdio.com> has fio packages for Windows at
+       https://bsdio.com/fio/ . The latest builds for Windows can also
+       be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking
+       the latest x86 or x64 build, then selecting the ARTIFACTS tab.
+
+BSDs:
+       Packages for BSDs may be available from their binary package repositories.
+       Look for a package "fio" using their binary package managers.
+
+
+Building
+--------
+
+Just type::
+
+ $ ./configure
+ $ make
+ $ make install
+
+Note that GNU make is required. On BSDs it's available from devel/gmake within
+ports directory; on Solaris it's in the SUNWgmake package.  On platforms where
+GNU make isn't the default, type ``gmake`` instead of ``make``.
+
+Configure will print the enabled options. Note that on Linux based platforms,
+the libaio development packages must be installed to use the libaio
+engine. Depending on distro, it is usually called libaio-devel or libaio-dev.
+
+For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
+to be installed.  gfio isn't built automatically and can be enabled with a
+``--enable-gfio`` option to configure.
+
+To build fio with a cross-compiler::
+
+ $ make clean
+ $ make CROSS_COMPILE=/path/to/toolchain/prefix
+
+Configure will attempt to determine the target platform automatically.
+
+It's possible to build fio for ESX as well, use the ``--esx`` switch to
+configure.
+
+
+Windows
+~~~~~~~
+
+The minimum versions of Windows for building/running fio are Windows 7/Windows
+Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
+order to build fio. To create an MSI installer package install WiX from
+https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
+directory.
+
+How to compile fio on 64-bit Windows:
+
+ 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
+    packages starting with **mingw64-x86_64**. Ensure
+    **mingw64-x86_64-zlib** is installed if you wish
+    to enable fio's log compression functionality.
+ 2. Open the Cygwin Terminal.
+ 3. Go to the fio directory (source files).
+ 4. Run ``make clean && make -j``.
+
+To build fio for 32-bit Windows, ensure the -i686 versions of the previously
+mentioned -x86_64 packages are installed and run ``./configure
+--build-32bit-win`` before ``make``.
+
+It's recommended that once built or installed, fio be run in a Command Prompt or
+other 'native' console such as console2, since there are known to be display and
+signal issues when running it under a Cygwin shell (see
+https://github.com/mintty/mintty/issues/56 and
+https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
+for details).
+
+
+Documentation
+~~~~~~~~~~~~~
+
+Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
+To build HTML formatted documentation run ``make -C doc html`` and direct your
+browser to :file:`./doc/output/html/index.html`.  To build manual page run
+``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
+output formats are supported run ``make -C doc help``.
+
+.. _reStructuredText: http://www.sphinx-doc.org/rest.html
+.. _Sphinx: http://www.sphinx-doc.org
+
+
+Platforms
+---------
+
+Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
+Windows, FreeBSD, and DragonFly. Some features and/or options may only be
+available on some of the platforms, typically because those features only apply
+to that platform (like the solarisaio engine, or the splice engine on Linux).
+
+Some features are not available on FreeBSD/Solaris even if they could be
+implemented, I'd be happy to take patches for that. An example of that is disk
+utility statistics and (I think) huge page support, support for that does exist
+in FreeBSD/Solaris.
+
+Fio uses pthread mutexes for signalling and locking and some platforms do not
+support process shared pthread mutexes. As a result, on such platforms only
+threads are supported. This could be fixed with sysv ipc locking or other
+locking alternatives.
+
+Other \*BSD platforms are untested, but fio should work there almost out of the
+box. Since I don't do test runs or even compiles on those platforms, your
+mileage may vary. Sending me patches for other platforms is greatly
+appreciated. There's a lot of value in having the same test/benchmark tool
+available on all platforms.
+
+Note that POSIX aio is not enabled by default on AIX. Messages like these::
+
+    Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
+        Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
+
+indicate one needs to enable POSIX aio. Run the following commands as root::
+
+    # lsdev -C -l posix_aio0
+        posix_aio0 Defined  Posix Asynchronous I/O
+    # cfgmgr -l posix_aio0
+    # lsdev -C -l posix_aio0
+        posix_aio0 Available  Posix Asynchronous I/O
+
+POSIX aio should work now. To make the change permanent::
+
+    # chdev -l posix_aio0 -P -a autoconfig='available'
+        posix_aio0 changed
+
+
+Running fio
+-----------
+
+Running fio is normally the easiest part - you just give it the job file
+(or job files) as parameters::
+
+       $ fio [options] [jobfile] ...
+
+and it will start doing what the *jobfile* tells it to do. You can give more
+than one job file on the command line, fio will serialize the running of those
+files. Internally that is the same as using the :option:`stonewall` parameter
+described in the parameter section.
+
+If the job file contains only one job, you may as well just give the parameters
+on the command line. The command line parameters are identical to the job
+parameters, with a few extra that control global parameters.  For example, for
+the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
+option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
+<iodepth>`. You can also use the command line for giving more than one job
+entry. For each :option:`--name <name>` option that fio sees, it will start a
+new job with that name.  Command line entries following a
+:option:`--name <name>` entry will apply to that job, until there are no more
+entries or a new :option:`--name <name>` entry is seen. This is similar to the
+job file options, where each option applies to the current job until a new []
+job entry is seen.
+
+fio does not need to run as root, except if the files or devices specified in
+the job section requires that. Some other options may also be restricted, such
+as memory locking, I/O scheduler switching, and decreasing the nice value.
+
+If *jobfile* is specified as ``-``, the job file will be read from standard
+input.
diff --git a/backend.c b/backend.c

index 151a561555fc45895cf29fa780d0b8b5cf399b44..061e3b329d2cd113d1f566a6e2da31c8019d37c7 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -1777,6 +1777,18 @@ static void *thread_main(void *data)
         if (!init_iolog(td))
                 goto err;
  
+       /* ioprio_set() has to be done before td_io_init() */
+       if (fio_option_is_set(o, ioprio) ||
+           fio_option_is_set(o, ioprio_class)) {
+               ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
+               if (ret == -1) {
+                       td_verror(td, errno, "ioprio_set");
+                       goto err;
+               }
+               td->ioprio = ioprio_value(o->ioprio_class, o->ioprio);
+               td->ts.ioprio = td->ioprio;
+       }
+
         if (td_io_init(td))
                 goto err;
  
@@ -1789,16 +1801,6 @@ static void *thread_main(void *data)
         if (o->verify_async && verify_async_init(td))
                 goto err;
  
-       if (fio_option_is_set(o, ioprio) ||
-           fio_option_is_set(o, ioprio_class)) {
-               ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
-               if (ret == -1) {
-                       td_verror(td, errno, "ioprio_set");
-                       goto err;
-               }
-               td->ioprio = ioprio_value(o->ioprio_class, o->ioprio);
-       }
-
         if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
                 goto err;
  
@@ -2611,6 +2613,9 @@ int fio_backend(struct sk_out *sk_out)
         }
  
         for_each_td(td, i) {
+               struct thread_stat *ts = &td->ts;
+
+               free_clat_prio_stats(ts);
                 steadystate_free(td);
                 fio_options_free(td);
                 fio_dump_options_free(td);
diff --git a/blktrace.c b/blktrace.c

index 64a610a95944c9ec1e5d4fdaba56b987e24c7943..e18047658953f718fdded2212bee42237c42ac9a 100644 (file)
--- a/blktrace.c
+++ b/blktrace.c
@@ -4,71 +4,34 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
+#include <errno.h>
  
  #include "flist.h"
  #include "fio.h"
+#include "iolog.h"
  #include "blktrace.h"
  #include "blktrace_api.h"
  #include "oslib/linux-dev-lookup.h"
  
-#define TRACE_FIFO_SIZE        8192
-
-/*
- * fifo refill frontend, to avoid reading data in trace sized bites
- */
-static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
-{
-       char buf[TRACE_FIFO_SIZE];
-       unsigned int total;
-       int ret;
-
-       total = sizeof(buf);
-       if (total > fifo_room(fifo))
-               total = fifo_room(fifo);
-
-       ret = read(fd, buf, total);
-       if (ret < 0) {
-               int read_err = errno;
-
-               assert(read_err > 0);
-               td_verror(td, read_err, "read blktrace file");
-               return -read_err;
-       }
-
-       if (ret > 0)
-               ret = fifo_put(fifo, buf, ret);
-
-       dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
-       return ret;
-}
-
-/*
- * Retrieve 'len' bytes from the fifo, refilling if necessary.
- */
-static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
-                         void *buf, unsigned int len)
-{
-       if (fifo_len(fifo) < len) {
-               int ret = refill_fifo(td, fifo, fd);
-
-               if (ret < 0)
-                       return ret;
-       }
-
-       return fifo_get(fifo, buf, len);
-}
+struct file_cache {
+       unsigned int maj;
+       unsigned int min;
+       unsigned int fileno;
+};
  
  /*
   * Just discard the pdu by seeking past it.
   */
-static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
-                      struct blk_io_trace *t)
+static int discard_pdu(FILE* f, struct blk_io_trace *t)
  {
         if (t->pdu_len == 0)
                 return 0;
  
         dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
-       return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
+       if (fseek(f, t->pdu_len, SEEK_CUR) < 0)
+               return -errno;
+
+       return t->pdu_len;
  }
  
  /*
@@ -130,28 +93,28 @@ static void trace_add_open_close_event(struct thread_data *td, int fileno, enum
         flist_add_tail(&ipo->list, &td->io_log_list);
  }
  
-static int trace_add_file(struct thread_data *td, __u32 device)
+static int trace_add_file(struct thread_data *td, __u32 device,
+                         struct file_cache *cache)
  {
-       static unsigned int last_maj, last_min, last_fileno;
         unsigned int maj = FMAJOR(device);
         unsigned int min = FMINOR(device);
         struct fio_file *f;
         char dev[256];
         unsigned int i;
  
-       if (last_maj == maj && last_min == min)
-               return last_fileno;
+       if (cache->maj == maj && cache->min == min)
+               return cache->fileno;
  
-       last_maj = maj;
-       last_min = min;
+       cache->maj = maj;
+       cache->min = min;
  
         /*
          * check for this file in our list
          */
         for_each_file(td, f, i)
                 if (f->major == maj && f->minor == min) {
-                       last_fileno = f->fileno;
-                       return last_fileno;
+                       cache->fileno = f->fileno;
+                       return cache->fileno;
                 }
  
         strcpy(dev, "/dev");
@@ -171,10 +134,10 @@ static int trace_add_file(struct thread_data *td, __u32 device)
                 td->files[fileno]->major = maj;
                 td->files[fileno]->minor = min;
                 trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
-               last_fileno = fileno;
+               cache->fileno = fileno;
         }
  
-       return last_fileno;
+       return cache->fileno;
  }
  
  static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
@@ -215,7 +178,7 @@ static void store_ipo(struct thread_data *td, unsigned long long offset,
         queue_io_piece(td, ipo);
  }
  
-static void handle_trace_notify(struct blk_io_trace *t)
+static bool handle_trace_notify(struct blk_io_trace *t)
  {
         switch (t->action) {
         case BLK_TN_PROCESS:
@@ -232,22 +195,24 @@ static void handle_trace_notify(struct blk_io_trace *t)
                 dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
                 break;
         }
+       return false;
  }
  
-static void handle_trace_discard(struct thread_data *td,
+static bool handle_trace_discard(struct thread_data *td,
                                  struct blk_io_trace *t,
                                  unsigned long long ttime,
-                                unsigned long *ios, unsigned int *bs)
+                                unsigned long *ios, unsigned long long *bs,
+                                struct file_cache *cache)
  {
         struct io_piece *ipo;
         int fileno;
  
         if (td->o.replay_skip & (1u << DDIR_TRIM))
-               return;
+               return false;
  
         ipo = calloc(1, sizeof(*ipo));
         init_ipo(ipo);
-       fileno = trace_add_file(td, t->device);
+       fileno = trace_add_file(td, t->device, cache);
  
         ios[DDIR_TRIM]++;
         if (t->bytes > bs[DDIR_TRIM])
@@ -270,6 +235,7 @@ static void handle_trace_discard(struct thread_data *td,
                                                         ipo->offset, ipo->len,
                                                         ipo->delay);
         queue_io_piece(td, ipo);
+       return true;
  }
  
  static void dump_trace(struct blk_io_trace *t)
@@ -277,29 +243,29 @@ static void dump_trace(struct blk_io_trace *t)
         log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
  }
  
-static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
+static bool handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
                             unsigned long long ttime, unsigned long *ios,
-                           unsigned int *bs)
+                           unsigned long long *bs, struct file_cache *cache)
  {
         int rw;
         int fileno;
  
-       fileno = trace_add_file(td, t->device);
+       fileno = trace_add_file(td, t->device, cache);
  
         rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
  
         if (rw) {
                 if (td->o.replay_skip & (1u << DDIR_WRITE))
-                       return;
+                       return false;
         } else {
                 if (td->o.replay_skip & (1u << DDIR_READ))
-                       return;
+                       return false;
         }
  
         if (!t->bytes) {
                 if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
                         dump_trace(t);
-               return;
+               return false;
         }
  
         if (t->bytes > bs[rw])
@@ -308,20 +274,22 @@ static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
         ios[rw]++;
         td->o.size += t->bytes;
         store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
+       return true;
  }
  
-static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
-                              unsigned long long ttime, unsigned long *ios)
+static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
+                              unsigned long long ttime, unsigned long *ios,
+                              struct file_cache *cache)
  {
         struct io_piece *ipo;
         int fileno;
  
         if (td->o.replay_skip & (1u << DDIR_SYNC))
-               return;
+               return false;
  
         ipo = calloc(1, sizeof(*ipo));
         init_ipo(ipo);
-       fileno = trace_add_file(td, t->device);
+       fileno = trace_add_file(td, t->device, cache);
  
         ipo->delay = ttime / 1000;
         ipo->ddir = DDIR_SYNC;
@@ -330,47 +298,49 @@ static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
         ios[DDIR_SYNC]++;
         dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
         queue_io_piece(td, ipo);
+       return true;
  }
  
  /*
   * We only care for queue traces, most of the others are side effects
   * due to internal workings of the block layer.
   */
-static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
-                        unsigned long *ios, unsigned int *bs)
+static bool queue_trace(struct thread_data *td, struct blk_io_trace *t,
+                        unsigned long *ios, unsigned long long *bs,
+                        struct file_cache *cache)
  {
-       static unsigned long long last_ttime;
+       unsigned long long *last_ttime = &td->io_log_blktrace_last_ttime;
         unsigned long long delay = 0;
  
         if ((t->action & 0xffff) != __BLK_TA_QUEUE)
-               return;
+               return false;
  
         if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
-               if (!last_ttime || td->o.no_stall)
+               if (!*last_ttime || td->o.no_stall || t->time < *last_ttime)
                         delay = 0;
                 else if (td->o.replay_time_scale == 100)
-                       delay = t->time - last_ttime;
+                       delay = t->time - *last_ttime;
                 else {
-                       double tmp = t->time - last_ttime;
+                       double tmp = t->time - *last_ttime;
                         double scale;
  
                         scale = (double) 100.0 / (double) td->o.replay_time_scale;
                         tmp *= scale;
                         delay = tmp;
                 }
-               last_ttime = t->time;
+               *last_ttime = t->time;
         }
  
         t_bytes_align(&td->o, t);
  
         if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
-               handle_trace_notify(t);
+               return handle_trace_notify(t);
         else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
-               handle_trace_discard(td, t, delay, ios, bs);
+               return handle_trace_discard(td, t, delay, ios, bs, cache);
         else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
-               handle_trace_flush(td, t, delay, ios);
+               return handle_trace_flush(td, t, delay, ios, cache);
         else
-               handle_trace_fs(td, t, delay, ios, bs);
+               return handle_trace_fs(td, t, delay, ios, bs, cache);
  }
  
  static void byteswap_trace(struct blk_io_trace *t)
@@ -438,43 +408,79 @@ static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
   * Load a blktrace file by reading all the blk_io_trace entries, and storing
   * them as io_pieces like the fio text version would do.
   */
-bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
+bool init_blktrace_read(struct thread_data *td, const char *filename, int need_swap)
+{
+       int old_state;
+
+       td->io_log_rfile = fopen(filename, "rb");
+       if (!td->io_log_rfile) {
+               td_verror(td, errno, "open blktrace file");
+               goto err;
+       }
+       td->io_log_blktrace_swap = need_swap;
+       td->io_log_blktrace_last_ttime = 0;
+       td->o.size = 0;
+
+       free_release_files(td);
+
+       old_state = td_bump_runstate(td, TD_SETTING_UP);
+
+       if (!read_blktrace(td)) {
+               goto err;
+       }
+
+       td_restore_runstate(td, old_state);
+
+       if (!td->files_index) {
+               log_err("fio: did not find replay device(s)\n");
+               return false;
+       }
+
+       return true;
+
+err:
+       if (td->io_log_rfile) {
+               fclose(td->io_log_rfile);
+               td->io_log_rfile = NULL;
+       }
+       return false;
+}
+
+bool read_blktrace(struct thread_data* td)
  {
         struct blk_io_trace t;
+       struct file_cache cache = { };
         unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
-       unsigned int rw_bs[DDIR_RWDIR_CNT] = { };
+       unsigned long long rw_bs[DDIR_RWDIR_CNT] = { };
         unsigned long skipped_writes;
-       struct fifo *fifo;
-       int fd, i, old_state, max_depth;
-       struct fio_file *f;
+       FILE *f = td->io_log_rfile;
+       int i, max_depth;
+       struct fio_file *fiof;
         int this_depth[DDIR_RWDIR_CNT] = { };
         int depth[DDIR_RWDIR_CNT] = { };
+       int64_t items_to_fetch = 0;
  
-       fd = open(filename, O_RDONLY);
-       if (fd < 0) {
-               td_verror(td, errno, "open blktrace file");
-               return false;
+       if (td->o.read_iolog_chunked) {
+               items_to_fetch = iolog_items_to_fetch(td);
+               if (!items_to_fetch)
+                       return true;
         }
  
-       fifo = fifo_alloc(TRACE_FIFO_SIZE);
-
-       old_state = td_bump_runstate(td, TD_SETTING_UP);
-
-       td->o.size = 0;
         skipped_writes = 0;
         do {
-               int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
+               int ret = fread(&t, 1, sizeof(t), f);
  
-               if (ret < 0)
+               if (ferror(f)) {
+                       td_verror(td, errno, "read blktrace file");
                         goto err;
-               else if (!ret)
+               } else if (feof(f)) {
                         break;
-               else if (ret < (int) sizeof(t)) {
-                       log_err("fio: short fifo get\n");
+               } else if (ret < (int) sizeof(t)) {
+                       log_err("fio: iolog short read\n");
                         break;
                 }
  
-               if (need_swap)
+               if (td->io_log_blktrace_swap)
                         byteswap_trace(&t);
  
                 if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
@@ -487,13 +493,10 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
                                                                 t.magic & 0xff);
                         goto err;
                 }
-               ret = discard_pdu(td, fifo, fd, &t);
+               ret = discard_pdu(f, &t);
                 if (ret < 0) {
                         td_verror(td, -ret, "blktrace lseek");
                         goto err;
-               } else if (t.pdu_len != ret) {
-                       log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
-                       goto err;
                 }
                 if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
                         if ((t.action & 0xffff) == __BLK_TA_QUEUE)
@@ -510,22 +513,53 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
                         }
                 }
  
-               handle_trace(td, &t, ios, rw_bs);
-       } while (1);
+               if (!queue_trace(td, &t, ios, rw_bs, &cache))
+                       continue;
  
-       for_each_file(td, f, i)
-               trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
+               if (td->o.read_iolog_chunked) {
+                       td->io_log_current++;
+                       items_to_fetch--;
+                       if (items_to_fetch == 0)
+                               break;
+               }
+       } while (1);
  
-       fifo_free(fifo);
-       close(fd);
+       if (td->o.read_iolog_chunked) {
+               td->io_log_highmark = td->io_log_current;
+               td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
+               fio_gettime(&td->io_log_highmark_time, NULL);
+       }
  
-       td_restore_runstate(td, old_state);
+       if (skipped_writes)
+               log_err("fio: %s skips replay of %lu writes due to read-only\n",
+                                               td->o.name, skipped_writes);
  
-       if (!td->files_index) {
-               log_err("fio: did not find replay device(s)\n");
-               return false;
+       if (td->o.read_iolog_chunked) {
+               if (td->io_log_current == 0) {
+                       return false;
+               }
+               td->o.td_ddir = TD_DDIR_RW;
+               if ((rw_bs[DDIR_READ] > td->o.max_bs[DDIR_READ] ||
+                    rw_bs[DDIR_WRITE] > td->o.max_bs[DDIR_WRITE] ||
+                    rw_bs[DDIR_TRIM] > td->o.max_bs[DDIR_TRIM]) &&
+                   td->orig_buffer)
+               {
+                       td->o.max_bs[DDIR_READ] = max(td->o.max_bs[DDIR_READ], rw_bs[DDIR_READ]);
+                       td->o.max_bs[DDIR_WRITE] = max(td->o.max_bs[DDIR_WRITE], rw_bs[DDIR_WRITE]);
+                       td->o.max_bs[DDIR_TRIM] = max(td->o.max_bs[DDIR_TRIM], rw_bs[DDIR_TRIM]);
+                       io_u_quiesce(td);
+                       free_io_mem(td);
+                       init_io_u_buffers(td);
+               }
+               return true;
         }
  
+       for_each_file(td, fiof, i)
+               trace_add_open_close_event(td, fiof->fileno, FIO_LOG_CLOSE_FILE);
+
+       fclose(td->io_log_rfile);
+       td->io_log_rfile = NULL;
+
         /*
          * For stacked devices, we don't always get a COMPLETE event so
          * the depth grows to insane values. Limit it to something sane(r).
@@ -539,10 +573,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
                 max_depth = max(depth[i], max_depth);
         }
  
-       if (skipped_writes)
-               log_err("fio: %s skips replay of %lu writes due to read-only\n",
-                                               td->o.name, skipped_writes);
-
         if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
             !ios[DDIR_SYNC]) {
                 log_err("fio: found no ios in blktrace data\n");
@@ -563,14 +593,6 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
                 td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
         }
  
-       /*
-        * We need to do direct/raw ios to the device, to avoid getting
-        * read-ahead in our way. But only do so if the minimum block size
-        * is a multiple of 4k, otherwise we don't know if it's safe to do so.
-        */
-       if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
-               td->o.odirect = 1;
-
         /*
          * If depth wasn't manually set, use probed depth
          */
@@ -579,8 +601,8 @@ bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
  
         return true;
  err:
-       close(fd);
-       fifo_free(fifo);
+       fclose(f);
+       td->io_log_rfile = NULL;        /* stream is closed; stop callers from closing it again */
         return false;
  }
  
@@ -625,15 +646,14 @@ static void merge_finish_file(struct blktrace_cursor *bcs, int i, int *nr_logs)
  {
         bcs[i].iter++;
         if (bcs[i].iter < bcs[i].nr_iter) {
-               lseek(bcs[i].fd, 0, SEEK_SET);
+               fseek(bcs[i].f, 0, SEEK_SET);
                 return;
         }
  
         *nr_logs -= 1;
  
         /* close file */
-       fifo_free(bcs[i].fifo);
-       close(bcs[i].fd);
+       fclose(bcs[i].f);
  
         /* keep active files contiguous */
         memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
@@ -646,15 +666,16 @@ static int read_trace(struct thread_data *td, struct blktrace_cursor *bc)
  
  read_skip:
         /* read an io trace */
-       ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
-       if (ret < 0) {
+       ret = fread(t, 1, sizeof(*t), bc->f);   /* t is a pointer */
+       if (ferror(bc->f)) {
+               td_verror(td, errno, "read blktrace file");
                 return ret;
-       } else if (!ret) {
+       } else if (feof(bc->f)) {
                 if (!bc->length)
                         bc->length = bc->t.time;
                 return ret;
         } else if (ret < (int) sizeof(*t)) {
-               log_err("fio: short fifo get\n");
+               log_err("fio: iolog short read\n");
                 return -1;
         }
  
@@ -664,14 +685,10 @@ read_skip:
         /* skip over actions that fio does not care about */
         if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
             t_get_ddir(t) == DDIR_INVAL) {
-               ret = discard_pdu(td, bc->fifo, bc->fd, t);
+               ret = discard_pdu(bc->f, t);
                 if (ret < 0) {
                         td_verror(td, -ret, "blktrace lseek");
                         return ret;
-               } else if (t->pdu_len != ret) {
-                       log_err("fio: discarded %d of %d\n", ret,
-                               t->pdu_len);
-                       return -1;
                 }
                 goto read_skip;
         }
@@ -729,14 +746,13 @@ int merge_blktrace_iologs(struct thread_data *td)
         str = ptr = strdup(td->o.read_iolog_file);
         nr_logs = 0;
         for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
-               bcs[i].fd = open(name, O_RDONLY);
-               if (bcs[i].fd < 0) {
+               bcs[i].f = fopen(name, "rb");
+               if (!bcs[i].f) {
                         log_err("fio: could not open file: %s\n", name);
-                       ret = bcs[i].fd;
+                       ret = -errno;
                         free(str);
                         goto err_file;
                 }
-               bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
                 nr_logs++;
  
                 if (!is_blktrace(name, &bcs[i].swap)) {
@@ -761,14 +777,10 @@ int merge_blktrace_iologs(struct thread_data *td)
                 i = find_earliest_io(bcs, nr_logs);
                 bc = &bcs[i];
                 /* skip over the pdu */
-               ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
+               ret = discard_pdu(bc->f, &bc->t);
                 if (ret < 0) {
                         td_verror(td, -ret, "blktrace lseek");
                         goto err_file;
-               } else if (bc->t.pdu_len != ret) {
-                       log_err("fio: discarded %d of %d\n", ret,
-                               bc->t.pdu_len);
-                       goto err_file;
                 }
  
                 ret = write_trace(merge_fp, &bc->t);
@@ -786,8 +798,7 @@ int merge_blktrace_iologs(struct thread_data *td)
  err_file:
         /* cleanup */
         for (i = 0; i < nr_logs; i++) {
-               fifo_free(bcs[i].fifo);
-               close(bcs[i].fd);
+               fclose(bcs[i].f);
         }
  err_merge_buf:
         free(merge_buf);
diff --git a/blktrace.h b/blktrace.h

index a0e82faa05eed81bb3e8819a200208089a6009d7..c53b717ba4e2d3069f170d1552027817cc734106 100644 (file)
--- a/blktrace.h
+++ b/blktrace.h
@@ -10,7 +10,7 @@
  
  struct blktrace_cursor {
         struct fifo             *fifo;  // fifo queue for reading
-       int                     fd;     // blktrace file
+       FILE                    *f;     // blktrace file
         __u64                   length; // length of trace
         struct blk_io_trace     t;      // current io trace
         int                     swap;   // bitwise reverse required
@@ -20,7 +20,9 @@ struct blktrace_cursor {
  };
  
  bool is_blktrace(const char *, int *);
-bool load_blktrace(struct thread_data *, const char *, int);
+bool init_blktrace_read(struct thread_data *, const char *, int);
+bool read_blktrace(struct thread_data* td);
+
  int merge_blktrace_iologs(struct thread_data *td);
  
  #else
@@ -30,12 +32,18 @@ static inline bool is_blktrace(const char *fname, int *need_swap)
         return false;
  }
  
-static inline bool load_blktrace(struct thread_data *td, const char *fname,
+static inline bool init_blktrace_read(struct thread_data *td, const char *fname,
                                  int need_swap)
  {
         return false;
  }
  
+static inline bool read_blktrace(struct thread_data* td)
+{
+       return false;
+}
+
+
  static inline int merge_blktrace_iologs(struct thread_data *td)
  {
         return false;
diff --git a/ci/actions-install.sh b/ci/actions-install.sh

index 7408ccb4f93fb7f3921da5d8d8b247aa6977ba6a..b3486a475d50a8ed9b68b4cd0ad82fecd29614e9 100755 (executable)
--- a/ci/actions-install.sh
+++ b/ci/actions-install.sh
@@ -31,14 +31,17 @@ DPKGCFG
      case "${CI_TARGET_ARCH}" in
          "i686")
              sudo dpkg --add-architecture i386
+            opts="--allow-downgrades"
              pkgs=("${pkgs[@]/%/:i386}")
              pkgs+=(
                  gcc-multilib
                  pkg-config:i386
                  zlib1g-dev:i386
+               libpcre2-8-0=10.34-7
              )
              ;;
          "x86_64")
+            opts=""
              pkgs+=(
                  libglusterfs-dev
                  libgoogle-perftools-dev
@@ -62,7 +65,7 @@ DPKGCFG
      echo "Updating APT..."
      sudo apt-get -qq update
      echo "Installing packages..."
-    sudo apt-get install -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
+    sudo apt-get install ${opts:+"$opts"} -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
  }
  
  install_linux() {
diff --git a/client.c b/client.c

index 8b230617f79bd0dbab6853e9388a1e7fa8f700ce..605a3ce573aa4c842df83c4947c823c03882fa03 100644 (file)
--- a/client.c
+++ b/client.c
@@ -284,9 +284,10 @@ static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
  static void fio_drain_client_text(struct fio_client *client)
  {
         do {
-               struct fio_net_cmd *cmd;
+               struct fio_net_cmd *cmd = NULL;
  
-               cmd = fio_net_recv_cmd(client->fd, false);
+               if (fio_server_poll_fd(client->fd, POLLIN, 0))
+                       cmd = fio_net_recv_cmd(client->fd, false);
                 if (!cmd)
                         break;
  
@@ -953,6 +954,8 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
         dst->pid                = le32_to_cpu(src->pid);
         dst->members            = le32_to_cpu(src->members);
         dst->unified_rw_rep     = le32_to_cpu(src->unified_rw_rep);
+       dst->ioprio             = le32_to_cpu(src->ioprio);
+       dst->disable_prio_stat  = le32_to_cpu(src->disable_prio_stat);
  
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
                 convert_io_stat(&dst->clat_stat[i], &src->clat_stat[i]);
@@ -1035,14 +1038,6 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
         dst->nr_block_infos     = le64_to_cpu(src->nr_block_infos);
         for (i = 0; i < dst->nr_block_infos; i++)
                 dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
-       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-                       dst->io_u_plat_high_prio[i][j] = le64_to_cpu(src->io_u_plat_high_prio[i][j]);
-                       dst->io_u_plat_low_prio[i][j] = le64_to_cpu(src->io_u_plat_low_prio[i][j]);
-               }
-               convert_io_stat(&dst->clat_high_prio_stat[i], &src->clat_high_prio_stat[i]);
-               convert_io_stat(&dst->clat_low_prio_stat[i], &src->clat_low_prio_stat[i]);
-       }
  
         dst->ss_dur             = le64_to_cpu(src->ss_dur);
         dst->ss_state           = le32_to_cpu(src->ss_state);
@@ -1052,6 +1047,19 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
         dst->ss_deviation.u.f   = fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i));
         dst->ss_criterion.u.f   = fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i));
  
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               dst->nr_clat_prio[i] = le32_to_cpu(src->nr_clat_prio[i]);
+               for (j = 0; j < dst->nr_clat_prio[i]; j++) {
+                       for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+                               dst->clat_prio[i][j].io_u_plat[k] =
+                                       le64_to_cpu(src->clat_prio[i][j].io_u_plat[k]);
+                       convert_io_stat(&dst->clat_prio[i][j].clat_stat,
+                                       &src->clat_prio[i][j].clat_stat);
+                       dst->clat_prio[i][j].ioprio =
+                               le32_to_cpu(src->clat_prio[i][j].ioprio);
+               }
+       }
+
         if (dst->ss_state & FIO_SS_DATA) {
                 for (i = 0; i < dst->ss_dur; i++ ) {
                         dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]);
@@ -1111,7 +1119,7 @@ static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd)
         if (sum_stat_clients <= 1)
                 return;
  
-       sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+       sum_thread_stats(&client_ts, &p->ts);
         sum_group_stats(&client_gs, &p->rs);
  
         client_ts.members++;
@@ -1760,7 +1768,6 @@ int fio_handle_client(struct fio_client *client)
  {
         struct client_ops *ops = client->ops;
         struct fio_net_cmd *cmd;
-       int size;
  
         dprint(FD_NET, "client: handle %s\n", client->hostname);
  
@@ -1794,14 +1801,26 @@ int fio_handle_client(struct fio_client *client)
                 }
         case FIO_NET_CMD_TS: {
                 struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
+               uint64_t offset;
+               int i;
+
+               for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+                       if (le32_to_cpu(p->ts.nr_clat_prio[i])) {
+                               offset = le64_to_cpu(p->ts.clat_prio_offset[i]);
+                               p->ts.clat_prio[i] =
+                                       (struct clat_prio_stat *)((char *)p + offset);
+                       }
+               }
  
                 dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state));
                 if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) {
                         dprint(FD_NET, "client: received steadystate ring buffers\n");
  
-                       size = le64_to_cpu(p->ts.ss_dur);
-                       p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1);
-                       p->ts.ss_bw_data = p->ts.ss_iops_data + size;
+                       offset = le64_to_cpu(p->ts.ss_iops_data_offset);
+                       p->ts.ss_iops_data = (uint64_t *)((char *)p + offset);
+
+                       offset = le64_to_cpu(p->ts.ss_bw_data_offset);
+                       p->ts.ss_bw_data = (uint64_t *)((char *)p + offset);
                 }
  
                 convert_ts(&p->ts, &p->ts);
@@ -2152,6 +2171,7 @@ int fio_handle_clients(struct client_ops *ops)
  
         fio_client_json_fini();
  
+       free_clat_prio_stats(&client_ts);
         free(pfds);
         return retval || error_clients;
  }
diff --git a/configure b/configure

index 84ccce040ece052b6cf844b082114f2d38bc8f26..0efde7d6a84acf6887e81ec89eae6f8110cbcb0c 100755 (executable)
--- a/configure
+++ b/configure
@@ -955,17 +955,16 @@ print_config "rdmacm" "$rdmacm"
  
  ##########################################
  # librpma probe
+# The librpma engine requires librpma>=v0.10.0 with rpma_mr_advise().
  if test "$librpma" != "yes" ; then
    librpma="no"
  fi
  cat > $TMPC << EOF
-#include <stdio.h>
  #include <librpma.h>
-int main(int argc, char **argv)
+int main(void)
  {
-  enum rpma_conn_event event = RPMA_CONN_REJECTED;
-  (void) event; /* unused */
-  rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+  void *ptr = rpma_mr_advise;
+  (void) ptr; /* unused */
    return 0;
  }
  EOF
diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst

index b5987b52a893f34cb3b16e8e44eeaf597de9641e..8e1216f02c4fe7345daccda5f8f0ee99706d83e4 100644 (file)
--- a/doc/fio_doc.rst
+++ b/doc/fio_doc.rst
@@ -2,7 +2,7 @@ fio - Flexible I/O tester rev. |version|
  ========================================
  
  
-.. include:: ../README
+.. include:: ../README.rst
  
  
  .. include:: ../HOWTO
diff --git a/doc/fio_man.rst b/doc/fio_man.rst

index c6a6438ff3436763d7c1dc00779ed316023c5e2a..44312f16ac479c0802d678d9784fb7a065f92136 100644 (file)
--- a/doc/fio_man.rst
+++ b/doc/fio_man.rst
@@ -6,7 +6,7 @@ Fio Manpage
  (rev. |release|)
  
  
-.. include:: ../README
+.. include:: ../README.rst
  
  
  .. include:: ../HOWTO
diff --git a/engines/cmdprio.c b/engines/cmdprio.c

index 92b752aecd413d5085249bd880502ab49e6609f6..dd358754de4d34d24037df7ed621327342a80e27 100644 (file)
--- a/engines/cmdprio.c
+++ b/engines/cmdprio.c
@@ -5,45 +5,201 @@
  
  #include "cmdprio.h"
  
-static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
-                                   enum fio_ddir ddir, char *str, bool data)
+/*
+ * Temporary array used during parsing. Will be freed after the corresponding
+ * struct cmdprio_bsprio_desc has been generated and saved in cmdprio->bsprio_desc.
+ */
+struct cmdprio_parse_result {
+       struct split_prio *entries;
+       int nr_entries;
+};
+
+/*
+ * Temporary array used during init. Will be freed after the corresponding
+ * struct clat_prio_stat array has been saved in td->ts.clat_prio and the
+ * matching clat_prio_indexes have been saved in each struct cmdprio_prio.
+ */
+struct cmdprio_values {
+       unsigned int *prios;
+       int nr_prios;
+};
+
+static int find_clat_prio_index(unsigned int *all_prios, int nr_prios,
+                               int32_t prio)
  {
-       struct cmdprio *cmdprio = cb_arg;
-       struct split split;
-       unsigned int i;
+       int i;
  
-       if (ddir == DDIR_TRIM)
-               return 0;
+       for (i = 0; i < nr_prios; i++) {
+               if (all_prios[i] == prio)
+                       return i;
+       }
  
-       memset(&split, 0, sizeof(split));
+       return -1;
+}
  
-       if (split_parse_ddir(to, &split, str, data, BSSPLIT_MAX))
+/**
+ * assign_clat_prio_index - To spare stat.c the need to loop through
+ * all possible priorities each time add_clat_sample() / add_lat_sample() is
+ * called, save which index to use in each cmdprio_prio. This will later be
+ * propagated to the io_u, if the specific io_u was determined to use a cmdprio
+ * priority value.
+ */
+static void assign_clat_prio_index(struct cmdprio_prio *prio,
+                                  struct cmdprio_values *values)
+{
+       int clat_prio_index = find_clat_prio_index(values->prios,
+                                                  values->nr_prios,
+                                                  prio->prio);
+       if (clat_prio_index == -1) {
+               clat_prio_index = values->nr_prios;
+               values->prios[clat_prio_index] = prio->prio;
+               values->nr_prios++;
+       }
+       prio->clat_prio_index = clat_prio_index;
+}
+
+/**
+ * init_cmdprio_values - Allocate a temporary array that can hold all unique
+ * priorities (per ddir), so that we can assign_clat_prio_index() for each
+ * cmdprio_prio during setup. This temporary array is freed after setup.
+ */
+static int init_cmdprio_values(struct cmdprio_values *values,
+                              int max_unique_prios, struct thread_stat *ts)
+{
+       values->prios = calloc(max_unique_prios + 1,
+                              sizeof(*values->prios));
+       if (!values->prios)
                 return 1;
-       if (!split.nr)
-               return 0;
  
-       cmdprio->bssplit_nr[ddir] = split.nr;
-       cmdprio->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
-       if (!cmdprio->bssplit[ddir])
+       /* td->ioprio/ts->ioprio is always stored at index 0. */
+       values->prios[0] = ts->ioprio;
+       values->nr_prios++;
+
+       return 0;
+}
+
+/**
+ * init_ts_clat_prio - Allocates and fills a clat_prio_stat array which holds
+ * all unique priorities (per ddir).
+ */
+static int init_ts_clat_prio(struct thread_stat *ts, enum fio_ddir ddir,
+                            struct cmdprio_values *values)
+{
+       int i;
+
+       if (alloc_clat_prio_stat_ddir(ts, ddir, values->nr_prios))
                 return 1;
  
-       for (i = 0; i < split.nr; i++) {
-               cmdprio->bssplit[ddir][i].bs = split.val1[i];
-               if (split.val2[i] == -1U) {
-                       cmdprio->bssplit[ddir][i].perc = 0;
-               } else {
-                       if (split.val2[i] > 100)
-                               cmdprio->bssplit[ddir][i].perc = 100;
-                       else
-                               cmdprio->bssplit[ddir][i].perc = split.val2[i];
+       for (i = 0; i < values->nr_prios; i++)
+               ts->clat_prio[ddir][i].ioprio = values->prios[i];
+
+       return 0;
+}
+
+static int fio_cmdprio_fill_bsprio(struct cmdprio_bsprio *bsprio,
+                                  struct split_prio *entries,
+                                  struct cmdprio_values *values,
+                                  int implicit_cmdprio, int start, int end)
+{
+       struct cmdprio_prio *prio;
+       int i = end - start + 1;
+
+       bsprio->prios = calloc(i, sizeof(*bsprio->prios));
+       if (!bsprio->prios)
+               return 1;
+
+       bsprio->bs = entries[start].bs;
+       bsprio->nr_prios = 0;
+       for (i = start; i <= end; i++) {
+               prio = &bsprio->prios[bsprio->nr_prios];
+               prio->perc = entries[i].perc;
+               if (entries[i].prio == -1)
+                       prio->prio = implicit_cmdprio;
+               else
+                       prio->prio = entries[i].prio;
+               assign_clat_prio_index(prio, values);
+               bsprio->tot_perc += entries[i].perc;
+               if (bsprio->tot_perc > 100) {
+                       log_err("fio: cmdprio_bssplit total percentage "
+                               "for bs: %"PRIu64" exceeds 100\n",
+                               bsprio->bs);
+                       free(bsprio->prios);
+                       return 1;
                 }
+               bsprio->nr_prios++;
+       }
+
+       return 0;
+}
+
+static int
+fio_cmdprio_generate_bsprio_desc(struct cmdprio_bsprio_desc *bsprio_desc,
+                                struct cmdprio_parse_result *parse_res,
+                                struct cmdprio_values *values,
+                                int implicit_cmdprio)
+{
+       struct split_prio *entries = parse_res->entries;
+       int nr_entries = parse_res->nr_entries;
+       struct cmdprio_bsprio *bsprio;
+       int i, start, count = 0;
+
+       /*
+        * The parsed result is sorted by blocksize, so count only the number
+        * of different blocksizes, to know how many cmdprio_bsprio we need.
+        */
+       for (i = 0; i < nr_entries; i++) {
+               while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs)
+                       i++;
+               count++;
+       }
+
+       /*
+        * This allocation is not freed on error. Instead, the calling function
+        * is responsible for calling fio_cmdprio_cleanup() on error.
+        */
+       bsprio_desc->bsprios = calloc(count, sizeof(*bsprio_desc->bsprios));
+       if (!bsprio_desc->bsprios)
+               return 1;
+
+       start = 0;
+       bsprio_desc->nr_bsprios = 0;
+       for (i = 0; i < nr_entries; i++) {
+               while (i + 1 < nr_entries && entries[i].bs == entries[i + 1].bs)
+                       i++;
+               bsprio = &bsprio_desc->bsprios[bsprio_desc->nr_bsprios];
+               /*
+                * All parsed entries with the same blocksize get saved in the
+                * same cmdprio_bsprio, to expedite the search in the hot path.
+                */
+               if (fio_cmdprio_fill_bsprio(bsprio, entries, values,
+                                           implicit_cmdprio, start, i))
+                       return 1;
+
+               start = i + 1;
+               bsprio_desc->nr_bsprios++;
         }
  
         return 0;
  }
  
-int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
-                             struct cmdprio *cmdprio)
+static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
+                                   enum fio_ddir ddir, char *str, bool data)
+{
+       struct cmdprio_parse_result *parse_res_arr = cb_arg;
+       struct cmdprio_parse_result *parse_res = &parse_res_arr[ddir];
+
+       if (ddir == DDIR_TRIM)
+               return 0;
+
+       if (split_parse_prio_ddir(to, &parse_res->entries,
+                                 &parse_res->nr_entries, str))
+               return 1;
+
+       return 0;
+}
+
+static int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
+                                    struct cmdprio_parse_result *parse_res)
  {
         char *str, *p;
         int ret = 0;
@@ -53,26 +209,39 @@ int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
         strip_blank_front(&str);
         strip_blank_end(str);
  
-       ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, cmdprio,
+       ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, parse_res,
                               false);
  
         free(p);
         return ret;
  }
  
-static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
+/**
+ * fio_cmdprio_percentage - Returns the percentage of I/Os that should
+ * use a cmdprio priority value (rather than the default context priority).
+ *
+ * For CMDPRIO_MODE_BSSPLIT, if the percentage is non-zero, we will also
+ * return the matching bsprio, to avoid the same linear search elsewhere.
+ * For CMDPRIO_MODE_PERC, we will never return a bsprio.
+ */
+static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u,
+                                 struct cmdprio_bsprio **bsprio)
  {
+       struct cmdprio_bsprio *bsprio_entry;
         enum fio_ddir ddir = io_u->ddir;
-       struct cmdprio_options *options = cmdprio->options;
         int i;
  
         switch (cmdprio->mode) {
         case CMDPRIO_MODE_PERC:
-               return options->percentage[ddir];
+               *bsprio = NULL;
+               return cmdprio->perc_entry[ddir].perc;
         case CMDPRIO_MODE_BSSPLIT:
-               for (i = 0; i < cmdprio->bssplit_nr[ddir]; i++) {
-                       if (cmdprio->bssplit[ddir][i].bs == io_u->buflen)
-                               return cmdprio->bssplit[ddir][i].perc;
+               for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++) {
+                       bsprio_entry = &cmdprio->bsprio_desc[ddir].bsprios[i];
+                       if (bsprio_entry->bs == io_u->buflen) {
+                               *bsprio = bsprio_entry;
+                               return bsprio_entry->tot_perc;
+                       }
                 }
                 break;
         default:
@@ -83,6 +252,11 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
                 assert(0);
         }
  
+       /*
+        * This is totally fine, the given blocksize simply does not
+        * have any (non-zero) cmdprio_bssplit entries defined.
+        */
+       *bsprio = NULL;
         return 0;
  }
  
@@ -100,52 +274,162 @@ static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u)
  bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
                             struct io_u *io_u)
  {
-       enum fio_ddir ddir = io_u->ddir;
-       struct cmdprio_options *options = cmdprio->options;
-       unsigned int p;
-       unsigned int cmdprio_value =
-               ioprio_value(options->class[ddir], options->level[ddir]);
-
-       p = fio_cmdprio_percentage(cmdprio, io_u);
-       if (p && rand_between(&td->prio_state, 0, 99) < p) {
-               io_u->ioprio = cmdprio_value;
-               if (!td->ioprio || cmdprio_value < td->ioprio) {
-                       /*
-                        * The async IO priority is higher (has a lower value)
-                        * than the default priority (which is either 0 or the
-                        * value set by "prio" and "prioclass" options).
-                        */
-                       io_u->flags |= IO_U_F_HIGH_PRIO;
-               }
+       struct cmdprio_bsprio *bsprio;
+       unsigned int p, rand;
+       uint32_t perc = 0;
+       int i;
+
+       p = fio_cmdprio_percentage(cmdprio, io_u, &bsprio);
+       if (!p)
+               return false;
+
+       rand = rand_between(&td->prio_state, 0, 99);
+       if (rand >= p)
+               return false;
+
+       switch (cmdprio->mode) {
+       case CMDPRIO_MODE_PERC:
+               io_u->ioprio = cmdprio->perc_entry[io_u->ddir].prio;
+               io_u->clat_prio_index =
+                       cmdprio->perc_entry[io_u->ddir].clat_prio_index;
                 return true;
+       case CMDPRIO_MODE_BSSPLIT:
+               assert(bsprio);
+               for (i = 0; i < bsprio->nr_prios; i++) {
+                       struct cmdprio_prio *prio = &bsprio->prios[i];
+
+                       perc += prio->perc;
+                       if (rand < perc) {
+                               io_u->ioprio = prio->prio;
+                               io_u->clat_prio_index = prio->clat_prio_index;
+                               return true;
+                       }
+               }
+               break;
+       default:
+               assert(0);
         }
  
-       if (td->ioprio && td->ioprio < cmdprio_value) {
+       /* When rand < p (total perc), we should always find a cmdprio_prio. */
+       assert(0);
+       return false;
+}
+
+static int fio_cmdprio_gen_perc(struct thread_data *td, struct cmdprio *cmdprio)
+{
+       struct cmdprio_options *options = cmdprio->options;
+       struct cmdprio_prio *prio;
+       struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+       struct thread_stat *ts = &td->ts;
+       enum fio_ddir ddir;
+       int ret;
+
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
                 /*
-                * The IO will be executed with the default priority (which is
-                * either 0 or the value set by "prio" and "prioclass options),
-                * and this priority is higher (has a lower value) than the
-                * async IO priority.
+                * Do not allocate a clat_prio array nor set the cmdprio struct
+                * if zero percent of the I/Os (for the ddir) should use a
+                * cmdprio priority value, or when the ddir is not enabled.
                  */
-               io_u->flags |= IO_U_F_HIGH_PRIO;
+               if (!options->percentage[ddir] ||
+                   (ddir == DDIR_READ && !td_read(td)) ||
+                   (ddir == DDIR_WRITE && !td_write(td)))
+                       continue;
+
+               ret = init_cmdprio_values(&values[ddir], 1, ts);
+               if (ret)
+                       goto err;
+
+               prio = &cmdprio->perc_entry[ddir];
+               prio->perc = options->percentage[ddir];
+               prio->prio = ioprio_value(options->class[ddir],
+                                         options->level[ddir]);
+               assign_clat_prio_index(prio, &values[ddir]);
+
+               ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+               if (ret)
+                       goto err;
+
+               free(values[ddir].prios);
+               values[ddir].prios = NULL;
+               values[ddir].nr_prios = 0;
         }
  
-       return false;
+       return 0;
+
+err:
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++)
+               free(values[ddir].prios);
+       free_clat_prio_stats(ts);
+
+       return ret;
  }
  
  static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
                                              struct cmdprio *cmdprio)
  {
         struct cmdprio_options *options = cmdprio->options;
-       int ret;
-
-       ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str, cmdprio);
+       struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {0};
+       struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {0};
+       struct thread_stat *ts = &td->ts;
+       int ret, implicit_cmdprio;
+       enum fio_ddir ddir;
+
+       ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str,
+                                       &parse_res[0]);
         if (ret)
                 goto err;
  
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+               /*
+                * Do not allocate a clat_prio array nor set the cmdprio structs
+                * if there are no non-zero entries (for the ddir), or when the
+                * ddir is not enabled.
+                */
+               if (!parse_res[ddir].nr_entries ||
+                   (ddir == DDIR_READ && !td_read(td)) ||
+                   (ddir == DDIR_WRITE && !td_write(td))) {
+                       free(parse_res[ddir].entries);
+                       parse_res[ddir].entries = NULL;
+                       parse_res[ddir].nr_entries = 0;
+                       continue;
+               }
+
+               ret = init_cmdprio_values(&values[ddir],
+                                         parse_res[ddir].nr_entries, ts);
+               if (ret)
+                       goto err;
+
+               implicit_cmdprio = ioprio_value(options->class[ddir],
+                                               options->level[ddir]);
+
+               ret = fio_cmdprio_generate_bsprio_desc(&cmdprio->bsprio_desc[ddir],
+                                                      &parse_res[ddir],
+                                                      &values[ddir],
+                                                      implicit_cmdprio);
+               if (ret)
+                       goto err;
+
+               free(parse_res[ddir].entries);
+               parse_res[ddir].entries = NULL;
+               parse_res[ddir].nr_entries = 0;
+
+               ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+               if (ret)
+                       goto err;
+
+               free(values[ddir].prios);
+               values[ddir].prios = NULL;
+               values[ddir].nr_prios = 0;
+       }
+
         return 0;
  
  err:
+       for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+               free(parse_res[ddir].entries);
+               free(values[ddir].prios);
+       }
+       free_clat_prio_stats(ts);
         fio_cmdprio_cleanup(cmdprio);
  
         return ret;
@@ -157,40 +441,46 @@ static int fio_cmdprio_parse_and_gen(struct thread_data *td,
         struct cmdprio_options *options = cmdprio->options;
         int i, ret;
  
+       /*
+        * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
+        * is not set, default to RT priority class.
+        */
+       for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
+               /*
+                * A cmdprio value is only used when fio_cmdprio_percentage()
+                * returns non-zero, so it is safe to set a class even for a
+                * DDIR that will never use it.
+                */
+               if (!options->class[i])
+                       options->class[i] = IOPRIO_CLASS_RT;
+       }
+
         switch (cmdprio->mode) {
         case CMDPRIO_MODE_BSSPLIT:
                 ret = fio_cmdprio_parse_and_gen_bssplit(td, cmdprio);
                 break;
         case CMDPRIO_MODE_PERC:
-               ret = 0;
+               ret = fio_cmdprio_gen_perc(td, cmdprio);
                 break;
         default:
                 assert(0);
                 return 1;
         }
  
-       /*
-        * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
-        * is not set, default to RT priority class.
-        */
-       for (i = 0; i < CMDPRIO_RWDIR_CNT; i++) {
-               if (options->percentage[i] || cmdprio->bssplit_nr[i]) {
-                       if (!options->class[i])
-                               options->class[i] = IOPRIO_CLASS_RT;
-               }
-       }
-
         return ret;
  }
  
  void fio_cmdprio_cleanup(struct cmdprio *cmdprio)
  {
-       int ddir;
+       enum fio_ddir ddir;
+       int i;
  
         for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
-               free(cmdprio->bssplit[ddir]);
-               cmdprio->bssplit[ddir] = NULL;
-               cmdprio->bssplit_nr[ddir] = 0;
+               for (i = 0; i < cmdprio->bsprio_desc[ddir].nr_bsprios; i++)
+                       free(cmdprio->bsprio_desc[ddir].bsprios[i].prios);
+               free(cmdprio->bsprio_desc[ddir].bsprios);
+               cmdprio->bsprio_desc[ddir].bsprios = NULL;
+               cmdprio->bsprio_desc[ddir].nr_bsprios = 0;
         }
  
         /*
diff --git a/engines/cmdprio.h b/engines/cmdprio.h

index 0c7bd6cf4b92222189ff3bc897c4446f895f9847..755da8d0f8fff654bc81c5f0ca04ba02aa79ec1c 100644 (file)
--- a/engines/cmdprio.h
+++ b/engines/cmdprio.h
@@ -17,6 +17,24 @@ enum {
         CMDPRIO_MODE_BSSPLIT,
  };
  
+struct cmdprio_prio {
+       int32_t prio;
+       uint32_t perc;
+       uint16_t clat_prio_index;
+};
+
+struct cmdprio_bsprio {
+       uint64_t bs;
+       uint32_t tot_perc;
+       unsigned int nr_prios;
+       struct cmdprio_prio *prios;
+};
+
+struct cmdprio_bsprio_desc {
+       struct cmdprio_bsprio *bsprios;
+       unsigned int nr_bsprios;
+};
+
  struct cmdprio_options {
         unsigned int percentage[CMDPRIO_RWDIR_CNT];
         unsigned int class[CMDPRIO_RWDIR_CNT];
@@ -26,8 +44,8 @@ struct cmdprio_options {
  
  struct cmdprio {
         struct cmdprio_options *options;
-       unsigned int bssplit_nr[CMDPRIO_RWDIR_CNT];
-       struct bssplit *bssplit[CMDPRIO_RWDIR_CNT];
+       struct cmdprio_prio perc_entry[CMDPRIO_RWDIR_CNT];
+       struct cmdprio_bsprio_desc bsprio_desc[CMDPRIO_RWDIR_CNT];
         unsigned int mode;
  };
  
diff --git a/engines/filecreate.c b/engines/filecreate.c

index 4bb13c348c1a4113a1425dbf620dc5d74a8264be..7884752d31d9ceea5111b5dce39351076d9a6dfc 100644 (file)
--- a/engines/filecreate.c
+++ b/engines/filecreate.c
@@ -49,7 +49,7 @@ static int open_file(struct thread_data *td, struct fio_file *f)
                 uint64_t nsec;
  
                 nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
         }
  
         return 0;
diff --git a/engines/filedelete.c b/engines/filedelete.c

index e882ccf0176726be863a605c81afc0f5fbd8e999..df388ac92004e04dd9a4d8e5f74d292b110d9ba0 100644 (file)
--- a/engines/filedelete.c
+++ b/engines/filedelete.c
@@ -51,7 +51,7 @@ static int delete_file(struct thread_data *td, struct fio_file *f)
                 uint64_t nsec;
  
                 nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
         }
  
         return 0;
diff --git a/engines/filestat.c b/engines/filestat.c

index 003112474b39f61c94937447a8518d8b8bc25027..e587eb542d1bcac9c8a0d2f45558136fdc536bc1 100644 (file)
--- a/engines/filestat.c
+++ b/engines/filestat.c
@@ -125,7 +125,7 @@ static int stat_file(struct thread_data *td, struct fio_file *f)
                 uint64_t nsec;
  
                 nsec = ntime_since_now(&start);
-               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, false);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
         }
  
         return 0;
diff --git a/engines/io_uring.c b/engines/io_uring.c

index 00ae34823f86ec32907700a8118c30a5d3e97ae1..a2533c88e6a2c0bfa819dd782e37888f6f0bc960 100644 (file)
--- a/engines/io_uring.c
+++ b/engines/io_uring.c
@@ -699,9 +699,15 @@ static int fio_ioring_queue_init(struct thread_data *td)
         p.flags |= IORING_SETUP_CQSIZE;
         p.cq_entries = depth;
  
+retry:
         ret = syscall(__NR_io_uring_setup, depth, &p);
-       if (ret < 0)
+       if (ret < 0) {
+               if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
+                       p.flags &= ~IORING_SETUP_CQSIZE;
+                       goto retry;
+               }
                 return ret;
+       }
  
         ld->ring_fd = ret;
  
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c

index 3d605ed6c3721dcb5866d3bbe6b4754a3cb9e3dd..9d6ebf38ee865130ae9450dc79719dbbecc3f419 100644 (file)
--- a/engines/librpma_fio.c
+++ b/engines/librpma_fio.c
@@ -108,7 +108,7 @@ char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
         return mem_ptr;
  }
  
-char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
                 size_t size, struct librpma_fio_mem *mem)
  {
         size_t size_mmap = 0;
@@ -122,18 +122,24 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
                 return NULL;
         }
  
-       ws_offset = (td->thread_number - 1) * size;
+       if (f->filetype == FIO_TYPE_CHAR) {
+               /* Each thread uses a separate offset within DeviceDAX. */
+               ws_offset = (td->thread_number - 1) * size;
+       } else {
+               /* Each thread uses a separate FileSystemDAX file. No offset is needed. */
+               ws_offset = 0;
+       }
  
-       if (!filename) {
+       if (!f->file_name) {
                 log_err("fio: filename is not set\n");
                 return NULL;
         }
  
         /* map the file */
-       mem_ptr = pmem_map_file(filename, 0 /* len */, 0 /* flags */,
+       mem_ptr = pmem_map_file(f->file_name, 0 /* len */, 0 /* flags */,
                         0 /* mode */, &size_mmap, &is_pmem);
         if (mem_ptr == NULL) {
-               log_err("fio: pmem_map_file(%s) failed\n", filename);
+               log_err("fio: pmem_map_file(%s) failed\n", f->file_name);
                 /* pmem_map_file() sets errno on failure */
                 td_verror(td, errno, "pmem_map_file");
                 return NULL;
@@ -142,7 +148,7 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
         /* pmem is expected */
         if (!is_pmem) {
                 log_err("fio: %s is not located in persistent memory\n",
-                       filename);
+                       f->file_name);
                 goto err_unmap;
         }
  
@@ -150,12 +156,12 @@ char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
         if (size_mmap < ws_offset + size) {
                 log_err(
                         "fio: %s is too small to handle so many threads (%zu < %zu)\n",
-                       filename, size_mmap, ws_offset + size);
+                       f->file_name, size_mmap, ws_offset + size);
                 goto err_unmap;
         }
  
         log_info("fio: size of memory mapped from the file %s: %zu\n",
-               filename, size_mmap);
+               f->file_name, size_mmap);
  
         mem->mem_ptr = mem_ptr;
         mem->size_mmap = size_mmap;
@@ -893,6 +899,7 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
         size_t mem_size = td->o.size;
         size_t mr_desc_size;
         void *ws_ptr;
+       bool is_dram;
         int usage_mem_type;
         int ret;
  
@@ -910,14 +917,14 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
                 return -1;
         }
  
-       if (strcmp(f->file_name, "malloc") == 0) {
+       is_dram = !strcmp(f->file_name, "malloc");
+       if (is_dram) {
                 /* allocation from DRAM using posix_memalign() */
                 ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
                 usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
         } else {
                 /* allocation from PMEM using pmem_map_file() */
-               ws_ptr = librpma_fio_allocate_pmem(td, f->file_name,
-                               mem_size, &csd->mem);
+               ws_ptr = librpma_fio_allocate_pmem(td, f, mem_size, &csd->mem);
                 usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
         }
  
@@ -934,6 +941,21 @@ int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
                 goto err_free;
         }
  
+       if (!is_dram && f->filetype == FIO_TYPE_FILE) {
+               ret = rpma_mr_advise(mr, 0, mem_size,
+                               IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+                               IBV_ADVISE_MR_FLAG_FLUSH);
+               if (ret) {
+                       librpma_td_verror(td, ret, "rpma_mr_advise");
+                       /* an invalid argument is an error */
+                       if (ret == RPMA_E_INVAL)
+                               goto err_mr_dereg;
+
+                       /* log_err used instead of log_info to avoid corruption of the JSON output */
+                       log_err("Note: having rpma_mr_advise(3) failed because of RPMA_E_NOSUPP or RPMA_E_PROVIDER may come with a performance penalty, but it is not a blocker for running the benchmark.\n");
+               }
+       }
+
         /* get size of the memory region's descriptor */
         if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
                 librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h

index fb89d99d696cd26be4061ec30865782051ac16ae..2c507e9c5c1c74290a61d10ba04b30d1e3a01d62 100644 (file)
--- a/engines/librpma_fio.h
+++ b/engines/librpma_fio.h
@@ -77,7 +77,7 @@ struct librpma_fio_mem {
  char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
                 struct librpma_fio_mem *mem);
  
-char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
                 size_t size, struct librpma_fio_mem *mem);
  
  void librpma_fio_free(struct librpma_fio_mem *mem);
diff --git a/engines/sg.c b/engines/sg.c

index 1c0193840df3cacbf276be48542f5cd5af7b80ad..72ee07ba4592f715618e935bf57712246bce8ac0 100644 (file)
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -66,8 +66,13 @@
  
  enum {
         FIO_SG_WRITE            = 1,
-       FIO_SG_WRITE_VERIFY     = 2,
-       FIO_SG_WRITE_SAME       = 3
+       FIO_SG_WRITE_VERIFY,
+       FIO_SG_WRITE_SAME,
+       FIO_SG_WRITE_SAME_NDOB,
+       FIO_SG_WRITE_STREAM,
+       FIO_SG_VERIFY_BYTCHK_00,
+       FIO_SG_VERIFY_BYTCHK_01,
+       FIO_SG_VERIFY_BYTCHK_11,
  };
  
  struct sg_options {
@@ -76,6 +81,7 @@ struct sg_options {
         unsigned int readfua;
         unsigned int writefua;
         unsigned int write_mode;
+       uint16_t stream_id;
  };
  
  static struct fio_option options[] = {
@@ -120,18 +126,58 @@ static struct fio_option options[] = {
                             .oval = FIO_SG_WRITE,
                             .help = "Issue standard SCSI WRITE commands",
                           },
-                         { .ival = "verify",
+                         { .ival = "write_and_verify",
                             .oval = FIO_SG_WRITE_VERIFY,
                             .help = "Issue SCSI WRITE AND VERIFY commands",
                           },
-                         { .ival = "same",
+                         { .ival = "verify",
+                           .oval = FIO_SG_WRITE_VERIFY,
+                           .help = "Issue SCSI WRITE AND VERIFY commands. This "
+                                   "option is deprecated. Use write_and_verify instead.",
+                         },
+                         { .ival = "write_same",
                             .oval = FIO_SG_WRITE_SAME,
                             .help = "Issue SCSI WRITE SAME commands",
                           },
+                         { .ival = "same",
+                           .oval = FIO_SG_WRITE_SAME,
+                           .help = "Issue SCSI WRITE SAME commands. This "
+                                   "option is deprecated. Use write_same instead.",
+                         },
+                         { .ival = "write_same_ndob",
+                           .oval = FIO_SG_WRITE_SAME_NDOB,
+                           .help = "Issue SCSI WRITE SAME(16) commands with NDOB flag set",
+                         },
+                         { .ival = "verify_bytchk_00",
+                           .oval = FIO_SG_VERIFY_BYTCHK_00,
+                           .help = "Issue SCSI VERIFY commands with BYTCHK set to 00",
+                         },
+                         { .ival = "verify_bytchk_01",
+                           .oval = FIO_SG_VERIFY_BYTCHK_01,
+                           .help = "Issue SCSI VERIFY commands with BYTCHK set to 01",
+                         },
+                         { .ival = "verify_bytchk_11",
+                           .oval = FIO_SG_VERIFY_BYTCHK_11,
+                           .help = "Issue SCSI VERIFY commands with BYTCHK set to 11",
+                         },
+                         { .ival = "write_stream",
+                           .oval = FIO_SG_WRITE_STREAM,
+                           .help = "Issue SCSI WRITE STREAM(16) commands",
+                         },
                 },
                 .category = FIO_OPT_C_ENGINE,
                 .group  = FIO_OPT_G_SG,
         },
+       {
+               .name   = "stream_id",
+               .lname  = "stream id for WRITE STREAM(16) commands",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct sg_options, stream_id),
+               .help   = "Stream ID for WRITE STREAM(16) commands",
+               .def    = "0",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_SG,
+       },
         {
                 .name   = NULL,
         },
@@ -171,6 +217,11 @@ struct sgio_data {
  #endif
  };
  
+static inline uint16_t sgio_get_be16(uint8_t *buf)
+{
+       return be16_to_cpu(*((uint16_t *) buf));
+}
+
  static inline uint32_t sgio_get_be32(uint8_t *buf)
  {
         return be32_to_cpu(*((uint32_t *) buf));
@@ -502,9 +553,9 @@ static enum fio_q_status fio_sgio_doio(struct thread_data *td,
  }
  
  static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba,
-                           unsigned long long nr_blocks)
+                           unsigned long long nr_blocks, bool override16)
  {
-       if (lba < MAX_10B_LBA) {
+       if (lba < MAX_10B_LBA && !override16) {
                 sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]);
                 sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]);
         } else {
@@ -545,7 +596,7 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
                 if (o->readfua)
                         hdr->cmdp[1] |= 0x08;
  
-               fio_sgio_rw_lba(hdr, lba, nr_blocks);
+               fio_sgio_rw_lba(hdr, lba, nr_blocks, false);
  
         } else if (io_u->ddir == DDIR_WRITE) {
                 sgio_hdr_init(sd, hdr, io_u, 1);
@@ -576,9 +627,46 @@ static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
                         else
                                 hdr->cmdp[0] = 0x93; // write same(16)
                         break;
+               case FIO_SG_WRITE_SAME_NDOB:
+                       hdr->cmdp[0] = 0x93; // write same(16)
+                       hdr->cmdp[1] |= 0x1; // no data output buffer
+                       hdr->dxfer_len = 0;
+                       break;
+               case FIO_SG_WRITE_STREAM:
+                       hdr->cmdp[0] = 0x9a; // write stream (16)
+                       if (o->writefua)
+                               hdr->cmdp[1] |= 0x08;
+                       sgio_set_be64(lba, &hdr->cmdp[2]);
+                       sgio_set_be16((uint16_t) io_u->file->engine_pos, &hdr->cmdp[10]);
+                       sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[12]);
+                       break;
+               case FIO_SG_VERIFY_BYTCHK_00:
+                       if (lba < MAX_10B_LBA)
+                               hdr->cmdp[0] = 0x2f; // VERIFY(10)
+                       else
+                               hdr->cmdp[0] = 0x8f; // VERIFY(16)
+                       hdr->dxfer_len = 0;
+                       break;
+               case FIO_SG_VERIFY_BYTCHK_01:
+                       if (lba < MAX_10B_LBA)
+                               hdr->cmdp[0] = 0x2f; // VERIFY(10)
+                       else
+                               hdr->cmdp[0] = 0x8f; // VERIFY(16)
+                       hdr->cmdp[1] |= 0x02;           // BYTCHK = 01b
+                       break;
+               case FIO_SG_VERIFY_BYTCHK_11:
+                       if (lba < MAX_10B_LBA)
+                               hdr->cmdp[0] = 0x2f; // VERIFY(10)
+                       else
+                               hdr->cmdp[0] = 0x8f; // VERIFY(16)
+                       hdr->cmdp[1] |= 0x06;           // BYTCHK = 11b
+                       hdr->dxfer_len = sd->bs;
+                       break;
                 };
  
-               fio_sgio_rw_lba(hdr, lba, nr_blocks);
+               if (o->write_mode != FIO_SG_WRITE_STREAM)
+                       fio_sgio_rw_lba(hdr, lba, nr_blocks,
+                               o->write_mode == FIO_SG_WRITE_SAME_NDOB);
  
         } else if (io_u->ddir == DDIR_TRIM) {
                 struct sgio_trim *st;
@@ -970,9 +1058,60 @@ static int fio_sgio_type_check(struct thread_data *td, struct fio_file *f)
         return 0;
  }
  
+static int fio_sgio_stream_control(struct fio_file *f, bool open_stream, uint16_t *stream_id)
+{
+       struct sg_io_hdr hdr;
+       unsigned char cmd[16];
+       unsigned char sb[64];
+       unsigned char buf[8];
+       int ret;
+
+       memset(&hdr, 0, sizeof(hdr));
+       memset(cmd, 0, sizeof(cmd));
+       memset(sb, 0, sizeof(sb));
+       memset(buf, 0, sizeof(buf));
+
+       hdr.interface_id = 'S';
+       hdr.cmdp = cmd;
+       hdr.cmd_len = 16;
+       hdr.sbp = sb;
+       hdr.mx_sb_len = sizeof(sb);
+       hdr.timeout = SCSI_TIMEOUT_MS;
+       hdr.cmdp[0] = 0x9e;
+       hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+       hdr.dxferp = buf;
+       hdr.dxfer_len = sizeof(buf);
+       sgio_set_be32(sizeof(buf), &hdr.cmdp[10]);
+
+       if (open_stream)
+               hdr.cmdp[1] = 0x34;
+       else {
+               hdr.cmdp[1] = 0x54;
+               sgio_set_be16(*stream_id, &hdr.cmdp[4]);
+       }
+
+       ret = ioctl(f->fd, SG_IO, &hdr);
+
+       if (ret < 0)
+               return ret;
+
+       if (hdr.info & SG_INFO_CHECK)
+               return 1;
+
+       if (open_stream) {
+               *stream_id = sgio_get_be16(&buf[4]);
+               dprint(FD_FILE, "sgio_stream_control: opened stream %u\n", (unsigned int) *stream_id);
+               assert(*stream_id != 0);
+       } else
+               dprint(FD_FILE, "sgio_stream_control: closed stream %u\n", (unsigned int) *stream_id);
+
+       return 0;
+}
+
  static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
  {
         struct sgio_data *sd = td->io_ops_data;
+       struct sg_options *o = td->eo;
         int ret;
  
         ret = generic_open_file(td, f);
@@ -984,9 +1123,33 @@ static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
                 return ret;
         }
  
+       if (o->write_mode == FIO_SG_WRITE_STREAM) {
+               if (o->stream_id)
+                       f->engine_pos = o->stream_id;
+               else {
+                       ret = fio_sgio_stream_control(f, true, (uint16_t *) &f->engine_pos);
+                       if (ret)
+                               return ret;
+               }
+       }
+
         return 0;
  }
  
+int fio_sgio_close(struct thread_data *td, struct fio_file *f)
+{
+       struct sg_options *o = td->eo;
+       int ret;
+
+       if (!o->stream_id && o->write_mode == FIO_SG_WRITE_STREAM) {
+               ret = fio_sgio_stream_control(f, false, (uint16_t *) &f->engine_pos);
+               if (ret)
+                       return ret;
+       }
+
+       return generic_close_file(td, f);
+}
+
  /*
   * Build an error string with details about the driver, host or scsi
   * error contained in the sg header Caller will use as necessary.
@@ -1261,7 +1424,7 @@ static struct ioengine_ops ioengine = {
         .event          = fio_sgio_event,
         .cleanup        = fio_sgio_cleanup,
         .open_file      = fio_sgio_open,
-       .close_file     = generic_close_file,
+       .close_file     = fio_sgio_close,
         .get_file_size  = fio_sgio_get_file_size,
         .flags          = FIO_SYNCIO | FIO_RAWIO,
         .options        = options,
diff --git a/engines/windowsaio.c b/engines/windowsaio.c

index 9868e816adb68b8e196dc42d1832b8869e36fec3..d82c80536145409356739c3a5ea94d518259f141 100644 (file)
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -11,6 +11,7 @@
  #include <errno.h>
  
  #include "../fio.h"
+#include "../optgroup.h"
  
  typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped);
  
@@ -35,6 +36,26 @@ struct thread_ctx {
         struct windowsaio_data *wd;
  };
  
+struct windowsaio_options {
+       struct thread_data *td;
+       unsigned int no_completion_thread;
+};
+
+static struct fio_option options[] = {
+       {
+               .name   = "no_completion_thread",
+               .lname  = "No completion polling thread",
+               .type   = FIO_OPT_STR_SET,
+               .off1   = offsetof(struct windowsaio_options, no_completion_thread),
+               .help   = "Use to avoid separate completion polling thread",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_WINDOWSAIO,
+       },
+       {
+               .name   = NULL,
+       },
+};
+
  static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter);
  
  static int fio_windowsaio_init(struct thread_data *td)
@@ -80,6 +101,7 @@ static int fio_windowsaio_init(struct thread_data *td)
                 struct thread_ctx *ctx;
                 struct windowsaio_data *wd;
                 HANDLE hFile;
+               struct windowsaio_options *o = td->eo;
  
                 hFile = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
                 if (hFile == INVALID_HANDLE_VALUE) {
@@ -91,29 +113,30 @@ static int fio_windowsaio_init(struct thread_data *td)
                 wd->iothread_running = TRUE;
                 wd->iocp = hFile;
  
-               if (!rc)
-                       ctx = malloc(sizeof(struct thread_ctx));
+               if (o->no_completion_thread == 0) {
+                       if (!rc)
+                               ctx = malloc(sizeof(struct thread_ctx));
  
-               if (!rc && ctx == NULL) {
-                       log_err("windowsaio: failed to allocate memory for thread context structure\n");
-                       CloseHandle(hFile);
-                       rc = 1;
-               }
+                       if (!rc && ctx == NULL) {
+                               log_err("windowsaio: failed to allocate memory for thread context structure\n");
+                               CloseHandle(hFile);
+                               rc = 1;
+                       }
  
-               if (!rc) {
-                       DWORD threadid;
+                       if (!rc) {
+                               DWORD threadid;
  
-                       ctx->iocp = hFile;
-                       ctx->wd = wd;
-                       wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
-                       if (!wd->iothread)
-                               log_err("windowsaio: failed to create io completion thread\n");
-                       else if (fio_option_is_set(&td->o, cpumask))
-                               fio_setaffinity(threadid, td->o.cpumask);
+                               ctx->iocp = hFile;
+                               ctx->wd = wd;
+                               wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
+                               if (!wd->iothread)
+                                       log_err("windowsaio: failed to create io completion thread\n");
+                               else if (fio_option_is_set(&td->o, cpumask))
+                                       fio_setaffinity(threadid, td->o.cpumask);
+                       }
+                       if (rc || wd->iothread == NULL)
+                               rc = 1;
                 }
-
-               if (rc || wd->iothread == NULL)
-                       rc = 1;
         }
  
         return rc;
@@ -302,9 +325,63 @@ static struct io_u* fio_windowsaio_event(struct thread_data *td, int event)
         return wd->aio_events[event];
  }
  
-static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-                                   unsigned int max,
-                                   const struct timespec *t)
+/* dequeue completion entries directly (no separate completion thread) */
+static int fio_windowsaio_getevents_nothread(struct thread_data *td, unsigned int min,
+                                   unsigned int max, const struct timespec *t)
+{
+       struct windowsaio_data *wd = td->io_ops_data;
+       unsigned int dequeued = 0;
+       struct io_u *io_u;
+       DWORD start_count = 0;
+       DWORD end_count = 0;
+       DWORD mswait = 250;
+       struct fio_overlapped *fov;
+
+       if (t != NULL) {
+               mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000);
+               start_count = GetTickCount();
+               end_count = start_count + (t->tv_sec * 1000) + (t->tv_nsec / 1000000);
+       }
+
+       do {
+               BOOL ret;
+               OVERLAPPED *ovl;
+
+               ULONG entries = min(16, max-dequeued);
+               OVERLAPPED_ENTRY oe[16];
+               ret = GetQueuedCompletionStatusEx(wd->iocp, oe, 16, &entries, mswait, 0);
+               if (ret && entries) {
+                       int entry_num;
+
+                       for (entry_num=0; entry_num<entries; entry_num++) {
+                               ovl = oe[entry_num].lpOverlapped;
+                               fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o);
+                               io_u = fov->io_u;
+
+                               if (ovl->Internal == ERROR_SUCCESS) {
+                                       io_u->resid = io_u->xfer_buflen - ovl->InternalHigh;
+                                       io_u->error = 0;
+                               } else {
+                                       io_u->resid = io_u->xfer_buflen;
+                                       io_u->error = win_to_posix_error(GetLastError());
+                               }
+
+                               fov->io_complete = FALSE;
+                               wd->aio_events[dequeued] = io_u;
+                               dequeued++;
+                       }
+               }
+
+               if (dequeued >= min ||
+                       (t != NULL && timeout_expired(start_count, end_count)))
+                       break;
+       } while (1);
+       return dequeued;
+}
+
+/* dequeue completion entries created by separate IoCompletionRoutine thread */
+static int fio_windowaio_getevents_thread(struct thread_data *td, unsigned int min,
+                                   unsigned int max, const struct timespec *t)
  {
         struct windowsaio_data *wd = td->io_ops_data;
         unsigned int dequeued = 0;
@@ -334,7 +411,6 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
                                 wd->aio_events[dequeued] = io_u;
                                 dequeued++;
                         }
-
                 }
                 if (dequeued >= min)
                         break;
@@ -353,6 +429,16 @@ static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
         return dequeued;
  }
  
+static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
+                                   unsigned int max, const struct timespec *t)
+{
+       struct windowsaio_options *o = td->eo;
+
+       if (o->no_completion_thread)
+               return fio_windowsaio_getevents_nothread(td, min, max, t);
+       return fio_windowaio_getevents_thread(td, min, max, t);
+}
+
  static enum fio_q_status fio_windowsaio_queue(struct thread_data *td,
                                               struct io_u *io_u)
  {
@@ -484,6 +570,8 @@ static struct ioengine_ops ioengine = {
         .get_file_size  = generic_get_file_size,
         .io_u_init      = fio_windowsaio_io_u_init,
         .io_u_free      = fio_windowsaio_io_u_free,
+       .options        = options,
+       .option_struct_size     = sizeof(struct windowsaio_options),
  };
  
  static void fio_init fio_windowsaio_register(void)
diff --git a/examples/cmdprio-bssplit.fio b/examples/cmdprio-bssplit.fio

index 47e9a790605e388e341ac069e692057346127a93..f3b2fac02d1eba5706d98952721cbabbd8e44b66 100644 (file)
--- a/examples/cmdprio-bssplit.fio
+++ b/examples/cmdprio-bssplit.fio
@@ -1,17 +1,44 @@
  ; Randomly read/write a block device file at queue depth 16.
-; 40 % of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
-; 100% of the 64kB reads are executed at the highest priority and
-; all other IOs executed without a priority set.
  [global]
  filename=/dev/sda
  direct=1
  write_lat_log=prio-run.log
  log_prio=1
-
-[randrw]
  rw=randrw
-bssplit=64k/40:1024k/60,1024k/100
  ioengine=libaio
  iodepth=16
+
+; Simple cmdprio_bssplit format. All non-zero percentage entries will
+; use the same prio class and prio level defined by the cmdprio_class
+; and cmdprio options.
+[cmdprio]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 100% of the 64kB reads are executed with prio class 1 and prio level 0.
+; All other IOs are executed without a priority set.
+bssplit=64k/40:1024k/60,1024k/100
  cmdprio_bssplit=64k/100:1024k/0,1024k/0
  cmdprio_class=1
+cmdprio=0
+
+; Advanced cmdprio_bssplit format. Each non-zero percentage entry can
+; use a different prio class and prio level (appended to each entry).
+[cmdprio-adv]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other IOs are executed without a priority set.
+stonewall
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
+
+; Identical to the previous example, but with a default priority defined.
+[cmdprio-adv-def]
+; 40% of read IOs are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other IOs are executed with prio class 2 and prio level 7.
+stonewall
+prioclass=2
+prio=7
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
diff --git a/examples/sg_verify-fail.fio b/examples/sg_verify-fail.fio

new file mode 100644 (file)

index 0000000..64feece
--- /dev/null
+++ b/examples/sg_verify-fail.fio
@@ -0,0 +1,48 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# The jobs with fail in the name should produce errors
+#
+# job                  description
+# precon               precondition the device by writing with a known
+#                      pattern
+# verify01             verify each block one at a time by comparing to known
+#                      pattern
+# verify01-fail                verifying one too many blocks should produce a failure
+# verify11-one_ios     verify all 20 blocks by sending only 512 bytes
+# verify11-fail                verifying beyond the preconditioned region should
+#                      produce a failure
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+number_ios=20
+
+[verify01-fail]
+sg_write_mode=verify_bytchk_01
+number_ios=21
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10240
+
+[verify11-fail]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10752
diff --git a/examples/sg_verify.fio b/examples/sg_verify.fio

new file mode 100644 (file)

index 0000000..6db0dd0
--- /dev/null
+++ b/examples/sg_verify.fio
@@ -0,0 +1,57 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# All of the jobs below should complete without error
+#
+# job                  description
+# precon               precondition the device by writing with a known
+#                      pattern
+# verify00             verify written data on medium only
+# verify01             verify each block one at a time by comparing to known
+#                      pattern
+# verify01-two_ios     verify same data but with only two VERIFY operations
+# verify11             verify each block one at a time
+# verify11-five_ios    verify data with five IOs, four blocks at a time,
+#                      sending 512 bytes for each IO
+# verify11-one_ios     verify all 20 blocks by sending only 512 bytes
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify00]
+sg_write_mode=verify_bytchk_00
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+
+[verify01-two_ios]
+sg_write_mode=verify_bytchk_01
+bs=5120
+number_ios=2
+
+[verify11]
+sg_write_mode=verify_bytchk_11
+
+[verify11-five_ios]
+sg_write_mode=verify_bytchk_11
+bs=2048
+number_ios=5
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+bs=10240
+number_ios=1
diff --git a/examples/sg_write_same_ndob.fio b/examples/sg_write_same_ndob.fio

new file mode 100644 (file)

index 0000000..fb04731
--- /dev/null
+++ b/examples/sg_write_same_ndob.fio
@@ -0,0 +1,44 @@
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL  *
+# * DESTROY DATA ON /dev/sdb       *
+# **********************************
+#
+# Test WRITE SAME commands with the NDOB flag set
+# issued via the sg ioengine
+# All of the jobs below should complete without error
+# except the last one
+#
+# job                  description
+# precon               Precondition the device by writing 20 blocks with a
+#                      known pattern
+# write_same_ndob      Write 19 sectors of all zeroes with the NDOB flag set
+# verify-pass          Verify 19 blocks of all zeroes
+# verify-fail          Verify 20 blocks of all zeroes. This should fail.
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+stonewall
+
+[precon]
+number_ios=20
+
+[write_same_ndob]
+sg_write_mode=write_same_ndob
+number_ios=19
+
+[verify-pass]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=19
+
+[verify-fail]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=20
diff --git a/fio.1 b/fio.1

index 74f1a6ea07e9039f76503fc2b88d4c383c3825b6..f32d791594e24ed46d818c407bcd79e1b6e57c18 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -838,9 +838,9 @@ threads/processes.
  Limit on the number of simultaneously opened zones per single thread/process.
  .TP
  .BI ignore_zone_limits \fR=\fPbool
-If this isn't set, fio will query the max open zones limit from the zoned block
-device, and exit if the specified \fBmax_open_zones\fR value is larger than the
-limit reported by the device. Default: false.
+If this option is used, fio will ignore the maximum number of open zones limit
+of the zoned block device in use, thus allowing the option \fBmax_open_zones\fR
+value to be larger than the device reported limit. Default: false.
  .TP
  .BI zone_reset_threshold \fR=\fPfloat
  A number between zero and one that indicates the ratio of logical blocks with
@@ -1122,7 +1122,7 @@ see \fBend_fsync\fR and \fBfsync_on_close\fR.
  .TP
  .BI fdatasync \fR=\fPint
  Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
-not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
+not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
  \fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
  Defaults to 0, which means fio does not periodically issue and wait for a
  data-only sync to complete.
@@ -1686,10 +1686,10 @@ also be set as number of zones using 'z'.
  .TP
  .BI filesize \fR=\fPirange(int)
  Individual file sizes. May be a range, in which case fio will select sizes
-for files at random within the given range and limited to \fBsize\fR in
-total (if that is given). If not given, each created file is the same size.
-This option overrides \fBsize\fR in terms of file size, which means
-this value is used as a fixed size or possible range of each file.
+for files at random within the given range. If not given, each created file
+is the same size. This option overrides \fBsize\fR in terms of file size, 
+i.e. \fBsize\fR becomes merely the default for \fBio_size\fR (and
+has no effect at all if \fBio_size\fR is set explicitly).
  .TP
  .BI file_append \fR=\fPbool
  Perform I/O after the end of the file. Normally fio will operate within the
@@ -1995,10 +1995,34 @@ To get a finer control over I/O priority, this option allows specifying
  the percentage of IOs that must have a priority set depending on the block
  size of the IO. This option is useful only when used together with the option
  \fBbssplit\fR, that is, multiple different block sizes are used for reads and
-writes. The format for this option is the same as the format of the
-\fBbssplit\fR option, with the exception that values for trim IOs are
-ignored. This option is mutually exclusive with the \fBcmdprio_percentage\fR
-option.
+writes.
+.RS
+.P
+The first accepted format for this option is the same as the format of the
+\fBbssplit\fR option:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+In this case, each entry will use the priority class and priority level defined
+by the options \fBcmdprio_class\fR and \fBcmdprio\fR respectively.
+.P
+The second accepted format for this option is:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+.RE
+.P
+In this case, the priority class and priority level are defined inside each
+entry. In comparison with the first accepted format, the second accepted format
+does not restrict all entries to have the same priority class and priority
+level.
+.P
+For both formats, only the read and write data directions are supported, values
+for trim IOs are ignored. This option is mutually exclusive with the
+\fBcmdprio_percentage\fR option.
+.RE
  .TP
  .BI (io_uring)fixedbufs
  If fio is asked to do direct IO, then Linux will map pages for each IO call, and
@@ -2284,7 +2308,7 @@ With writefua option set to 1, write operations include the force
  unit access (fua) flag. Default: 0.
  .TP
  .BI (sg)sg_write_mode \fR=\fPstr
-Specify the type of write commands to issue. This option can take three
+Specify the type of write commands to issue. This option can take multiple
  values:
  .RS
  .RS
@@ -2292,12 +2316,15 @@ values:
  .B write (default)
  Write opcodes are issued as usual
  .TP
+.B write_and_verify
+Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 00b. This directs the
+device to carry out a medium verification with no data comparison for the data
+that was written. The writefua option is ignored with this selection.
+.TP
  .B verify
-Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
-directs the device to carry out a medium verification with no data
-comparison. The writefua option is ignored with this selection.
+This option is deprecated. Use write_and_verify instead.
  .TP
-.B same
+.B write_same
  Issue WRITE SAME commands. This transfers a single block to the device
  and writes this same block of data to a contiguous sequence of LBAs
  beginning at the specified offset. fio's block size parameter
@@ -2308,9 +2335,43 @@ blocksize=8k will write 16 sectors with each command. fio will still
  generate 8k of data for each command butonly the first 512 bytes will
  be used and transferred to the device. The writefua option is ignored
  with this selection.
+.TP
+.B same
+This option is deprecated. Use write_same instead.
+.TP
+.B write_same_ndob
+Issue WRITE SAME(16) commands as above but with the No Data Output
+Buffer (NDOB) bit set. No data will be transferred to the device with
+this bit set. Data written will be a pre-determined pattern such as
+all zeroes.
+.TP
+.B write_stream
+Issue WRITE STREAM(16) commands. Use the stream_id option to specify
+the stream identifier.
+.TP
+.B verify_bytchk_00
+Issue VERIFY commands with BYTCHK set to 00. This directs the device to carry
+out a medium verification with no data comparison.
+.TP
+.B verify_bytchk_01
+Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+compare the data on the device with the data transferred to the device.
+.TP
+.B verify_bytchk_11
+Issue VERIFY commands with BYTCHK set to 11. This transfers a single block to
+the device and compares the contents of this block with the data on the device
+beginning at the specified offset. fio's block size parameter specifies the
+total amount of data compared with this command. However, only one block
+(sector) worth of data is transferred to the device. This is similar to the
+WRITE SAME command except that data is compared instead of written.
  .RE
  .RE
  .TP
+.BI (sg)stream_id \fR=\fPint
+Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+a valid stream identifier) fio will open a stream and then close it when done. Default
+is 0.
+.TP
  .BI (nbd)uri \fR=\fPstr
  Specify the NBD URI of the server to test.
  The string is a standard NBD URI (see
diff --git a/fio.h b/fio.h

index e9cdba94a6091f0fb7e9f72a34e2dd6b317da64b..7b0ca8435978f7eb1216d287584c780941fe4049 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -428,6 +428,8 @@ struct thread_data {
         struct flist_head io_log_list;
         FILE *io_log_rfile;
         unsigned int io_log_blktrace;
+       unsigned int io_log_blktrace_swap;
+       unsigned long long io_log_blktrace_last_ttime;
         unsigned int io_log_current;
         unsigned int io_log_checkmark;
         unsigned int io_log_highmark;
diff --git a/gclient.c b/gclient.c

index e0e0e7bf920cd0bc3933e5c9f62ae024be4437b4..c59bcfe2f6f3c501bfedb0fa0de3140490f7b25f 100644 (file)
--- a/gclient.c
+++ b/gclient.c
@@ -292,7 +292,7 @@ static void gfio_thread_status_op(struct fio_client *client,
         if (sum_stat_clients == 1)
                 return;
  
-       sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+       sum_thread_stats(&client_ts, &p->ts);
         sum_group_stats(&client_gs, &p->rs);
  
         client_ts.members++;
@@ -1155,21 +1155,18 @@ out:
  #define GFIO_CLAT      1
  #define GFIO_SLAT      2
  #define GFIO_LAT       4
-#define GFIO_HILAT     8
-#define GFIO_LOLAT     16
  
  static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
                                   struct group_run_stats *rs,
                                   struct thread_stat *ts, int ddir)
  {
         const char *ddir_label[3] = { "Read", "Write", "Trim" };
-       const char *hilat, *lolat;
         GtkWidget *frame, *label, *box, *vbox, *main_vbox;
-       unsigned long long min[5], max[5];
+       unsigned long long min[3], max[3];
         unsigned long runt;
         unsigned long long bw, iops;
         unsigned int flags = 0;
-       double mean[5], dev[5];
+       double mean[3], dev[3];
         char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p;
         char tmp[128];
         int i2p;
@@ -1268,14 +1265,6 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
                 flags |= GFIO_CLAT;
         if (calc_lat(&ts->lat_stat[ddir], &min[2], &max[2], &mean[2], &dev[2]))
                 flags |= GFIO_LAT;
-       if (calc_lat(&ts->clat_high_prio_stat[ddir], &min[3], &max[3], &mean[3], &dev[3])) {
-               flags |= GFIO_HILAT;
-               if (calc_lat(&ts->clat_low_prio_stat[ddir], &min[4], &max[4], &mean[4], &dev[4]))
-                       flags |= GFIO_LOLAT;
-               /* we only want to print low priority statistics if other IOs were
-                * submitted with the priority bit set
-                */
-       }
  
         if (flags) {
                 frame = gtk_frame_new("Latency");
@@ -1284,24 +1273,12 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
                 vbox = gtk_vbox_new(FALSE, 3);
                 gtk_container_add(GTK_CONTAINER(frame), vbox);
  
-               if (ts->lat_percentiles) {
-                       hilat = "High priority total latency";
-                       lolat = "Low priority total latency";
-               } else {
-                       hilat = "High priority completion latency";
-                       lolat = "Low priority completion latency";
-               }
-
                 if (flags & GFIO_SLAT)
                         gfio_show_lat(vbox, "Submission latency", min[0], max[0], mean[0], dev[0]);
                 if (flags & GFIO_CLAT)
                         gfio_show_lat(vbox, "Completion latency", min[1], max[1], mean[1], dev[1]);
                 if (flags & GFIO_LAT)
                         gfio_show_lat(vbox, "Total latency", min[2], max[2], mean[2], dev[2]);
-               if (flags & GFIO_HILAT)
-                       gfio_show_lat(vbox, hilat, min[3], max[3], mean[3], dev[3]);
-               if (flags & GFIO_LOLAT)
-                       gfio_show_lat(vbox, lolat, min[4], max[4], mean[4], dev[4]);
         }
  
         if (ts->slat_percentiles && flags & GFIO_SLAT)
@@ -1309,40 +1286,16 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
                                 ts->io_u_plat[FIO_SLAT][ddir],
                                 ts->slat_stat[ddir].samples,
                                 "Submission");
-       if (ts->clat_percentiles && flags & GFIO_CLAT) {
+       if (ts->clat_percentiles && flags & GFIO_CLAT)
                 gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
                                 ts->io_u_plat[FIO_CLAT][ddir],
                                 ts->clat_stat[ddir].samples,
                                 "Completion");
-               if (!ts->lat_percentiles) {
-                       if (flags & GFIO_HILAT)
-                               gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-                                               ts->io_u_plat_high_prio[ddir],
-                                               ts->clat_high_prio_stat[ddir].samples,
-                                               "High priority completion");
-                       if (flags & GFIO_LOLAT)
-                               gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-                                               ts->io_u_plat_low_prio[ddir],
-                                               ts->clat_low_prio_stat[ddir].samples,
-                                               "Low priority completion");
-               }
-       }
-       if (ts->lat_percentiles && flags & GFIO_LAT) {
+       if (ts->lat_percentiles && flags & GFIO_LAT)
                 gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
                                 ts->io_u_plat[FIO_LAT][ddir],
                                 ts->lat_stat[ddir].samples,
                                 "Total");
-               if (flags & GFIO_HILAT)
-                       gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-                                       ts->io_u_plat_high_prio[ddir],
-                                       ts->clat_high_prio_stat[ddir].samples,
-                                       "High priority total");
-               if (flags & GFIO_LOLAT)
-                       gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
-                                       ts->io_u_plat_low_prio[ddir],
-                                       ts->clat_low_prio_stat[ddir].samples,
-                                       "Low priority total");
-       }
  
         free(io_p);
         free(bw_p);
diff --git a/init.c b/init.c

index 5f069d9a5b4af0fb4e4cb2fe67861a25d020fbff..139351527bebb2abe3037f2f0b09391ef6f36b84 100644 (file)
--- a/init.c
+++ b/init.c
@@ -224,6 +224,13 @@ static struct option l_opts[FIO_NR_OPTIONS] = {
                 .has_arg        = optional_argument,
                 .val            = 'S',
         },
+#ifdef WIN32
+       {
+               .name           = (char *) "server-internal",
+               .has_arg        = required_argument,
+               .val            = 'N',
+       },
+#endif
         {       .name           = (char *) "daemonize",
                 .has_arg        = required_argument,
                 .val            = 'D',
@@ -1445,6 +1452,26 @@ static bool wait_for_ok(const char *jobname, struct thread_options *o)
         return true;
  }
  
+/*
+ * Compare td's per-group options against every job sharing its groupid.
+ *
+ * Returns 1, after logging an error that names jobname, when
+ * lat_percentiles differs from another job in the group; returns 0
+ * otherwise. Nothing is compared when td->o.stats is not set.
+ */
+static int verify_per_group_options(struct thread_data *td, const char *jobname)
+{
+       struct thread_data *other;
+       int idx;
+
+       if (!td->o.stats)
+               return 0;
+
+       for_each_td(other, idx) {
+               if (other->groupid == td->groupid &&
+                   other->o.lat_percentiles != td->o.lat_percentiles) {
+                       log_err("fio: lat_percentiles in job: %s differs from group\n",
+                               jobname);
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
  /*
   * Treat an empty log file name the same as one not given
   */
@@ -1563,6 +1590,10 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
         td->groupid = groupid;
         prev_group_jobs++;
  
+       if (td->o.group_reporting && prev_group_jobs > 1 &&
+           verify_per_group_options(td, jobname))
+               goto err;
+
         if (setup_rate(td))
                 goto err;
  
@@ -1586,17 +1617,23 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                 else
                         suf = "log";
  
-               gen_log_name(logname, sizeof(logname), "lat", pre,
-                               td->thread_number, suf, o->per_job_logs);
-               setup_log(&td->lat_log, &p, logname);
+               if (!o->disable_lat) {
+                       gen_log_name(logname, sizeof(logname), "lat", pre,
+                                    td->thread_number, suf, o->per_job_logs);
+                       setup_log(&td->lat_log, &p, logname);
+               }
  
-               gen_log_name(logname, sizeof(logname), "slat", pre,
-                               td->thread_number, suf, o->per_job_logs);
-               setup_log(&td->slat_log, &p, logname);
+               if (!o->disable_slat) {
+                       gen_log_name(logname, sizeof(logname), "slat", pre,
+                                    td->thread_number, suf, o->per_job_logs);
+                       setup_log(&td->slat_log, &p, logname);
+               }
  
-               gen_log_name(logname, sizeof(logname), "clat", pre,
-                               td->thread_number, suf, o->per_job_logs);
-               setup_log(&td->clat_log, &p, logname);
+               if (!o->disable_clat) {
+                       gen_log_name(logname, sizeof(logname), "clat", pre,
+                                    td->thread_number, suf, o->per_job_logs);
+                       setup_log(&td->clat_log, &p, logname);
+               }
  
         }
  
@@ -2789,6 +2826,12 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
                         exit_val = 1;
  #endif
                         break;
+#ifdef WIN32
+               case 'N':
+                       did_arg = true;
+                       fio_server_internal_set(optarg);
+                       break;
+#endif
                 case 'D':
                         if (pid_file)
                                 free(pid_file);
diff --git a/io_u.c b/io_u.c

index 3c72d63d0d5368db1ecae9158371f99efb9a27e0..059637e592d2b8f6d9b5bff8617be961ca919c0f 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1595,7 +1595,7 @@ again:
                 assert(io_u->flags & IO_U_F_FREE);
                 io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
                                  IO_U_F_TRIMMED | IO_U_F_BARRIER |
-                                IO_U_F_VER_LIST | IO_U_F_HIGH_PRIO);
+                                IO_U_F_VER_LIST);
  
                 io_u->error = 0;
                 io_u->acct_ddir = -1;
@@ -1803,6 +1803,7 @@ struct io_u *get_io_u(struct thread_data *td)
          * Remember the issuing context priority. The IO engine may change this.
          */
         io_u->ioprio = td->ioprio;
+       io_u->clat_prio_index = 0;
  out:
         assert(io_u->file);
         if (!td_io_prep(td, io_u)) {
@@ -1889,7 +1890,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
  
                 tnsec = ntime_since(&io_u->start_time, &icd->time);
                 add_lat_sample(td, idx, tnsec, bytes, io_u->offset,
-                              io_u->ioprio, io_u_is_high_prio(io_u));
+                              io_u->ioprio, io_u->clat_prio_index);
  
                 if (td->flags & TD_F_PROFILE_OPS) {
                         struct prof_io_ops *ops = &td->prof_io_ops;
@@ -1911,7 +1912,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
         if (ddir_rw(idx)) {
                 if (!td->o.disable_clat) {
                         add_clat_sample(td, idx, llnsec, bytes, io_u->offset,
-                                       io_u->ioprio, io_u_is_high_prio(io_u));
+                                       io_u->ioprio, io_u->clat_prio_index);
                         io_u_mark_latency(td, llnsec);
                 }
  
diff --git a/io_u.h b/io_u.h

index bdbac52577afeff573e6895621dccc903acc1d0a..206e24fee09ec955a05220203576a6e9c03220d3 100644 (file)
--- a/io_u.h
+++ b/io_u.h
@@ -21,7 +21,6 @@ enum {
         IO_U_F_TRIMMED          = 1 << 5,
         IO_U_F_BARRIER          = 1 << 6,
         IO_U_F_VER_LIST         = 1 << 7,
-       IO_U_F_HIGH_PRIO        = 1 << 8,
  };
  
  /*
@@ -50,6 +49,7 @@ struct io_u {
          * IO priority.
          */
         unsigned short ioprio;
+       unsigned short clat_prio_index;
  
         /*
          * Allocated/set buffer and length
@@ -193,6 +193,5 @@ static inline enum fio_ddir acct_ddir(struct io_u *io_u)
         td_flags_clear((td), &(io_u->flags), (val))
  #define io_u_set(td, io_u, val)                \
         td_flags_set((td), &(io_u)->flags, (val))
-#define io_u_is_high_prio(io_u)        (io_u->flags & IO_U_F_HIGH_PRIO)
  
  #endif
diff --git a/iolog.c b/iolog.c

index 1aeb7a76b2b6ca21ab8bd3d5c18e5ec200669fcb..a2cf0c1ccd6083ebf2b3c0d11686713238ad2983 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -152,10 +152,15 @@ int read_iolog_get(struct thread_data *td, struct io_u *io_u)
         while (!flist_empty(&td->io_log_list)) {
                 int ret;
  
-               if (!td->io_log_blktrace && td->o.read_iolog_chunked) {
+               if (td->o.read_iolog_chunked) {
                         if (td->io_log_checkmark == td->io_log_current) {
-                               if (!read_iolog2(td))
-                                       return 1;
+                               if (td->io_log_blktrace) {
+                                       if (!read_blktrace(td))
+                                               return 1;
+                               } else {
+                                       if (!read_iolog2(td))
+                                               return 1;
+                               }
                         }
                         td->io_log_current--;
                 }
@@ -355,7 +360,7 @@ void write_iolog_close(struct thread_data *td)
         td->iolog_buf = NULL;
  }
  
-static int64_t iolog_items_to_fetch(struct thread_data *td)
+int64_t iolog_items_to_fetch(struct thread_data *td)
  {
         struct timespec now;
         uint64_t elapsed;
@@ -626,8 +631,6 @@ static bool init_iolog_read(struct thread_data *td, char *fname)
         } else
                 f = fopen(fname, "r");
  
-       free(fname);
-
         if (!f) {
                 perror("fopen read iolog");
                 return false;
@@ -709,11 +712,12 @@ bool init_iolog(struct thread_data *td)
                  */
                 if (is_blktrace(fname, &need_swap)) {
                         td->io_log_blktrace = 1;
-                       ret = load_blktrace(td, fname, need_swap);
+                       ret = init_blktrace_read(td, fname, need_swap);
                 } else {
                         td->io_log_blktrace = 0;
                         ret = init_iolog_read(td, fname);
                 }
+               free(fname);
         } else if (td->o.write_iolog_file)
                 ret = init_iolog_write(td);
         else
diff --git a/iolog.h b/iolog.h

index 7d66b7c42fb6e4c0d455165d0ebf56467a7b7e0b..a39863095ad3f8264e055b40c448031c9507cc9c 100644 (file)
--- a/iolog.h
+++ b/iolog.h
@@ -254,6 +254,7 @@ extern void trim_io_piece(const struct io_u *);
  extern void queue_io_piece(struct thread_data *, struct io_piece *);
  extern void prune_io_piece_log(struct thread_data *);
  extern void write_iolog_close(struct thread_data *);
+int64_t iolog_items_to_fetch(struct thread_data *td);
  extern int iolog_compress_init(struct thread_data *, struct sk_out *);
  extern void iolog_compress_exit(struct thread_data *);
  extern size_t log_chunk_sizes(struct io_log *);
diff --git a/optgroup.h b/optgroup.h

index 1fb84a296b51990f5d909c92d2ff75a52f89178e..3ac8f62a81aa96b10a825ee69dffe0f0d070076a 100644 (file)
--- a/optgroup.h
+++ b/optgroup.h
@@ -71,6 +71,7 @@ enum opt_category_group {
         __FIO_OPT_G_LIBCUFILE,
         __FIO_OPT_G_DFS,
         __FIO_OPT_G_NFS,
+       __FIO_OPT_G_WINDOWSAIO,
  
         FIO_OPT_G_RATE          = (1ULL << __FIO_OPT_G_RATE),
         FIO_OPT_G_ZONE          = (1ULL << __FIO_OPT_G_ZONE),
@@ -116,6 +117,7 @@ enum opt_category_group {
         FIO_OPT_G_FILESTAT      = (1ULL << __FIO_OPT_G_FILESTAT),
         FIO_OPT_G_LIBCUFILE     = (1ULL << __FIO_OPT_G_LIBCUFILE),
         FIO_OPT_G_DFS           = (1ULL << __FIO_OPT_G_DFS),
+       FIO_OPT_G_WINDOWSAIO    = (1ULL << __FIO_OPT_G_WINDOWSAIO),
  };
  
  extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
diff --git a/options.c b/options.c

index 0d7dc0797312aef62f5f79f4f227bfed5dfe6b3c..6cdbd2686cf5e715b8dd76baae0d23d1fca5088a 100644 (file)
--- a/options.c
+++ b/options.c
@@ -278,6 +278,128 @@ static int str_bssplit_cb(void *data, const char *input)
         return ret;
  }
  
+/*
+ * Parse a single cmdprio_bssplit entry from str into entry.
+ *
+ * Accepted forms:
+ *   bs/                  - perc stays 0, prio is -1
+ *   bs/perc              - prio is -1
+ *   bs/perc/class/level  - prio is built with ioprio_value(class, level)
+ *
+ * perc is capped at 100; class and level are capped at
+ * IOPRIO_MAX_PRIO_CLASS and IOPRIO_MAX_PRIO. Returns 0 on success and 1,
+ * after logging, when the entry is malformed.
+ */
+static int parse_cmdprio_bssplit_entry(struct thread_options *o,
+                                      struct split_prio *entry, char *str)
+{
+       unsigned int perc = 0, class, level;
+       char *bs_str = NULL;
+       long long bs_val;
+       int nr_fields, conv_failed;
+
+       nr_fields = sscanf(str, "%m[^/]/%u/%u/%u", &bs_str, &perc, &class, &level);
+       if (nr_fields < 1) {
+               log_err("fio: invalid cmdprio_bssplit format\n");
+               return 1;
+       }
+
+       /* bs_str was allocated by the %m conversion; it is only needed here */
+       conv_failed = str_to_decimal(bs_str, &bs_val, 1, o, 0, 0);
+       free(bs_str);
+       if (conv_failed) {
+               log_err("fio: split conversion failed\n");
+               return 1;
+       }
+
+       entry->bs = bs_val;
+       entry->perc = min(perc, 100u);
+       entry->prio = -1;
+
+       if (nr_fields == 4) {
+               /* bs/perc/class/level */
+               class = min(class, (unsigned int) IOPRIO_MAX_PRIO_CLASS);
+               level = min(level, (unsigned int) IOPRIO_MAX_PRIO);
+               entry->prio = ioprio_value(class, level);
+       } else if (nr_fields > 2) {
+               /* a class given without a level matches none of the formats */
+               log_err("fio: invalid cmdprio_bssplit format\n");
+               return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * qsort() comparator ordering split_prio entries by ascending bs.
+ * The result is negative when p1 sorts before p2, positive when it sorts
+ * after, and zero when both have the same bs.
+ */
+static int fio_split_prio_cmp(const void *p1, const void *p2)
+{
+       const struct split_prio *lhs = p1;
+       const struct split_prio *rhs = p2;
+
+       return (lhs->bs > rhs->bs) - (lhs->bs < rhs->bs);
+}
+
+/*
+ * Parse a ':'-separated list of cmdprio_bssplit entries from str.
+ *
+ * On success *entries receives a calloc()ed array sorted by ascending bs
+ * and *nr_entries its length, and 0 is returned. Entries whose percentage
+ * is zero are dropped. Parsing stops at the first empty token. str is
+ * consumed with strsep() and is therefore modified.
+ *
+ * Returns 1 on allocation failure, when more than BSSPLIT_MAX tokens are
+ * given, or when an entry fails to parse.
+ */
+int split_parse_prio_ddir(struct thread_options *o, struct split_prio **entries,
+                         int *nr_entries, char *str)
+{
+       struct split_prio *list;
+       unsigned int count = 0;
+       char *scratch, *cursor, *tok;
+
+       /* count the tokens on a private copy, since strsep() is destructive */
+       scratch = strdup(str);
+       if (!scratch)
+               return 1;
+
+       cursor = scratch;
+       for (tok = strsep(&cursor, ":"); tok && *tok; tok = strsep(&cursor, ":"))
+               count++;
+       free(scratch);
+
+       if (count > BSSPLIT_MAX) {
+               log_err("fio: too many cmdprio_bssplit entries\n");
+               return 1;
+       }
+
+       list = calloc(count, sizeof(*list));
+       if (!list)
+               return 1;
+
+       count = 0;
+       for (tok = strsep(&str, ":"); tok && *tok; tok = strsep(&str, ":")) {
+               struct split_prio *slot = &list[count];
+
+               if (parse_cmdprio_bssplit_entry(o, slot, tok)) {
+                       log_err("fio: failed to parse cmdprio_bssplit entry\n");
+                       free(list);
+                       return 1;
+               }
+
+               /* skip zero perc entries: the slot is reused by the next token */
+               if (slot->perc)
+                       count++;
+       }
+
+       qsort(list, count, sizeof(*list), fio_split_prio_cmp);
+
+       *entries = list;
+       *nr_entries = count;
+
+       return 0;
+}
+
  static int str2error(char *str)
  {
         const char *err[] = { "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO",
diff --git a/os/os-windows.h b/os/os-windows.h

index 59da9dba1a2c3fbf09200f313a539926e9de762a..510b8143db1d0b538a63a1b8c67d9b79cb39dea1 100644 (file)
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -110,6 +110,8 @@ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp);
  ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
  ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
                 off_t offset);
+HANDLE windows_handle_connection(HANDLE hjob, int sk);
+HANDLE windows_create_job(void);
  
  static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
  {
diff --git a/os/os.h b/os/os.h

index 5965d7b806b055bf8f2c679da818516a503eb6db..810e61668572171a5b201d110238781ac8f939b3 100644 (file)
--- a/os/os.h
+++ b/os/os.h
@@ -119,10 +119,14 @@ extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
  
  #ifndef FIO_HAVE_IOPRIO_CLASS
  #define ioprio_value_is_class_rt(prio) (false)
+#define IOPRIO_MIN_PRIO_CLASS          0
+#define IOPRIO_MAX_PRIO_CLASS          0
  #endif
  #ifndef FIO_HAVE_IOPRIO
  #define ioprio_value(prioclass, prio)  (0)
  #define ioprio_set(which, who, prioclass, prio)        (0)
+#define IOPRIO_MIN_PRIO                        0
+#define IOPRIO_MAX_PRIO                        0
  #endif
  
  #ifndef FIO_HAVE_ODIRECT
diff --git a/os/windows/posix.c b/os/windows/posix.c

index f1df2d76e0abe4ec0891be04fb2632366832512d..0d415e1e0dd0272f7ea797b4e8735812f83522f2 100644 (file)
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -1031,3 +1031,174 @@ in_addr_t inet_network(const char *cp)
         hbo = ((nbo & 0xFF) << 24) + ((nbo & 0xFF00) << 8) + ((nbo & 0xFF0000) >> 8) + ((nbo & 0xFF000000) >> 24);
         return hbo;
  }
+
+/*
+ * Create a duplex, byte-mode, blocking named pipe called pipe_name and
+ * wait for a client to connect to it.
+ *
+ * wait_connect_time is handed to CreateNamedPipe() as the pipe's default
+ * time-out value.
+ *
+ * Returns the connected pipe handle, or INVALID_HANDLE_VALUE on failure.
+ */
+static HANDLE create_named_pipe(char *pipe_name, int wait_connect_time)
+{
+       HANDLE hpipe;
+       DWORD err;
+
+       hpipe = CreateNamedPipe (
+                       pipe_name,
+                       PIPE_ACCESS_DUPLEX,
+                       PIPE_WAIT | PIPE_TYPE_BYTE,
+                       1, 0, 0, wait_connect_time, NULL);
+
+       if (hpipe == INVALID_HANDLE_VALUE) {
+               log_err("CreateNamedPipe failed (%lu).\n", GetLastError());
+               return INVALID_HANDLE_VALUE;
+       }
+
+       if (!ConnectNamedPipe(hpipe, NULL)) {
+               err = GetLastError();
+               /*
+                * A client that opened the pipe between CreateNamedPipe()
+                * and this call makes ConnectNamedPipe() return zero with
+                * ERROR_PIPE_CONNECTED; the connection is still good.
+                */
+               if (err != ERROR_PIPE_CONNECTED) {
+                       log_err("ConnectNamedPipe failed (%lu).\n", err);
+                       CloseHandle(hpipe);
+                       return INVALID_HANDLE_VALUE;
+               }
+       }
+
+       return hpipe;
+}
+
+/*
+ * Start a new process running this process's command line with args
+ * appended.
+ *
+ * When hjob points at a valid job object the child is created suspended,
+ * assigned to that job and then resumed. On success pi holds the new
+ * process and thread handles.
+ *
+ * Returns 0 on success and 1 on failure (the value is not a TRUE/FALSE
+ * style BOOL). On failure no handles are left open in pi.
+ */
+static BOOL windows_create_process(PROCESS_INFORMATION *pi, const char *args, HANDLE *hjob)
+{
+       LPSTR this_cmd_line = GetCommandLine();
+       LPSTR new_process_cmd_line;
+       STARTUPINFO si = {0};
+       DWORD flags = CREATE_NEW_CONSOLE;
+       BOOL use_job = (hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE);
+       BOOL ret = 1;
+
+       memset(pi, 0, sizeof(*pi));
+
+       /* both strings plus the terminating NUL */
+       new_process_cmd_line = malloc(strlen(this_cmd_line) + strlen(args) + 1);
+       if (!new_process_cmd_line) {
+               log_err("fio: out of memory building child command line\n");
+               return 1;
+       }
+
+       strcpy(new_process_cmd_line, this_cmd_line);
+       strcat(new_process_cmd_line, args);
+
+       si.cb = sizeof(si);
+
+       if (use_job)
+               flags |= CREATE_SUSPENDED | CREATE_BREAKAWAY_FROM_JOB;
+
+       if( !CreateProcess( NULL,
+               new_process_cmd_line,
+               NULL,    /* Process handle not inherited */
+               NULL,    /* Thread handle not inherited */
+               TRUE,    /* bInheritHandles: inheritable handles go to the child */
+               flags,
+               NULL,    /* Use parent's environment block */
+               NULL,    /* Use parent's starting directory */
+               &si,
+               pi )
+       )
+       {
+               log_err("CreateProcess failed (%lu).\n", GetLastError() );
+               goto out;
+       }
+
+       if (use_job) {
+               if (!AssignProcessToJobObject(*hjob, pi->hProcess)) {
+                       log_err("AssignProcessToJobObject failed (%lu).\n", GetLastError() );
+                       /* the child is still suspended; do not leave it behind */
+                       TerminateProcess(pi->hProcess, 1);
+                       CloseHandle(pi->hThread);
+                       CloseHandle(pi->hProcess);
+                       memset(pi, 0, sizeof(*pi));
+                       goto out;
+               }
+
+               ResumeThread(pi->hThread);
+       }
+
+       ret = 0;
+out:
+       free(new_process_cmd_line);
+       return ret;
+}
+
+/*
+ * Create an unnamed job object with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
+ * set, so processes assigned to it are terminated when the job's last
+ * handle is closed.
+ *
+ * Returns the job handle, or INVALID_HANDLE_VALUE on failure.
+ */
+HANDLE windows_create_job(void)
+{
+       JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli = { 0 };
+       HANDLE hjob = CreateJobObject(NULL, NULL);
+
+       /* CreateJobObject() reports failure with NULL, not INVALID_HANDLE_VALUE */
+       if (hjob == NULL) {
+               log_err( "CreateJobObject failed: error %lu\n", GetLastError() );
+               return INVALID_HANDLE_VALUE;
+       }
+
+       jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE;
+       if (!SetInformationJobObject(hjob, JobObjectExtendedLimitInformation, &jeli, sizeof(jeli))) {
+               log_err( "SetInformationJobObject failed: error %lu\n", GetLastError() );
+               /* do not leak the job object we just created */
+               CloseHandle(hjob);
+               return INVALID_HANDLE_VALUE;
+       }
+
+       return hjob;
+}
+
+/*
+ * Wait for a child process to either exit or connect to a client.
+ *
+ * The child signals a connection by writing text containing "connected"
+ * to *hpipe. Returns true once that text has been read, false if the
+ * process exited (or its exit status could not be queried) first.
+ */
+static bool monitor_process_till_connect(PROCESS_INFORMATION *pi, HANDLE *hpipe)
+{
+       bool connected = FALSE;
+       char buffer[32];
+
+       do {
+               DWORD exit_code;
+               DWORD bytes_read = 0;
+
+               if (!GetExitCodeProcess(pi->hProcess, &exit_code)) {
+                       log_err("GetExitCodeProcess failed (%lu).\n", GetLastError());
+                       break;
+               }
+               if (exit_code != STILL_ACTIVE) {
+                       dprint(FD_PROCESS, "process %u exited %d\n", GetProcessId(pi->hProcess), exit_code);
+                       break;
+               }
+
+               memset(buffer, 0, sizeof(buffer));
+               /* only look at the buffer when the read itself succeeded */
+               if (ReadFile(*hpipe, buffer, sizeof(buffer) - 1, &bytes_read, NULL) &&
+                   bytes_read && strstr(buffer, "connected")) {
+                       dprint(FD_PROCESS, "process %u connected to client\n", GetProcessId(pi->hProcess));
+                       connected = TRUE;
+               }
+               usleep(10*1000);
+       } while (!connected);
+       return connected;
+}
+
+/*
+ * Create a process with --server-internal to emulate fork().
+ *
+ * The child is passed the name of a pipe derived from this process's id.
+ * The socket sk is duplicated for the child and the resulting
+ * WSAPROTOCOL_INFO is written to that pipe; the call then waits until the
+ * child either exits or reports that it has connected.
+ *
+ * Returns the child's process handle, or INVALID_HANDLE_VALUE on failure.
+ */
+HANDLE windows_handle_connection(HANDLE hjob, int sk)
+{
+       char pipe_name[64] =  "\\\\.\\pipe\\fiointernal-";
+       char args[128] = " --server-internal=";
+       PROCESS_INFORMATION pi;
+       HANDLE hpipe = INVALID_HANDLE_VALUE;
+       WSAPROTOCOL_INFO protocol_info;
+       DWORD bytes_written = 0;
+       HANDLE ret;
+
+       sprintf(pipe_name+strlen(pipe_name), "%d", GetCurrentProcessId());
+       sprintf(args+strlen(args), "%s", pipe_name);
+
+       if (windows_create_process(&pi, args, &hjob) != 0)
+               return INVALID_HANDLE_VALUE;
+
+       ret = pi.hProcess;
+
+       /* duplicate socket and write the protocol_info to pipe so child can
+        * duplicate the communication socket */
+       if (WSADuplicateSocket(sk, GetProcessId(pi.hProcess), &protocol_info)) {
+               log_err("WSADuplicateSocket failed (%lu).\n", GetLastError());
+               ret = INVALID_HANDLE_VALUE;
+               goto cleanup;
+       }
+
+       /* make a pipe with a unique name based upon processid */
+       hpipe = create_named_pipe(pipe_name, 1000);
+       if (hpipe == INVALID_HANDLE_VALUE) {
+               ret = INVALID_HANDLE_VALUE;
+               goto cleanup;
+       }
+
+       /* lpNumberOfBytesWritten may only be NULL for overlapped I/O, so a
+        * real counter is needed for this synchronous write */
+       if (!WriteFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_written, NULL)) {
+               log_err("WriteFile failed (%lu).\n", GetLastError());
+               ret = INVALID_HANDLE_VALUE;
+               goto cleanup;
+       }
+
+       dprint(FD_PROCESS, "process %d created child process %u\n", GetCurrentProcessId(), GetProcessId(pi.hProcess));
+
+       /* monitor the process until it either exits or connects. This level
+        * doesn't care which of those occurs because the result is that it
+        * needs to loop around and create another child process to monitor */
+       if (!monitor_process_till_connect(&pi, &hpipe))
+               ret = INVALID_HANDLE_VALUE;
+
+cleanup:
+       /* close the handles and pipes because this thread is done monitoring them */
+       if (ret == INVALID_HANDLE_VALUE)
+               CloseHandle(pi.hProcess);
+       CloseHandle(pi.hThread);
+       /* the pipe does not exist yet if we failed before creating it */
+       if (hpipe != INVALID_HANDLE_VALUE) {
+               DisconnectNamedPipe(hpipe);
+               CloseHandle(hpipe);
+       }
+       return ret;
+}
+\ No newline at end of file
diff --git a/oslib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c

index 1dda93f2a0ef3fa0e537a9ccdf202a9a624ecb35..4335faf99b91b1c1876afd836673f1d4cb658472 100644 (file)
--- a/oslib/linux-dev-lookup.c
+++ b/oslib/linux-dev-lookup.c
@@ -16,6 +16,16 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
         int found = 0;
         DIR *D;
  
+       /*
+        * If replay_redirect is set then always return this device
+        * upon lookup which overrides the device lookup based on
+        * major minor in the actual blktrace
+        */
+       if (redirect) {
+               strcpy(path, redirect);
+               return 1;
+       }
+
         D = opendir(path);
         if (!D)
                 return 0;
@@ -44,17 +54,6 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
                 if (!S_ISBLK(st.st_mode))
                         continue;
  
-               /*
-                * If replay_redirect is set then always return this device
-                * upon lookup which overrides the device lookup based on
-                * major minor in the actual blktrace
-                */
-               if (redirect) {
-                       strcpy(path, redirect);
-                       found = 1;
-                       break;
-               }
-
                 if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
                         strcpy(path, full_path);
                         found = 1;
diff --git a/rate-submit.c b/rate-submit.c

index 13a0d706e078be41a53a31efc8d0a70f511d648f..268356d17a1f1b35a6eaa0ad82219db86177a4de 100644 (file)
--- a/rate-submit.c
+++ b/rate-submit.c
@@ -195,7 +195,16 @@ static void io_workqueue_exit_worker_fn(struct submit_worker *sw,
         struct thread_data *td = sw->priv;
  
         (*sum_cnt)++;
-       sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1);
+
+       /*
+        * io_workqueue_update_acct_fn() doesn't support per prio stats, and
+        * even if it did, offload can't be used with all async IO engines.
+        * If group reporting is set in the parent td, the group result
+        * generated by __show_run_stats() can still contain multiple prios
+        * from different offloaded jobs.
+        */
+       sw->wq->td->ts.disable_prio_stat = 1;
+       sum_thread_stats(&sw->wq->td->ts, &td->ts);
  
         fio_options_free(td);
         close_and_free_files(td);
diff --git a/server.c b/server.c

index 90c52e01ac231f4972bbd94e91361d3fe61ad132..914a8c74cddaee7555bc27fa6fc236782e39f802 100644 (file)
--- a/server.c
+++ b/server.c
@@ -63,12 +63,28 @@ static char me[128];
  
  static pthread_key_t sk_out_key;
  
+#ifdef WIN32
+static char *fio_server_pipe_name  = NULL;
+static HANDLE hjob = INVALID_HANDLE_VALUE;
+struct ffi_element {   /* identifies one tracked child: a thread or a process */
+       union {
+               pthread_t thread;       /* used when is_thread is true */
+               HANDLE hProcess;        /* used when is_thread is false */
+       };
+       bool is_thread;         /* tells which union member was stored */
+};
+#endif
+
  struct fio_fork_item {  /* one tracked child, linked into a list of such items */
         struct flist_head list;        /* list linkage */
         int exitval;
         int signal;
         int exited;
+#ifdef WIN32
+       struct ffi_element element;     /* WIN32: thread or process handle instead of a pid */
+#else
         pid_t pid;
+#endif
  };
  
  struct cmd_reply {
@@ -250,6 +266,28 @@ static int fio_send_data(int sk, const void *p, unsigned int len)
         return fio_sendv_data(sk, &iov, 1);
  }
  
+bool fio_server_poll_fd(int fd, short events, int timeout)
+{
+       struct pollfd pfd = {
+               .fd     = fd,
+               .events = events,
+       };
+       int ret;
+
+       ret = poll(&pfd, 1, timeout);
+       if (ret < 0) {
+               if (errno == EINTR)
+                       return false;
+               log_err("fio: poll: %s\n", strerror(errno));
+               return false;
+       } else if (!ret) {
+               return false;
+       }
+       if (pfd.revents & events)
+               return true;
+       return false;
+}
+
  static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait)
  {
         int flags;
@@ -651,6 +689,63 @@ static int fio_net_queue_stop(int error, int signal)
         return fio_net_send_ack(NULL, error, signal);
  }
  
+#ifdef WIN32
+static void fio_server_add_fork_item(struct ffi_element *element, struct flist_head *list)
+{
+       struct fio_fork_item *ffi;
+
+       ffi = malloc(sizeof(*ffi));
+       ffi->exitval = 0;
+       ffi->signal = 0;
+       ffi->exited = 0;
+       ffi->element = *element;
+       flist_add_tail(&ffi->list, list);
+}
+
+static void fio_server_add_conn_pid(struct flist_head *conn_list, HANDLE hProcess)
+{
+       struct ffi_element element = {.hProcess = hProcess, .is_thread=FALSE};
+       /* a connection is tracked by its process handle, so log the pid instead of the aliased union member ->thread */
+       dprint(FD_NET, "server: forked off connection job (pid=%u)\n", (int) GetProcessId(hProcess));
+       fio_server_add_fork_item(&element, conn_list);
+}
+
+static void fio_server_add_job_pid(struct flist_head *job_list, pthread_t thread)
+{
+       struct ffi_element element = {.thread = thread, .is_thread=TRUE};
+       dprint(FD_NET, "server: forked off job job (tid=%u)\n", (int) element.thread);
+       fio_server_add_fork_item(&element, job_list);
+}
+
+static void fio_server_check_fork_item(struct fio_fork_item *ffi)
+{
+       int ret;
+       /* Probe the tracked job thread or connection process; once it is gone, record ->exitval and set ->exited. */
+       if (ffi->element.is_thread) {
+               /* signal 0 delivers nothing; a non-zero return is treated as "thread has finished" */
+               ret = pthread_kill(ffi->element.thread, 0);
+               if (ret) {
+                       void *rev_val = NULL; /* pthread_join() stores a pointer-sized value, an int is too small on 64-bit */
+                       pthread_join(ffi->element.thread, &rev_val); /*if the thread is dead, then join it to get status*/
+
+                       ffi->exitval = (int) (intptr_t) rev_val; /* undo the (void*)(intptr_t) cast done by fio_backend_thread() */
+                       if (ffi->exitval)
+                               log_err("thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+                       dprint(FD_PROCESS, "thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+                       ffi->exited = 1;
+               }
+       } else {
+               DWORD exit_val;
+               GetExitCodeProcess(ffi->element.hProcess, &exit_val);
+
+               if (exit_val != STILL_ACTIVE) {
+                       dprint(FD_PROCESS, "process %u exited with %d\n", GetProcessId(ffi->element.hProcess), exit_val);
+                       ffi->exited = 1;
+                       ffi->exitval = exit_val;
+               }
+       }
+}
+#else
  static void fio_server_add_fork_item(pid_t pid, struct flist_head *list)
  {
         struct fio_fork_item *ffi;
@@ -698,10 +793,21 @@ static void fio_server_check_fork_item(struct fio_fork_item *ffi)
                 }
         }
  }
+#endif
  
  static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop)
  {
+#ifdef WIN32
+       if (ffi->element.is_thread)
+               dprint(FD_NET, "tid %u exited, sig=%u, exitval=%d\n", (int) ffi->element.thread, ffi->signal, ffi->exitval);
+       else {
+               dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int)  GetProcessId(ffi->element.hProcess), ffi->signal, ffi->exitval);
+               CloseHandle(ffi->element.hProcess);
+               ffi->element.hProcess = INVALID_HANDLE_VALUE;
+       }
+#else
         dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval);
+#endif
  
         /*
          * Fold STOP and QUIT...
@@ -762,27 +868,62 @@ static int handle_load_file_cmd(struct fio_net_cmd *cmd)
         return 0;
  }
  
-static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
-                         struct fio_net_cmd *cmd)
+#ifdef WIN32
+static void *fio_backend_thread(void *data)
  {
-       pid_t pid;
         int ret;
+       struct sk_out *sk_out = (struct sk_out *) data;
  
         sk_out_assign(sk_out);
  
+       ret = fio_backend(sk_out);
+       sk_out_drop();
+
+       pthread_exit((void*) (intptr_t) ret);
+       return NULL;
+}
+#endif
+
+static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
+                         struct fio_net_cmd *cmd)
+{
+       int ret;
+
         fio_time_init();
         set_genesis_time();
  
-       pid = fork();
-       if (pid) {
-               fio_server_add_job_pid(job_list, pid);
-               return 0;
+#ifdef WIN32
+       {
+               pthread_t thread;
+               /* both this thread and backend_thread call sk_out_assign() to double increment
+                * the ref count.  This ensures struct is valid until both threads are done with it
+                */
+               sk_out_assign(sk_out);
+               ret = pthread_create(&thread, NULL,     fio_backend_thread, sk_out);
+               if (ret) {
+                       log_err("pthread_create: %s\n", strerror(ret));
+                       return ret;
+               }
+
+               fio_server_add_job_pid(job_list, thread);
+               return ret;
         }
+#else
+    {
+               pid_t pid;
+               sk_out_assign(sk_out);
+               pid = fork();
+               if (pid) {
+                       fio_server_add_job_pid(job_list, pid);
+                       return 0;
+               }
  
-       ret = fio_backend(sk_out);
-       free_threads_shm();
-       sk_out_drop();
-       _exit(ret);
+               ret = fio_backend(sk_out);
+               free_threads_shm();
+               sk_out_drop();
+               _exit(ret);
+       }
+#endif
  }
  
  static int handle_job_cmd(struct fio_net_cmd *cmd)
@@ -1238,7 +1379,8 @@ static int handle_connection(struct sk_out *sk_out)
                 if (ret < 0)
                         break;
  
-               cmd = fio_net_recv_cmd(sk_out->sk, true);
+               if (pfd.revents & POLLIN)
+                       cmd = fio_net_recv_cmd(sk_out->sk, true);
                 if (!cmd) {
                         ret = -1;
                         break;
@@ -1300,6 +1442,73 @@ static int get_my_addr_str(int sk)
         return 0;
  }
  
+#ifdef WIN32
+static int handle_connection_process(void)
+{
+       WSAPROTOCOL_INFO protocol_info;
+       DWORD bytes_read, bytes_written;
+       HANDLE hpipe;
+       int sk;
+       struct sk_out *sk_out;
+       int ret;
+       char *msg = (char *) "connected";
+       /* Child-process side: read the socket description from the named pipe, re-create the socket, ack, then serve it. */
+       log_info("server enter accept loop.  ProcessID %d\n", GetCurrentProcessId());
+
+       hpipe = CreateFile(
+                                       fio_server_pipe_name,
+                                       GENERIC_READ | GENERIC_WRITE,
+                                       0, NULL,
+                                       OPEN_EXISTING,
+                                       0, NULL);
+
+       if (hpipe == INVALID_HANDLE_VALUE) {
+               log_err("couldnt open pipe %s error %lu\n",
+                               fio_server_pipe_name, GetLastError());
+               return -1;
+       }
+
+       if (!ReadFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_read, NULL)) {
+               log_err("couldnt read pi from pipe %s error %lu\n", fio_server_pipe_name, GetLastError());
+               CloseHandle(hpipe); /* protocol_info was never filled in, do not hand it to WSASocket() */
+               return -1;
+       }
+       if (use_ipv6) /* use protocol_info to create a duplicate of parents socket */
+               sk = WSASocket(AF_INET6, SOCK_STREAM, 0, &protocol_info, 0, 0);
+       else
+               sk = WSASocket(AF_INET,  SOCK_STREAM, 0, &protocol_info, 0, 0);
+
+       sk_out = scalloc(1, sizeof(*sk_out));
+       if (!sk_out) {
+               CloseHandle(hpipe);
+               close(sk);
+               return -1;
+       }
+
+       sk_out->sk = sk;
+       sk_out->hProcess = INVALID_HANDLE_VALUE;
+       INIT_FLIST_HEAD(&sk_out->list);
+       __fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED);
+       __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
+       __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
+
+       get_my_addr_str(sk);
+
+       if (!WriteFile(hpipe, msg, strlen(msg), &bytes_written, NULL)) { /* byte count may be NULL only for overlapped I/O */
+               log_err("couldnt write pipe\n");
+               close(sk);
+               return -1;
+       }
+       CloseHandle(hpipe);
+
+       sk_out_assign(sk_out);
+
+       ret = handle_connection(sk_out);
+       __sk_out_drop(sk_out);
+       return ret;
+}
+#endif
+
  static int accept_loop(int listen_sk)
  {
         struct sockaddr_in addr;
@@ -1317,8 +1526,11 @@ static int accept_loop(int listen_sk)
                 struct sk_out *sk_out;
                 const char *from;
                 char buf[64];
+#ifdef WIN32
+               HANDLE hProcess;
+#else
                 pid_t pid;
-
+#endif
                 pfd.fd = listen_sk;
                 pfd.events = POLLIN;
                 do {
@@ -1376,6 +1588,13 @@ static int accept_loop(int listen_sk)
                 __fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
                 __fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
  
+#ifdef WIN32
+               hProcess = windows_handle_connection(hjob, sk);
+               if (hProcess == INVALID_HANDLE_VALUE)
+                       return -1;
+               sk_out->hProcess = hProcess;
+               fio_server_add_conn_pid(&conn_list, hProcess);
+#else
                 pid = fork();
                 if (pid) {
                         close(sk);
@@ -1392,6 +1611,7 @@ static int accept_loop(int listen_sk)
                  */
                 sk_out_assign(sk_out);
                 handle_connection(sk_out);
+#endif
         }
  
         return exitval;
@@ -1465,8 +1685,11 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
  {
         struct cmd_ts_pdu p;
         int i, j, k;
-       void *ss_buf;
-       uint64_t *ss_iops, *ss_bw;
+       size_t clat_prio_stats_extra_size = 0;
+       size_t ss_extra_size = 0;
+       size_t extended_buf_size = 0;
+       void *extended_buf;
+       void *extended_buf_wp;
  
         dprint(FD_NET, "server sending end stats\n");
  
@@ -1483,6 +1706,8 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
         p.ts.pid                = cpu_to_le32(ts->pid);
         p.ts.members            = cpu_to_le32(ts->members);
         p.ts.unified_rw_rep     = cpu_to_le32(ts->unified_rw_rep);
+       p.ts.ioprio             = cpu_to_le32(ts->ioprio);
+       p.ts.disable_prio_stat  = cpu_to_le32(ts->disable_prio_stat);
  
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
                 convert_io_stat(&p.ts.clat_stat[i], &ts->clat_stat[i]);
@@ -1577,38 +1802,88 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
         p.ts.cachehit           = cpu_to_le64(ts->cachehit);
         p.ts.cachemiss          = cpu_to_le64(ts->cachemiss);
  
+       convert_gs(&p.rs, rs);
+
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-                       p.ts.io_u_plat_high_prio[i][j] = cpu_to_le64(ts->io_u_plat_high_prio[i][j]);
-                       p.ts.io_u_plat_low_prio[i][j] = cpu_to_le64(ts->io_u_plat_low_prio[i][j]);
+               if (ts->nr_clat_prio[i])
+                       clat_prio_stats_extra_size += ts->nr_clat_prio[i] * sizeof(*ts->clat_prio[i]);
+       }
+       extended_buf_size += clat_prio_stats_extra_size;
+
+       dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
+       if (ts->ss_state & FIO_SS_DATA)
+               ss_extra_size = 2 * ts->ss_dur * sizeof(uint64_t);
+
+       extended_buf_size += ss_extra_size;
+       if (!extended_buf_size) {
+               fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+               return;
+       }
+
+       extended_buf_size += sizeof(p);
+       extended_buf = calloc(1, extended_buf_size);
+       if (!extended_buf) {
+               log_err("fio: failed to allocate FIO_NET_CMD_TS buffer\n");
+               return;
+       }
+
+       memcpy(extended_buf, &p, sizeof(p));
+       extended_buf_wp = (struct cmd_ts_pdu *)extended_buf + 1;
+
+       if (clat_prio_stats_extra_size) {
+               for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+                       struct clat_prio_stat *prio = (struct clat_prio_stat *) extended_buf_wp;
+
+                       for (j = 0; j < ts->nr_clat_prio[i]; j++) {
+                               for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+                                       prio->io_u_plat[k] =
+                                               cpu_to_le64(ts->clat_prio[i][j].io_u_plat[k]);
+                               convert_io_stat(&prio->clat_stat,
+                                               &ts->clat_prio[i][j].clat_stat);
+                               prio->ioprio = cpu_to_le32(ts->clat_prio[i][j].ioprio);
+                               prio++;
+                       }
+
+                       if (ts->nr_clat_prio[i]) {
+                               uint64_t offset = (char *)extended_buf_wp - (char *)extended_buf;
+                               struct cmd_ts_pdu *ptr = extended_buf;
+
+                               ptr->ts.clat_prio_offset[i] = cpu_to_le64(offset);
+                               ptr->ts.nr_clat_prio[i] = cpu_to_le32(ts->nr_clat_prio[i]);
+                       }
+
+                       extended_buf_wp = prio;
                 }
-               convert_io_stat(&p.ts.clat_high_prio_stat[i], &ts->clat_high_prio_stat[i]);
-               convert_io_stat(&p.ts.clat_low_prio_stat[i], &ts->clat_low_prio_stat[i]);
         }
  
-       convert_gs(&p.rs, rs);
+       if (ss_extra_size) {
+               uint64_t *ss_iops, *ss_bw;
+               uint64_t offset;
+               struct cmd_ts_pdu *ptr = extended_buf;
  
-       dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
-       if (ts->ss_state & FIO_SS_DATA) {
                 dprint(FD_NET, "server sending steadystate ring buffers\n");
  
-               ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t));
+               /* ss iops */
+               ss_iops = (uint64_t *) extended_buf_wp;
+               for (i = 0; i < ts->ss_dur; i++)
+                       ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
  
-               memcpy(ss_buf, &p, sizeof(p));
+               offset = (char *)extended_buf_wp - (char *)extended_buf;
+               ptr->ts.ss_iops_data_offset = cpu_to_le64(offset);
+               extended_buf_wp = ss_iops + (int) ts->ss_dur;
  
-               ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1);
-               ss_bw = ss_iops + (int) ts->ss_dur;
-               for (i = 0; i < ts->ss_dur; i++) {
-                       ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
+               /* ss bw */
+               ss_bw = extended_buf_wp;
+               for (i = 0; i < ts->ss_dur; i++)
                         ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]);
-               }
-
-               fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY);
  
-               free(ss_buf);
+               offset = (char *)extended_buf_wp - (char *)extended_buf;
+               ptr->ts.ss_bw_data_offset = cpu_to_le64(offset);
+               extended_buf_wp = ss_bw + (int) ts->ss_dur;
         }
-       else
-               fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+
+       fio_net_queue_cmd(FIO_NET_CMD_TS, extended_buf, extended_buf_size, NULL, SK_F_COPY);
+       free(extended_buf);
  }
  
  void fio_server_send_gs(struct group_run_stats *rs)
@@ -2489,12 +2764,25 @@ static int fio_server(void)
         if (fio_handle_server_arg())
                 return -1;
  
+       set_sig_handlers();
+
+#ifdef WIN32
+       /* if this is a child process, go handle the connection */
+       if (fio_server_pipe_name != NULL) {
+               ret = handle_connection_process();
+               return ret;
+       }
+
+       /* job to link child processes so they terminate together */
+       hjob = windows_create_job();
+       if (hjob == INVALID_HANDLE_VALUE)
+               return -1;
+#endif
+
         sk = fio_init_server_connection();
         if (sk < 0)
                 return -1;
  
-       set_sig_handlers();
-
         ret = accept_loop(sk);
  
         close(sk);
@@ -2635,3 +2923,10 @@ void fio_server_set_arg(const char *arg)
  {
         fio_server_arg = strdup(arg);
  }
+
+#ifdef WIN32
+void fio_server_internal_set(const char *arg)
+{
+       fio_server_pipe_name = strdup(arg);
+}
+#endif
diff --git a/server.h b/server.h

index 25b6bbdc25dfb6f6f45e7ddb318d6a5cb9030156..0e62b6dfe8eac2df8aba17abe843ef99b800d538 100644 (file)
--- a/server.h
+++ b/server.h
@@ -15,6 +15,9 @@ struct sk_out {
         unsigned int refs;      /* frees sk_out when it drops to zero.
                                  * protected by below ->lock */
  
+#ifdef WIN32
+       HANDLE hProcess;                /* process handle of handle_connection_process*/
+#endif
         int sk;                 /* socket fd to talk to client */
         struct fio_sem lock;    /* protects ref and below list */
         struct flist_head list; /* list of pending transmit work */
@@ -48,7 +51,7 @@ struct fio_net_cmd_reply {
  };
  
  enum {
-       FIO_SERVER_VER                  = 95,
+       FIO_SERVER_VER                  = 96,
  
         FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
         FIO_SERVER_MAX_CMD_MB           = 2048,
@@ -212,6 +215,7 @@ extern int fio_server_text_output(int, const char *, size_t);
  extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
  extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *);
  extern void fio_server_set_arg(const char *);
+extern void fio_server_internal_set(const char *);
  extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *);
  extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *);
  extern const char *fio_server_op(unsigned int);
@@ -222,6 +226,7 @@ extern void fio_server_send_gs(struct group_run_stats *);
  extern void fio_server_send_du(void);
  extern void fio_server_send_job_options(struct flist_head *, unsigned int);
  extern int fio_server_get_verify_state(const char *, int, void **);
+extern bool fio_server_poll_fd(int fd, short events, int timeout);
  
  extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait);
  
diff --git a/stat.c b/stat.c

index 98f30107d1ada1a3f49ebcb580089a87a884ffeb..0876222a1b6be28d0f5fd7ce6b278e6f90c099ff 100644 (file)
--- a/stat.c
+++ b/stat.c
@@ -265,6 +265,18 @@ static void show_clat_percentiles(uint64_t *io_u_plat, unsigned long long nr,
         free(ovals);
  }
  
+static int get_nr_prios_with_samples(struct thread_stat *ts, enum fio_ddir ddir)
+{
+       int i, nr_prios_with_samples = 0;
+
+       for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+               if (ts->clat_prio[ddir][i].clat_stat.samples)
+                       nr_prios_with_samples++;
+       }
+
+       return nr_prios_with_samples;
+}
+
  bool calc_lat(struct io_stat *is, unsigned long long *min,
               unsigned long long *max, double *mean, double *dev)
  {
@@ -289,9 +301,10 @@ void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out)
  {
         char *io, *agg, *min, *max;
         char *ioalt, *aggalt, *minalt, *maxalt;
-       uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0, min_run = -1, max_run = 0;
-       int i;
+       uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0;
+       uint64_t min_run = -1, max_run = 0;
         const int i2p = is_power_of_2(rs->kb_base);
+       int i;
  
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
                 if (!rs->max_run[i])
@@ -363,9 +376,9 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
                 free(minalt);
                 free(maxalt);
         }
-       
+
         /* Need to aggregate statisitics to show mixed values */
-       if (rs->unified_rw_rep == UNIFIED_BOTH) 
+       if (rs->unified_rw_rep == UNIFIED_BOTH)
                 show_mixed_group_stats(rs, out);
  }
  
@@ -461,179 +474,57 @@ static void display_lat(const char *name, unsigned long long min,
         free(maxp);
  }
  
-static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean)
+static struct thread_stat *gen_mixed_ddir_stats_from_ts(struct thread_stat *ts)
  {
-       double p_of_agg = 100.0;
-       if (rs && rs->agg[ddir] > 1024) {
-               p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
-
-               if (p_of_agg > 100.0)
-                       p_of_agg = 100.0;
-       }
-       return p_of_agg;
-}
-
-static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
-                            struct buf_output *out)
-{
-       unsigned long runt;
-       unsigned long long min, max, bw, iops;
-       double mean, dev;
-       char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
         struct thread_stat *ts_lcl;
  
-       int i2p;
-       int ddir = 0;
-
-       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+       /*
+        * Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and
+        * Trims (ddir = 2)
+        */
         ts_lcl = malloc(sizeof(struct thread_stat));
-       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       init_thread_stat_min_vals(ts_lcl);
-
-       sum_thread_stats(ts_lcl, ts, 1);
-
-       assert(ddir_rw(ddir));
-
-       if (!ts_lcl->runtime[ddir])
-               return;
-
-       i2p = is_power_of_2(rs->kb_base);
-       runt = ts_lcl->runtime[ddir];
-
-       bw = (1000 * ts_lcl->io_bytes[ddir]) / runt;
-       io_p = num2str(ts_lcl->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE);
-       bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base);
-       bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base);
-
-       iops = (1000 * ts_lcl->total_io_u[ddir]) / runt;
-       iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE);
-
-       log_buf(out, "  mixed: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
-                       iops_p, bw_p, bw_p_alt, io_p,
-                       (unsigned long long) ts_lcl->runtime[ddir],
-                       post_st ? : "");
-
-       free(post_st);
-       free(io_p);
-       free(bw_p);
-       free(bw_p_alt);
-       free(iops_p);
-
-       if (calc_lat(&ts_lcl->slat_stat[ddir], &min, &max, &mean, &dev))
-               display_lat("slat", min, max, mean, dev, out);
-       if (calc_lat(&ts_lcl->clat_stat[ddir], &min, &max, &mean, &dev))
-               display_lat("clat", min, max, mean, dev, out);
-       if (calc_lat(&ts_lcl->lat_stat[ddir], &min, &max, &mean, &dev))
-               display_lat(" lat", min, max, mean, dev, out);
-       if (calc_lat(&ts_lcl->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
-               display_lat(ts_lcl->lat_percentiles ? "high prio_lat" : "high prio_clat",
-                               min, max, mean, dev, out);
-               if (calc_lat(&ts_lcl->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
-                       display_lat(ts_lcl->lat_percentiles ? "low prio_lat" : "low prio_clat",
-                                       min, max, mean, dev, out);
-       }
-
-       if (ts->slat_percentiles && ts_lcl->slat_stat[ddir].samples > 0)
-               show_clat_percentiles(ts_lcl->io_u_plat[FIO_SLAT][ddir],
-                               ts_lcl->slat_stat[ddir].samples,
-                               ts->percentile_list,
-                               ts->percentile_precision, "slat", out);
-       if (ts->clat_percentiles && ts_lcl->clat_stat[ddir].samples > 0)
-               show_clat_percentiles(ts_lcl->io_u_plat[FIO_CLAT][ddir],
-                               ts_lcl->clat_stat[ddir].samples,
-                               ts->percentile_list,
-                               ts->percentile_precision, "clat", out);
-       if (ts->lat_percentiles && ts_lcl->lat_stat[ddir].samples > 0)
-               show_clat_percentiles(ts_lcl->io_u_plat[FIO_LAT][ddir],
-                               ts_lcl->lat_stat[ddir].samples,
-                               ts->percentile_list,
-                               ts->percentile_precision, "lat", out);
-
-       if (ts->clat_percentiles || ts->lat_percentiles) {
-               const char *name = ts->lat_percentiles ? "lat" : "clat";
-               char prio_name[32];
-               uint64_t samples;
-
-               if (ts->lat_percentiles)
-                       samples = ts_lcl->lat_stat[ddir].samples;
-               else
-                       samples = ts_lcl->clat_stat[ddir].samples;
-
-               /* Only print this if some high and low priority stats were collected */
-               if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 &&
-                               ts_lcl->clat_low_prio_stat[ddir].samples > 0)
-               {
-                       sprintf(prio_name, "high prio (%.2f%%) %s",
-                                       100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples,
-                                       name);
-                       show_clat_percentiles(ts_lcl->io_u_plat_high_prio[ddir],
-                                       ts_lcl->clat_high_prio_stat[ddir].samples,
-                                       ts->percentile_list,
-                                       ts->percentile_precision, prio_name, out);
-
-                       sprintf(prio_name, "low prio (%.2f%%) %s",
-                                       100. * (double) ts_lcl->clat_low_prio_stat[ddir].samples / (double) samples,
-                                       name);
-                       show_clat_percentiles(ts_lcl->io_u_plat_low_prio[ddir],
-                                       ts_lcl->clat_low_prio_stat[ddir].samples,
-                                       ts->percentile_list,
-                                       ts->percentile_precision, prio_name, out);
-               }
+       if (!ts_lcl) {
+               log_err("fio: failed to allocate local thread stat\n");
+               return NULL;
         }
  
-       if (calc_lat(&ts_lcl->bw_stat[ddir], &min, &max, &mean, &dev)) {
-               double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
-               const char *bw_str;
+       init_thread_stat(ts_lcl);
  
-               if ((rs->unit_base == 1) && i2p)
-                       bw_str = "Kibit";
-               else if (rs->unit_base == 1)
-                       bw_str = "kbit";
-               else if (i2p)
-                       bw_str = "KiB";
-               else
-                       bw_str = "kB";
+       /* calculate mixed stats  */
+       ts_lcl->unified_rw_rep = UNIFIED_MIXED;
+       ts_lcl->lat_percentiles = ts->lat_percentiles;
+       ts_lcl->clat_percentiles = ts->clat_percentiles;
+       ts_lcl->slat_percentiles = ts->slat_percentiles;
+       ts_lcl->percentile_precision = ts->percentile_precision;
+       memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
  
-               p_of_agg = convert_agg_kbytes_percent(rs, ddir, mean);
+       sum_thread_stats(ts_lcl, ts);
  
-               if (rs->unit_base == 1) {
-                       min *= 8.0;
-                       max *= 8.0;
-                       mean *= 8.0;
-                       dev *= 8.0;
-               }
+       return ts_lcl;
+}
  
-               if (mean > fkb_base * fkb_base) {
-                       min /= fkb_base;
-                       max /= fkb_base;
-                       mean /= fkb_base;
-                       dev /= fkb_base;
-                       bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
-               }
+static double convert_agg_kbytes_percent(struct group_run_stats *rs,
+                                        enum fio_ddir ddir, int mean)
+{
+       double p_of_agg = 100.0;
+       if (rs && rs->agg[ddir] > 1024) {
+               p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
  
-               log_buf(out, "   bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
-                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
-                       bw_str, min, max, p_of_agg, mean, dev,
-                       (&ts_lcl->bw_stat[ddir])->samples);
-       }
-       if (calc_lat(&ts_lcl->iops_stat[ddir], &min, &max, &mean, &dev)) {
-               log_buf(out, "   iops        : min=%5llu, max=%5llu, "
-                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
-                       min, max, mean, dev, (&ts_lcl->iops_stat[ddir])->samples);
+               if (p_of_agg > 100.0)
+                       p_of_agg = 100.0;
         }
-
-       free(ts_lcl);
+       return p_of_agg;
  }
  
  static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
-                            int ddir, struct buf_output *out)
+                            enum fio_ddir ddir, struct buf_output *out)
  {
         unsigned long runt;
         unsigned long long min, max, bw, iops;
         double mean, dev;
         char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
-       int i2p;
+       int i2p, i;
+       const char *clat_type = ts->lat_percentiles ? "lat" : "clat";
  
         if (ddir_sync(ddir)) {
                 if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) {
@@ -694,12 +585,22 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                 display_lat("clat", min, max, mean, dev, out);
         if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
                 display_lat(" lat", min, max, mean, dev, out);
-       if (calc_lat(&ts->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
-               display_lat(ts->lat_percentiles ? "high prio_lat" : "high prio_clat",
-                               min, max, mean, dev, out);
-               if (calc_lat(&ts->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
-                       display_lat(ts->lat_percentiles ? "low prio_lat" : "low prio_clat",
-                                       min, max, mean, dev, out);
+
+       /* Only print per prio stats if there are >= 2 prios with samples */
+       if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+               for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+                       if (calc_lat(&ts->clat_prio[ddir][i].clat_stat, &min,
+                                    &max, &mean, &dev)) {
+                               char buf[64];
+
+                               snprintf(buf, sizeof(buf),
+                                        "%s prio %u/%u",
+                                        clat_type,
+                                        ts->clat_prio[ddir][i].ioprio >> 13,
+                                        ts->clat_prio[ddir][i].ioprio & 7);
+                               display_lat(buf, min, max, mean, dev, out);
+                       }
+               }
         }
  
         if (ts->slat_percentiles && ts->slat_stat[ddir].samples > 0)
@@ -719,8 +620,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                                         ts->percentile_precision, "lat", out);
  
         if (ts->clat_percentiles || ts->lat_percentiles) {
-               const char *name = ts->lat_percentiles ? "lat" : "clat";
-               char prio_name[32];
+               char prio_name[64];
                 uint64_t samples;
  
                 if (ts->lat_percentiles)
@@ -728,25 +628,24 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                 else
                         samples = ts->clat_stat[ddir].samples;
  
-               /* Only print this if some high and low priority stats were collected */
-               if (ts->clat_high_prio_stat[ddir].samples > 0 &&
-                       ts->clat_low_prio_stat[ddir].samples > 0)
-               {
-                       sprintf(prio_name, "high prio (%.2f%%) %s",
-                                       100. * (double) ts->clat_high_prio_stat[ddir].samples / (double) samples,
-                                       name);
-                       show_clat_percentiles(ts->io_u_plat_high_prio[ddir],
-                                               ts->clat_high_prio_stat[ddir].samples,
-                                               ts->percentile_list,
-                                               ts->percentile_precision, prio_name, out);
-
-                       sprintf(prio_name, "low prio (%.2f%%) %s",
-                                       100. * (double) ts->clat_low_prio_stat[ddir].samples / (double) samples,
-                                       name);
-                       show_clat_percentiles(ts->io_u_plat_low_prio[ddir],
-                                               ts->clat_low_prio_stat[ddir].samples,
-                                               ts->percentile_list,
-                                               ts->percentile_precision, prio_name, out);
+               /* Only print per prio stats if there are >= 2 prios with samples */
+               if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+                       for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+                               uint64_t prio_samples = ts->clat_prio[ddir][i].clat_stat.samples;
+
+                               if (prio_samples > 0) {
+                                       snprintf(prio_name, sizeof(prio_name),
+                                                "%s prio %u/%u (%.2f%% of IOs)",
+                                                clat_type,
+                                                ts->clat_prio[ddir][i].ioprio >> 13,
+                                                ts->clat_prio[ddir][i].ioprio & 7,
+                                                100. * (double) prio_samples / (double) samples);
+                                       show_clat_percentiles(ts->clat_prio[ddir][i].io_u_plat,
+                                                             prio_samples, ts->percentile_list,
+                                                             ts->percentile_precision,
+                                                             prio_name, out);
+                               }
+                       }
                 }
         }
  
@@ -792,6 +691,19 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
         }
  }
  
+/*
+ * Print the normal-output status for the mixed view of 'ts'.
+ *
+ * gen_mixed_ddir_stats_from_ts() builds a temporary thread_stat which is
+ * reported through show_ddir_status() using DDIR_READ, then released.
+ *
+ * If the helper returns NULL there is nothing to print and nothing to
+ * release, so return before touching ts_lcl at all.
+ */
+static void show_mixed_ddir_status(struct group_run_stats *rs,
+                                  struct thread_stat *ts,
+                                  struct buf_output *out)
+{
+       struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
+
+       if (!ts_lcl)
+               return;
+
+       show_ddir_status(rs, ts_lcl, DDIR_READ, out);
+
+       free_clat_prio_stats(ts_lcl);
+       free(ts_lcl);
+}
+
  static bool show_lat(double *io_u_lat, int nr, const char **ranges,
                      const char *msg, struct buf_output *out)
  {
@@ -1222,9 +1134,8 @@ void show_disk_util(int terse, struct json_object *parent,
         if (!is_running_backend())
                 return;
  
-       if (flist_empty(&disk_list)) {
+       if (flist_empty(&disk_list))
                 return;
-       }
  
         if ((output_format & FIO_OUTPUT_JSON) && parent)
                 do_json = true;
@@ -1234,9 +1145,9 @@ void show_disk_util(int terse, struct json_object *parent,
         if (!terse && !do_json)
                 log_buf(out, "\nDisk stats (read/write):\n");
  
-       if (do_json)
+       if (do_json) {
                 json_object_add_disk_utils(parent, &disk_list);
-       else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
+       } else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
                 flist_for_each(entry, &disk_list) {
                         du = flist_entry(entry, struct disk_util, list);
  
@@ -1363,8 +1274,9 @@ static void show_thread_status_normal(struct thread_stat *ts,
  }
  
  static void show_ddir_status_terse(struct thread_stat *ts,
-                                  struct group_run_stats *rs, int ddir,
-                                  int ver, struct buf_output *out)
+                                  struct group_run_stats *rs,
+                                  enum fio_ddir ddir, int ver,
+                                  struct buf_output *out)
  {
         unsigned long long min, max, minv, maxv, bw, iops;
         unsigned long long *ovals = NULL;
@@ -1396,19 +1308,20 @@ static void show_ddir_status_terse(struct thread_stat *ts,
         else
                 log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
  
-       if (ts->lat_percentiles)
+       if (ts->lat_percentiles) {
                 len = calc_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir],
                                         ts->lat_stat[ddir].samples,
                                         ts->percentile_list, &ovals, &maxv,
                                         &minv);
-       else if (ts->clat_percentiles)
+       } else if (ts->clat_percentiles) {
                 len = calc_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir],
                                         ts->clat_stat[ddir].samples,
                                         ts->percentile_list, &ovals, &maxv,
                                         &minv);
-       else
+       } else {
                 len = 0;
-       
+       }
+
         for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
                 if (i >= len) {
                         log_buf(out, ";0%%=0");
@@ -1435,8 +1348,9 @@ static void show_ddir_status_terse(struct thread_stat *ts,
                 }
  
                 log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
-       } else
+       } else {
                 log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0);
+       }
  
         if (ver == 5) {
                 if (bw_stat)
@@ -1456,28 +1370,19 @@ static void show_mixed_ddir_status_terse(struct thread_stat *ts,
                                    struct group_run_stats *rs,
                                    int ver, struct buf_output *out)
  {
-       struct thread_stat *ts_lcl;
+       struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
  
-       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
-       ts_lcl = malloc(sizeof(struct thread_stat));
-       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       init_thread_stat_min_vals(ts_lcl);
-       ts_lcl->lat_percentiles = ts->lat_percentiles;
-       ts_lcl->clat_percentiles = ts->clat_percentiles;
-       ts_lcl->slat_percentiles = ts->slat_percentiles;
-       ts_lcl->percentile_precision = ts->percentile_precision;                
-       memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
-       
-       sum_thread_stats(ts_lcl, ts, 1);
+       if (ts_lcl)
+               show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
  
-       /* add the aggregated stats to json parent */
-       show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
+       free_clat_prio_stats(ts_lcl);
         free(ts_lcl);
  }
  
-static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles,
-               struct io_stat *lat_stat, uint64_t *io_u_plat)
+static struct json_object *add_ddir_lat_json(struct thread_stat *ts,
+                                            uint32_t percentiles,
+                                            struct io_stat *lat_stat,
+                                            uint64_t *io_u_plat)
  {
         char buf[120];
         double mean, dev;
@@ -1527,7 +1432,8 @@ static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t pe
  }
  
  static void add_ddir_status_json(struct thread_stat *ts,
-               struct group_run_stats *rs, int ddir, struct json_object *parent)
+                                struct group_run_stats *rs, enum fio_ddir ddir,
+                                struct json_object *parent)
  {
         unsigned long long min, max;
         unsigned long long bw_bytes, bw;
@@ -1587,25 +1493,37 @@ static void add_ddir_status_json(struct thread_stat *ts,
         if (!ddir_rw(ddir))
                 return;
  
-       /* Only print PRIO latencies if some high priority samples were gathered */
-       if (ts->clat_high_prio_stat[ddir].samples > 0) {
-               const char *high, *low;
+       /* Only include per prio stats if there are >= 2 prios with samples */
+       if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+               struct json_array *array = json_create_array();
+               const char *obj_name;
+               int i;
  
-               if (ts->lat_percentiles) {
-                       high = "lat_high_prio";
-                       low = "lat_low_prio";
-               } else {
-                       high = "clat_high_prio";
-                       low = "clat_low_prio";
+               if (ts->lat_percentiles)
+                       obj_name = "lat_ns";
+               else
+                       obj_name = "clat_ns";
+
+               json_object_add_value_array(dir_object, "prios", array);
+
+               for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+                       if (ts->clat_prio[ddir][i].clat_stat.samples > 0) {
+                               struct json_object *obj = json_create_object();
+                               unsigned long long class, level;
+
+                               class = ts->clat_prio[ddir][i].ioprio >> 13;
+                               json_object_add_value_int(obj, "prioclass", class);
+                               level = ts->clat_prio[ddir][i].ioprio & 7;
+                               json_object_add_value_int(obj, "prio", level);
+
+                               tmp_object = add_ddir_lat_json(ts,
+                                                              ts->clat_percentiles | ts->lat_percentiles,
+                                                              &ts->clat_prio[ddir][i].clat_stat,
+                                                              ts->clat_prio[ddir][i].io_u_plat);
+                               json_object_add_value_object(obj, obj_name, tmp_object);
+                               json_array_add_value_object(array, obj);
+                       }
                 }
-
-               tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
-                               &ts->clat_high_prio_stat[ddir], ts->io_u_plat_high_prio[ddir]);
-               json_object_add_value_object(dir_object, high, tmp_object);
-
-               tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
-                               &ts->clat_low_prio_stat[ddir], ts->io_u_plat_low_prio[ddir]);
-               json_object_add_value_object(dir_object, low, tmp_object);
         }
  
         if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
@@ -1648,23 +1566,13 @@ static void add_ddir_status_json(struct thread_stat *ts,
  static void add_mixed_ddir_status_json(struct thread_stat *ts,
                 struct group_run_stats *rs, struct json_object *parent)
  {
-       struct thread_stat *ts_lcl;
-
-       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
-       ts_lcl = malloc(sizeof(struct thread_stat));
-       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
-       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
-       init_thread_stat_min_vals(ts_lcl);
-       ts_lcl->lat_percentiles = ts->lat_percentiles;
-       ts_lcl->clat_percentiles = ts->clat_percentiles;
-       ts_lcl->slat_percentiles = ts->slat_percentiles;
-       ts_lcl->percentile_precision = ts->percentile_precision;                
-       memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
-
-       sum_thread_stats(ts_lcl, ts, 1);
+       struct thread_stat *ts_lcl = gen_mixed_ddir_stats_from_ts(ts);
  
         /* add the aggregated stats to json parent */
-       add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+       if (ts_lcl)
+               add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+
+       free_clat_prio_stats(ts_lcl);
         free(ts_lcl);
  }
  
@@ -2073,9 +1981,10 @@ static void __sum_stat(struct io_stat *dst, struct io_stat *src, bool first)
   * numbers. For group_reporting, we should just add those up, not make
   * them the mean of everything.
   */
-static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first,
-                    bool pure_sum)
+static void sum_stat(struct io_stat *dst, struct io_stat *src, bool pure_sum)
  {
+       bool first = dst->samples == 0;
+
         if (src->samples == 0)
                 return;
  
@@ -2125,48 +2034,248 @@ void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src)
                 dst->sig_figs = src->sig_figs;
  }
  
-void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
-                     bool first)
+/*
+ * Free the clat_prio_stat arrays allocated by alloc_clat_prio_stat_ddir().
+ *
+ * For every data direction the array is handed to sfree() and the pointer
+ * and element count in 'ts' are reset. A NULL 'ts' is accepted and ignored:
+ * callers pass the result of a helper that may have returned NULL.
+ */
+void free_clat_prio_stats(struct thread_stat *ts)
+{
+       enum fio_ddir ddir;
+
+       if (!ts)
+               return;
+
+       for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+               sfree(ts->clat_prio[ddir]);
+               ts->clat_prio[ddir] = NULL;
+               ts->nr_clat_prio[ddir] = 0;
+       }
+}
+
+/*
+ * Set up the per-priority clat array of 'ts' for one data direction.
+ *
+ * The storage is obtained with scalloc() and has to be returned with sfree(),
+ * so that the process/thread summing the thread_stats can access it. Every
+ * entry's clat_stat.min_val is initialised to ULONG_MAX.
+ *
+ * Returns 0 on success, 1 if the allocation failed.
+ */
+int alloc_clat_prio_stat_ddir(struct thread_stat *ts, enum fio_ddir ddir,
+                             int nr_prios)
+{
+       struct clat_prio_stat *prios;
+       int n;
+
+       prios = scalloc(nr_prios, sizeof(*ts->clat_prio[ddir]));
+       if (prios == NULL) {
+               log_err("fio: failed to allocate ts clat data\n");
+               return 1;
+       }
+
+       n = nr_prios;
+       while (n-- > 0)
+               prios[n].clat_stat.min_val = ULONG_MAX;
+
+       ts->nr_clat_prio[ddir] = nr_prios;
+       ts->clat_prio[ddir] = prios;
+
+       return 0;
+}
+
+/*
+ * Extend dst->clat_prio[ddir] by one entry.
+ *
+ * A new array of curr_len + 1 elements is obtained from scalloc(), the
+ * existing entries are copied into it and the old array is released. The new
+ * last entry gets clat_stat.min_val = ULONG_MAX.
+ *
+ * Returns 0 on success, 1 if the allocation failed (dst is left unchanged).
+ */
+static int grow_clat_prio_stat(struct thread_stat *dst, enum fio_ddir ddir)
+{
+       int curr_len = dst->nr_clat_prio[ddir];
+       void *new_arr;
+
+       new_arr = scalloc(curr_len + 1, sizeof(*dst->clat_prio[ddir]));
+       if (!new_arr) {
+               log_err("fio: failed to grow clat prio array\n");
+               return 1;
+       }
+
+       /*
+        * On the first grow the old array pointer can still be NULL. memcpy()
+        * requires valid pointers even for a zero-length copy, so only copy
+        * when there are existing entries.
+        */
+       if (curr_len)
+               memcpy(new_arr, dst->clat_prio[ddir],
+                      curr_len * sizeof(*dst->clat_prio[ddir]));
+       sfree(dst->clat_prio[ddir]);
+
+       dst->clat_prio[ddir] = new_arr;
+       dst->clat_prio[ddir][curr_len].clat_stat.min_val = ULONG_MAX;
+       dst->nr_clat_prio[ddir]++;
+
+       return 0;
+}
+
+/*
+ * Look up the entry of dst->clat_prio[ddir] whose ioprio equals 'ioprio'.
+ *
+ * Returns the index of the first matching entry, or -1 if none matches.
+ */
+static int find_clat_prio_index(struct thread_stat *dst, enum fio_ddir ddir,
+                               uint32_t ioprio)
+{
+       const int count = dst->nr_clat_prio[ddir];
+       int pos = 0;
+
+       while (pos < count) {
+               if (dst->clat_prio[ddir][pos].ioprio == ioprio)
+                       return pos;
+               pos++;
+       }
+
+       return -1;
+}
+
+/*
+ * Find the entry for 'ioprio' in dst->clat_prio[ddir], creating it at the
+ * end of the array when it does not exist yet.
+ *
+ * On success the entry's index is stored in '*idx' and 0 is returned. If the
+ * array could not be grown, 1 is returned and '*idx' is not written.
+ */
+static int alloc_or_get_clat_prio_index(struct thread_stat *dst,
+                                       enum fio_ddir ddir, uint32_t ioprio,
+                                       int *idx)
+{
+       int slot = find_clat_prio_index(dst, ddir, ioprio);
+
+       if (slot != -1) {
+               *idx = slot;
+               return 0;
+       }
+
+       /* Unknown priority: the new entry lands at the current array end. */
+       slot = dst->nr_clat_prio[ddir];
+       if (grow_clat_prio_stat(dst, ddir))
+               return 1;
+
+       dst->clat_prio[ddir][slot].ioprio = ioprio;
+       *idx = slot;
+
+       return 0;
+}
+
+/*
+ * Give dst a byte-for-byte copy of src's per-prio array.
+ *
+ * The copy of src->clat_prio[src_ddir] is placed in storage obtained from
+ * smalloc() and installed as dst->clat_prio[dst_ddir], together with the
+ * element count.
+ *
+ * Returns 0 on success, 1 if the allocation failed.
+ */
+static int clat_prio_stats_copy(struct thread_stat *dst, struct thread_stat *src,
+                               enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+       size_t nbytes;
+
+       nbytes = sizeof(*src->clat_prio[src_ddir]) * src->nr_clat_prio[src_ddir];
+
+       dst->clat_prio[dst_ddir] = smalloc(nbytes);
+       if (dst->clat_prio[dst_ddir] == NULL) {
+               log_err("fio: failed to alloc clat prio array\n");
+               return 1;
+       }
+
+       dst->nr_clat_prio[dst_ddir] = src->nr_clat_prio[src_ddir];
+       memcpy(dst->clat_prio[dst_ddir], src->clat_prio[src_ddir], nbytes);
+
+       return 0;
+}
+
+/*
+ * Add one priority's samples to the matching entry of
+ * dst->clat_prio[dst_ddir], creating that entry if needed.
+ *
+ * 'io_stat' is merged with sum_stat() and the FIO_IO_U_PLAT_NR buckets of
+ * 'io_u_plat' are added to the entry's histogram. A source without samples
+ * is ignored and leaves dst untouched.
+ *
+ * Returns 0 on success, 1 if the entry could not be allocated.
+ */
+static int clat_prio_stat_add_samples(struct thread_stat *dst,
+                                     enum fio_ddir dst_ddir, uint32_t ioprio,
+                                     struct io_stat *io_stat,
+                                     uint64_t *io_u_plat)
+{
+       struct clat_prio_stat *target;
+       int slot, bucket;
+
+       if (io_stat->samples == 0)
+               return 0;
+
+       if (alloc_or_get_clat_prio_index(dst, dst_ddir, ioprio, &slot))
+               return 1;
+
+       target = &dst->clat_prio[dst_ddir][slot];
+       sum_stat(&target->clat_stat, io_stat, false);
+
+       for (bucket = 0; bucket < FIO_IO_U_PLAT_NR; bucket++)
+               target->io_u_plat[bucket] += io_u_plat[bucket];
+
+       return 0;
+}
+
+/*
+ * Fold the samples of a src ts that has no per-prio array into dst's per-prio
+ * array for 'dst_ddir'.
+ *
+ * With no clat_prio array in src (asserted below), all of its I/Os were
+ * submitted using src->ioprio. Its global lat_stat (when lat_percentiles is
+ * set) or clat_stat, along with the matching io_u_plat histogram, therefore
+ * serves as the 'per prio' data for src->ioprio.
+ *
+ * Returns 0 on success, 1 on allocation failure.
+ */
+static int sum_clat_prio_stats_src_single_prio(struct thread_stat *dst,
+                                              struct thread_stat *src,
+                                              enum fio_ddir dst_ddir,
+                                              enum fio_ddir src_ddir)
+{
+       struct io_stat *stats;
+       uint64_t *plat;
+
+       assert(!src->clat_prio[src_ddir]);
+       assert(src->nr_clat_prio[src_ddir] == 0);
+
+       stats = src->lat_percentiles ? &src->lat_stat[src_ddir] :
+                                      &src->clat_stat[src_ddir];
+       plat = src->lat_percentiles ? src->io_u_plat[FIO_LAT][src_ddir] :
+                                     src->io_u_plat[FIO_CLAT][src_ddir];
+
+       return clat_prio_stat_add_samples(dst, dst_ddir, src->ioprio, stats,
+                                         plat);
+}
+
+/*
+ * Fold the per-prio array of a src ts into dst's per-prio array for
+ * 'dst_ddir'.
+ *
+ * A src with a clat_prio array (asserted below) had multiple prios in use,
+ * i.e. cmdprio_percentage or cmdprio_bssplit was set. Samples for its default
+ * prio live in that array like those of every other prio.
+ *
+ * Returns 0 on success, 1 on allocation failure.
+ */
+static int sum_clat_prio_stats_src_multi_prio(struct thread_stat *dst,
+                                             struct thread_stat *src,
+                                             enum fio_ddir dst_ddir,
+                                             enum fio_ddir src_ddir)
+{
+       int n;
+
+       assert(src->clat_prio[src_ddir]);
+       assert(src->nr_clat_prio[src_ddir]);
+
+       /* dst has no array yet: a plain copy of src's array is enough. */
+       if (!dst->clat_prio[dst_ddir])
+               return clat_prio_stats_copy(dst, src, dst_ddir, src_ddir);
+
+       /* dst already has an array: merge src into it, one prio at a time. */
+       for (n = 0; n < src->nr_clat_prio[src_ddir]; n++) {
+               struct clat_prio_stat *entry = &src->clat_prio[src_ddir][n];
+
+               if (clat_prio_stat_add_samples(dst, dst_ddir, entry->ioprio,
+                                              &entry->clat_stat,
+                                              entry->io_u_plat))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Merge src's per-prio latency data for 'src_ddir' into dst's per-prio array
+ * for 'dst_ddir'.
+ *
+ * Does nothing when dst->disable_prio_stat is set. Otherwise the work is
+ * dispatched on whether src owns a clat_prio array for 'src_ddir'.
+ *
+ * Returns 0 on success (or when disabled), 1 on allocation failure.
+ */
+static int sum_clat_prio_stats(struct thread_stat *dst, struct thread_stat *src,
+                              enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+       if (dst->disable_prio_stat)
+               return 0;
+
+       if (src->clat_prio[src_ddir])
+               return sum_clat_prio_stats_src_multi_prio(dst, src, dst_ddir,
+                                                         src_ddir);
+
+       return sum_clat_prio_stats_src_single_prio(dst, src, dst_ddir,
+                                                  src_ddir);
+}
+
+void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
  {
         int k, l, m;
  
         for (l = 0; l < DDIR_RWDIR_CNT; l++) {
-               if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
-                       sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
-                       sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
-                       sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
-                       sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false);
-                       sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false);
-                       sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true);
-                       sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, true);
+               if (dst->unified_rw_rep != UNIFIED_MIXED) {
+                       sum_stat(&dst->clat_stat[l], &src->clat_stat[l], false);
+                       sum_stat(&dst->slat_stat[l], &src->slat_stat[l], false);
+                       sum_stat(&dst->lat_stat[l], &src->lat_stat[l], false);
+                       sum_stat(&dst->bw_stat[l], &src->bw_stat[l], true);
+                       sum_stat(&dst->iops_stat[l], &src->iops_stat[l], true);
+                       sum_clat_prio_stats(dst, src, l, l);
  
                         dst->io_bytes[l] += src->io_bytes[l];
  
                         if (dst->runtime[l] < src->runtime[l])
                                 dst->runtime[l] = src->runtime[l];
                 } else {
-                       sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false);
-                       sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], first, false);
-                       sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], first, false);
-                       sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false);
-                       sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false);
-                       sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true);
-                       sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true);
+                       sum_stat(&dst->clat_stat[0], &src->clat_stat[l], false);
+                       sum_stat(&dst->slat_stat[0], &src->slat_stat[l], false);
+                       sum_stat(&dst->lat_stat[0], &src->lat_stat[l], false);
+                       sum_stat(&dst->bw_stat[0], &src->bw_stat[l], true);
+                       sum_stat(&dst->iops_stat[0], &src->iops_stat[l], true);
+                       sum_clat_prio_stats(dst, src, 0, l);
  
                         dst->io_bytes[0] += src->io_bytes[l];
  
                         if (dst->runtime[0] < src->runtime[l])
                                 dst->runtime[0] = src->runtime[l];
-
-                       /*
-                        * We're summing to the same destination, so override
-                        * 'first' after the first iteration of the loop
-                        */
-                       first = false;
                 }
         }
  
-       sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
+       sum_stat(&dst->sync_stat, &src->sync_stat, false);
         dst->usr_time += src->usr_time;
         dst->sys_time += src->sys_time;
         dst->ctx += src->ctx;
@@ -2187,7 +2296,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
                 dst->io_u_lat_m[k] += src->io_u_lat_m[k];
  
         for (k = 0; k < DDIR_RWDIR_CNT; k++) {
-               if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
+               if (dst->unified_rw_rep != UNIFIED_MIXED) {
                         dst->total_io_u[k] += src->total_io_u[k];
                         dst->short_io_u[k] += src->short_io_u[k];
                         dst->drop_io_u[k] += src->drop_io_u[k];
@@ -2203,7 +2312,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
         for (k = 0; k < FIO_LAT_CNT; k++)
                 for (l = 0; l < DDIR_RWDIR_CNT; l++)
                         for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
-                               if (!(dst->unified_rw_rep == UNIFIED_MIXED))
+                               if (dst->unified_rw_rep != UNIFIED_MIXED)
                                         dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m];
                                 else
                                         dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m];
@@ -2211,19 +2320,6 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
         for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
                 dst->io_u_sync_plat[k] += src->io_u_sync_plat[k];
  
-       for (k = 0; k < DDIR_RWDIR_CNT; k++) {
-               for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
-                       if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
-                               dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
-                               dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
-                       } else {
-                               dst->io_u_plat_high_prio[0][m] += src->io_u_plat_high_prio[k][m];
-                               dst->io_u_plat_low_prio[0][m] += src->io_u_plat_low_prio[k][m];
-                       }
-
-               }
-       }
-
         dst->total_run_time += src->total_run_time;
         dst->total_submit += src->total_submit;
         dst->total_complete += src->total_complete;
@@ -2251,8 +2347,6 @@ void init_thread_stat_min_vals(struct thread_stat *ts)
                 ts->lat_stat[i].min_val = ULONG_MAX;
                 ts->bw_stat[i].min_val = ULONG_MAX;
                 ts->iops_stat[i].min_val = ULONG_MAX;
-               ts->clat_high_prio_stat[i].min_val = ULONG_MAX;
-               ts->clat_low_prio_stat[i].min_val = ULONG_MAX;
         }
         ts->sync_stat.min_val = ULONG_MAX;
  }
@@ -2265,6 +2359,58 @@ void init_thread_stat(struct thread_stat *ts)
         ts->groupid = -1;
  }
  
+/*
+ * Decide, for each of the 'nr_ts' entries in 'threadstats', whether per prio
+ * stats are disabled (ts->disable_prio_stat), and record the ioprio of the
+ * first td mapped to each entry in ts->ioprio.
+ *
+ * A ts keeps per prio stats enabled when at least one td mapped to it has a
+ * clat_prio array, or when the tds mapped to it do not all share the same
+ * ioprio. In every other case disable_prio_stat ends up as 1.
+ *
+ * NOTE(review): the j/idx/last_ts bookkeeping looks like it is meant to match
+ * the td -> ts mapping done in __show_run_stats() - confirm that the two
+ * loops stay in step.
+ * NOTE(review): the final inversion assumes every disable_prio_stat is 0 on
+ * entry - confirm against the threadstats initialisation.
+ */
+static void init_per_prio_stats(struct thread_stat *threadstats, int nr_ts)
+{
+       struct thread_data *td;
+       struct thread_stat *ts;
+       int i, j, last_ts, idx;
+       enum fio_ddir ddir;
+
+       /* j: current ts slot, idx: tds counted into it, last_ts: last groupid */
+       j = 0;
+       last_ts = -1;
+       idx = 0;
+
+       /*
+        * Loop through all tds, if a td requires per prio stats, temporarily
+        * store a 1 in ts->disable_prio_stat, and then do an additional
+        * loop at the end where we invert the ts->disable_prio_stat values.
+        */
+       for_each_td(td, i) {
+               if (!td->o.stats)
+                       continue;
+               /*
+                * Move on to the next ts slot once the current one has seen a
+                * td and this td either does not use group reporting or has a
+                * different groupid than the previous td.
+                */
+               if (idx &&
+                   (!td->o.group_reporting ||
+                    (td->o.group_reporting && last_ts != td->groupid))) {
+                       idx = 0;
+                       j++;
+               }
+
+               last_ts = td->groupid;
+               ts = &threadstats[j];
+
+               /* idx == 0 means first td in group, or td is not in a group. */
+               if (idx == 0)
+                       ts->ioprio = td->ioprio;
+               else if (td->ioprio != ts->ioprio)
+                       ts->disable_prio_stat = 1;
+
+               /* A clat_prio array for any ddir means this td needs them. */
+               for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+                       if (td->ts.clat_prio[ddir]) {
+                               ts->disable_prio_stat = 1;
+                               break;
+                       }
+               }
+
+               idx++;
+       }
+
+       /*
+        * Loop through all dst threadstats and fixup the values: the 1 stored
+        * above meant "needed", so flip it into the real "disabled" flag.
+        */
+       for (i = 0; i < nr_ts; i++) {
+               ts = &threadstats[i];
+               ts->disable_prio_stat = !ts->disable_prio_stat;
+       }
+}
+
  void __show_run_stats(void)
  {
         struct group_run_stats *runstats, *rs;
@@ -2311,6 +2457,8 @@ void __show_run_stats(void)
                 opt_lists[i] = NULL;
         }
  
+       init_per_prio_stats(threadstats, nr_ts);
+
         j = 0;
         last_ts = -1;
         idx = 0;
@@ -2335,7 +2483,6 @@ void __show_run_stats(void)
                 opt_lists[j] = &td->opt_list;
  
                 idx++;
-               ts->members++;
  
                 if (ts->groupid == -1) {
                         /*
@@ -2400,7 +2547,9 @@ void __show_run_stats(void)
                 for (k = 0; k < ts->nr_block_infos; k++)
                         ts->block_infos[k] = td->ts.block_infos[k];
  
-               sum_thread_stats(ts, &td->ts, idx == 1);
+               sum_thread_stats(ts, &td->ts);
+
+               ts->members++;
  
                 if (td->o.ss_dur) {
                         ts->ss_state = td->ss.state;
@@ -2450,7 +2599,7 @@ void __show_run_stats(void)
         }
  
         for (i = 0; i < groupid + 1; i++) {
-               int ddir;
+               enum fio_ddir ddir;
  
                 rs = &runstats[i];
  
@@ -2556,6 +2705,12 @@ void __show_run_stats(void)
  
         log_info_flush();
         free(runstats);
+
+       /* free arrays allocated by sum_thread_stats(), if any */
+       for (i = 0; i < nr_ts; i++) {
+               ts = &threadstats[i];
+               free_clat_prio_stats(ts);
+       }
         free(threadstats);
         free(opt_lists);
  }
@@ -2682,6 +2837,14 @@ static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
         is->samples++;
  }
  
+/*
+ * Record the latency sample 'nsec' in entry 'clat_prio_index' of the
+ * 'clat_prio' array. Does nothing when 'clat_prio' is NULL.
+ */
+static inline void add_stat_prio_sample(struct clat_prio_stat *clat_prio,
+                                       unsigned short clat_prio_index,
+                                       unsigned long long nsec)
+{
+       struct clat_prio_stat *slot;
+
+       if (!clat_prio)
+               return;
+
+       slot = clat_prio + clat_prio_index;
+       add_stat_sample(&slot->clat_stat, nsec);
+}
+
  /*
   * Return a struct io_logs, which is added to the tail of the log
   * list for 'iolog'.
@@ -2879,14 +3042,36 @@ static inline void reset_io_stat(struct io_stat *ios)
         ios->mean.u.f = ios->S.u.f = 0;
  }
  
+/* Zero all FIO_IO_U_PLAT_NR buckets of the histogram 'io_u_plat'. */
+static inline void reset_io_u_plat(uint64_t *io_u_plat)
+{
+       uint64_t *bucket = io_u_plat;
+       uint64_t *const end = io_u_plat + FIO_IO_U_PLAT_NR;
+
+       while (bucket < end)
+               *bucket++ = 0;
+}
+
+/*
+ * Reset the clat_stat and io_u_plat histogram of every per-prio entry of
+ * 'ts', for all data directions that have a clat_prio array. The arrays
+ * themselves and their ioprio values are kept.
+ */
+static inline void reset_clat_prio_stats(struct thread_stat *ts)
+{
+       enum fio_ddir ddir;
+
+       for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+               struct clat_prio_stat *prios = ts->clat_prio[ddir];
+               int n;
+
+               if (prios == NULL)
+                       continue;
+
+               for (n = 0; n < ts->nr_clat_prio[ddir]; n++) {
+                       reset_io_stat(&prios[n].clat_stat);
+                       reset_io_u_plat(prios[n].io_u_plat);
+               }
+       }
+}
+
  void reset_io_stats(struct thread_data *td)
  {
         struct thread_stat *ts = &td->ts;
-       int i, j, k;
+       int i, j;
  
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-               reset_io_stat(&ts->clat_high_prio_stat[i]);
-               reset_io_stat(&ts->clat_low_prio_stat[i]);
                 reset_io_stat(&ts->clat_stat[i]);
                 reset_io_stat(&ts->slat_stat[i]);
                 reset_io_stat(&ts->lat_stat[i]);
@@ -2898,21 +3083,16 @@ void reset_io_stats(struct thread_data *td)
                 ts->total_io_u[i] = 0;
                 ts->short_io_u[i] = 0;
                 ts->drop_io_u[i] = 0;
-
-               for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
-                       ts->io_u_plat_high_prio[i][j] = 0;
-                       ts->io_u_plat_low_prio[i][j] = 0;
-                       if (!i)
-                               ts->io_u_sync_plat[j] = 0;
-               }
         }
  
         for (i = 0; i < FIO_LAT_CNT; i++)
                 for (j = 0; j < DDIR_RWDIR_CNT; j++)
-                       for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
-                               ts->io_u_plat[i][j][k] = 0;
+                       reset_io_u_plat(ts->io_u_plat[i][j]);
+
+       reset_clat_prio_stats(ts);
  
         ts->total_io_u[DDIR_SYNC] = 0;
+       reset_io_u_plat(ts->io_u_sync_plat);
  
         for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
                 ts->io_u_map[i] = 0;
@@ -2958,7 +3138,7 @@ static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
  static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
                              bool log_max)
  {
-       int ddir;
+       enum fio_ddir ddir;
  
         for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
                 __add_stat_to_log(iolog, ddir, elapsed, log_max);
@@ -3063,22 +3243,21 @@ static inline void add_lat_percentile_sample(struct thread_stat *ts,
         ts->io_u_plat[lat][ddir][idx]++;
  }
  
-static inline void add_lat_percentile_prio_sample(struct thread_stat *ts,
-                                                 unsigned long long nsec,
-                                                 enum fio_ddir ddir,
-                                                 bool high_prio)
+static inline void
+add_lat_percentile_prio_sample(struct thread_stat *ts, unsigned long long nsec,
+                              enum fio_ddir ddir,
+                              unsigned short clat_prio_index)
  {
         unsigned int idx = plat_val_to_idx(nsec);
  
-       if (!high_prio)
-               ts->io_u_plat_low_prio[ddir][idx]++;
-       else
-               ts->io_u_plat_high_prio[ddir][idx]++;
+       if (ts->clat_prio[ddir])
+               ts->clat_prio[ddir][clat_prio_index].io_u_plat[idx]++;
  }
  
  void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
                      unsigned long long nsec, unsigned long long bs,
-                    uint64_t offset, unsigned int ioprio, bool high_prio)
+                    uint64_t offset, unsigned int ioprio,
+                    unsigned short clat_prio_index)
  {
         const bool needs_lock = td_async_processing(td);
         unsigned long elapsed, this_window;
@@ -3091,7 +3270,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
         add_stat_sample(&ts->clat_stat[ddir], nsec);
  
         /*
-        * When lat_percentiles=1 (default 0), the reported high/low priority
+        * When lat_percentiles=1 (default 0), the reported per priority
          * percentiles and stats are used for describing total latency values,
          * even though the variable names themselves start with clat_.
          *
@@ -3099,12 +3278,9 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
          * lat_percentiles=0. add_lat_sample() will add the prio stat sample
          * when lat_percentiles=1.
          */
-       if (!ts->lat_percentiles) {
-               if (high_prio)
-                       add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
-               else
-                       add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
-       }
+       if (!ts->lat_percentiles)
+               add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+                                    nsec);
  
         if (td->clat_log)
                 add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
@@ -3119,7 +3295,7 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
                 add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT);
                 if (!ts->lat_percentiles)
                         add_lat_percentile_prio_sample(ts, nsec, ddir,
-                                                      high_prio);
+                                                      clat_prio_index);
         }
  
         if (iolog && iolog->hist_msec) {
@@ -3192,7 +3368,8 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
  
  void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
                     unsigned long long nsec, unsigned long long bs,
-                   uint64_t offset, unsigned int ioprio, bool high_prio)
+                   uint64_t offset, unsigned int ioprio,
+                   unsigned short clat_prio_index)
  {
         const bool needs_lock = td_async_processing(td);
         struct thread_stat *ts = &td->ts;
@@ -3210,7 +3387,7 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
                                offset, ioprio);
  
         /*
-        * When lat_percentiles=1 (default 0), the reported high/low priority
+        * When lat_percentiles=1 (default 0), the reported per priority
          * percentiles and stats are used for describing total latency values,
          * even though the variable names themselves start with clat_.
          *
@@ -3221,12 +3398,9 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
          */
         if (ts->lat_percentiles) {
                 add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT);
-               add_lat_percentile_prio_sample(ts, nsec, ddir, high_prio);
-               if (high_prio)
-                       add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
-               else
-                       add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
-
+               add_lat_percentile_prio_sample(ts, nsec, ddir, clat_prio_index);
+               add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+                                    nsec);
         }
         if (needs_lock)
                 __td_io_u_unlock(td);
diff --git a/stat.h b/stat.h

index 9ef8caa438870a97eb6ff6fb7a4786d86d9db4af..dce0bb0dc9b416a136ba1f3eff4a088ce947d5f8 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -158,6 +158,12 @@ enum fio_lat {
         FIO_LAT_CNT = 3,
  };
  
+struct clat_prio_stat {
+       uint64_t io_u_plat[FIO_IO_U_PLAT_NR];   /* sample count per latency bucket */
+       struct io_stat clat_stat;       /* summary stats of the nsec samples */
+       uint32_t ioprio;        /* NOTE(review): presumably the I/O priority this entry covers - confirm */
+};
+
  struct thread_stat {
         char name[FIO_JOBNAME_SIZE];
         char verror[FIO_VERROR_SIZE];
@@ -168,6 +174,7 @@ struct thread_stat {
         char description[FIO_JOBDESC_SIZE];
         uint32_t members;
         uint32_t unified_rw_rep;
+       uint32_t disable_prio_stat;
  
         /*
          * bandwidth and latency stats
@@ -252,21 +259,40 @@ struct thread_stat {
         fio_fp64_t ss_deviation;
         fio_fp64_t ss_criterion;
  
-       uint64_t io_u_plat_high_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR] __attribute__((aligned(8)));;
-       uint64_t io_u_plat_low_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
-       struct io_stat clat_high_prio_stat[DDIR_RWDIR_CNT] __attribute__((aligned(8)));
-       struct io_stat clat_low_prio_stat[DDIR_RWDIR_CNT];
+       /* A mirror of td->ioprio. */
+       uint32_t ioprio;
  
         union {
                 uint64_t *ss_iops_data;
+               /*
+                * For FIO_NET_CMD_TS, the pointed to data will temporarily
+                * be stored at this offset from the start of the payload.
+                */
+               uint64_t ss_iops_data_offset;
                 uint64_t pad4;
         };
  
         union {
                 uint64_t *ss_bw_data;
+               /*
+                * For FIO_NET_CMD_TS, the pointed to data will temporarily
+                * be stored at this offset from the start of the payload.
+                */
+               uint64_t ss_bw_data_offset;
                 uint64_t pad5;
         };
  
+       union {
+               struct clat_prio_stat *clat_prio[DDIR_RWDIR_CNT];
+               /*
+                * For FIO_NET_CMD_TS, the pointed to data will temporarily
+                * be stored at this offset from the start of the payload.
+                */
+               uint64_t clat_prio_offset[DDIR_RWDIR_CNT];
+               uint64_t pad6;
+       };
+       uint32_t nr_clat_prio[DDIR_RWDIR_CNT];
+
         uint64_t cachehit;
         uint64_t cachemiss;
  } __attribute__((packed));
@@ -325,7 +351,7 @@ extern void __show_run_stats(void);
  extern int __show_running_run_stats(void);
  extern void show_running_run_stats(void);
  extern void check_for_running_stats(void);
-extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
+extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src);
  extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
  extern void init_thread_stat_min_vals(struct thread_stat *ts);
  extern void init_thread_stat(struct thread_stat *ts);
@@ -342,9 +368,9 @@ extern void update_rusage_stat(struct thread_data *);
  extern void clear_rusage_stat(struct thread_data *);
  
  extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-                          unsigned long long, uint64_t, unsigned int, bool);
+                          unsigned long long, uint64_t, unsigned int, unsigned short);
  extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
-                           unsigned long long, uint64_t, unsigned int, bool);
+                           unsigned long long, uint64_t, unsigned int, unsigned short);
  extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
                                 unsigned long long, uint64_t, unsigned int);
  extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long);
@@ -355,6 +381,8 @@ extern void add_bw_sample(struct thread_data *, struct io_u *,
  extern void add_sync_clat_sample(struct thread_stat *ts,
                                 unsigned long long nsec);
  extern int calc_log_samples(void);
+extern void free_clat_prio_stats(struct thread_stat *);
+extern int alloc_clat_prio_stat_ddir(struct thread_stat *, enum fio_ddir, int);
  
  extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *);
  extern void json_array_add_disk_util(struct disk_util_stat *dus,
diff --git a/t/io_uring.c b/t/io_uring.c

index a98f78fd4a768af2736edb5c090282947b99e4f1..e8365a79d2f024bf409b90d3e0e2a4f8cc4ac514 100644 (file)
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -634,7 +634,8 @@ static int submitter_init(struct submitter *s)
  #ifdef CONFIG_LIBAIO
  static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
  {
-       unsigned long offset, data;
+       uint64_t data;
+       long long offset;
         struct file *f;
         unsigned index;
         long r;
@@ -663,7 +664,7 @@ static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocb
  
                 data = f->fileno;
                 if (stats && stats_running)
-                       data |= ((unsigned long) s->clock_index << 32);
+                       data |= (((uint64_t) s->clock_index) << 32);
                 iocb->data = (void *) (uintptr_t) data;
                 index++;
         }
@@ -676,7 +677,7 @@ static int reap_events_aio(struct submitter *s, struct io_event *events, int evs
         int reaped = 0;
  
         while (evs) {
-               unsigned long data = (uintptr_t) events[reaped].data;
+               uint64_t data = (uintptr_t) events[reaped].data;
                 struct file *f = &s->files[data & 0xffffffff];
  
                 f->pending_ios--;
@@ -1094,7 +1095,7 @@ static void usage(char *argv, int status)
                 " -a <bool> : Use legacy aio, default %d\n",
                 argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
                 fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
-               stats, runtime == 0 ? "unlimited" : runtime_str, aio, random_io);
+               stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio);
         exit(status);
  }
  
diff --git a/t/latency_percentiles.py b/t/latency_percentiles.py

index cc4374262efcca612e19f9dce6f969de399861fa..9e37d9fee54963a425ef9f6ebeadcf239944b92b 100755 (executable)
--- a/t/latency_percentiles.py
+++ b/t/latency_percentiles.py
@@ -80,6 +80,7 @@ import time
  import argparse
  import platform
  import subprocess
+from collections import Counter
  from pathlib import Path
  
  
@@ -125,7 +126,8 @@ class FioLatTest():
              "--output-format={output-format}".format(**self.test_options),
          ]
          for opt in ['slat_percentiles', 'clat_percentiles', 'lat_percentiles',
-                    'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', 'cmdprio_percentage']:
+                    'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs',
+                    'cmdprio_percentage', 'bssplit', 'cmdprio_bssplit']:
              if opt in self.test_options:
                  option = '--{0}={{{0}}}'.format(opt)
                  fio_args.append(option.format(**self.test_options))
@@ -363,20 +365,19 @@ class FioLatTest():
  
      def check_nocmdprio_lat(self, job):
          """
-        Make sure no high/low priority latencies appear.
+        Make sure no per priority latencies appear.
  
          job         JSON object to check
          """
  
          for ddir in ['read', 'write', 'trim']:
              if ddir in job:
-                if 'lat_high_prio' in job[ddir] or 'lat_low_prio' in job[ddir] or \
-                    'clat_high_prio' in job[ddir] or 'clat_low_prio' in job[ddir]:
-                    print("Unexpected high/low priority latencies found in %s output" % ddir)
+                if 'prios' in job[ddir]:
+                    print("Unexpected per priority latencies found in %s output" % ddir)
                      return False
  
          if self.debug:
-            print("No high/low priority latencies found")
+            print("No per priority latencies found")
  
          return True
  
@@ -497,7 +498,7 @@ class FioLatTest():
          return retval
  
      def check_prio_latencies(self, jsondata, clat=True, plus=False):
-        """Check consistency of high/low priority latencies.
+        """Check consistency of per priority latencies.
  
          clat                True if we should check clat data; other check lat data
          plus                True if we have json+ format data where additional checks can
@@ -506,78 +507,78 @@ class FioLatTest():
          """
  
          if clat:
-            high = 'clat_high_prio'
-            low = 'clat_low_prio'
-            combined = 'clat_ns'
+            obj = combined = 'clat_ns'
          else:
-            high = 'lat_high_prio'
-            low = 'lat_low_prio'
-            combined = 'lat_ns'
+            obj = combined = 'lat_ns'
  
-        if not high in jsondata or not low in jsondata or not combined in jsondata:
-            print("Error identifying high/low priority latencies")
+        if not 'prios' in jsondata or not combined in jsondata:
+            print("Error identifying per priority latencies")
              return False
  
-        if jsondata[high]['N'] + jsondata[low]['N'] != jsondata[combined]['N']:
-            print("High %d + low %d != combined sample size %d" % \
-                    (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+        sum_sample_size = sum([x[obj]['N'] for x in jsondata['prios']])
+        if sum_sample_size != jsondata[combined]['N']:
+            print("Per prio sample size sum %d != combined sample size %d" %
+                  (sum_sample_size, jsondata[combined]['N']))
              return False
          elif self.debug:
-            print("High %d + low %d == combined sample size %d" % \
-                    (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+            print("Per prio sample size sum %d == combined sample size %d" %
+                  (sum_sample_size, jsondata[combined]['N']))
  
-        if min(jsondata[high]['min'], jsondata[low]['min']) != jsondata[combined]['min']:
-            print("Min of high %d, low %d min latencies does not match min %d from combined data" % \
-                    (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+        min_val = min([x[obj]['min'] for x in jsondata['prios']])
+        if min_val != jsondata[combined]['min']:
+            print("Min per prio min latency %d does not match min %d from combined data" %
+                  (min_val, jsondata[combined]['min']))
              return False
          elif self.debug:
-            print("Min of high %d, low %d min latencies matches min %d from combined data" % \
-                    (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+            print("Min per prio min latency %d matches min %d from combined data" %
+                  (min_val, jsondata[combined]['min']))
  
-        if max(jsondata[high]['max'], jsondata[low]['max']) != jsondata[combined]['max']:
-            print("Max of high %d, low %d max latencies does not match max %d from combined data" % \
-                    (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+        max_val = max([x[obj]['max'] for x in jsondata['prios']])
+        if max_val != jsondata[combined]['max']:
+            print("Max per prio max latency %d does not match max %d from combined data" %
+                  (max_val, jsondata[combined]['max']))
              return False
          elif self.debug:
-            print("Max of high %d, low %d max latencies matches max %d from combined data" % \
-                    (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+            print("Max per prio max latency %d matches max %d from combined data" %
+                  (max_val, jsondata[combined]['max']))
  
-        weighted_avg = (jsondata[high]['mean'] * jsondata[high]['N'] + \
-                        jsondata[low]['mean'] * jsondata[low]['N']) / jsondata[combined]['N']
+        weighted_vals = [x[obj]['mean'] * x[obj]['N'] for x in jsondata['prios']]
+        weighted_avg = sum(weighted_vals) / jsondata[combined]['N']
          delta = abs(weighted_avg - jsondata[combined]['mean'])
          if (delta / jsondata[combined]['mean']) > 0.0001:
-            print("Difference between weighted average %f of high, low means "
+            print("Difference between merged per prio weighted average %f mean "
                    "and actual mean %f exceeds 0.01%%" % (weighted_avg, jsondata[combined]['mean']))
              return False
          elif self.debug:
-            print("Weighted average %f of high, low means matches actual mean %f" % \
-                    (weighted_avg, jsondata[combined]['mean']))
+            print("Merged per prio weighted average %f mean matches actual mean %f" %
+                  (weighted_avg, jsondata[combined]['mean']))
  
          if plus:
-            if not self.check_jsonplus(jsondata[high]):
-                return False
-            if not self.check_jsonplus(jsondata[low]):
-                return False
+            for prio in jsondata['prios']:
+                if not self.check_jsonplus(prio[obj]):
+                    return False
  
-            bins = {**jsondata[high]['bins'], **jsondata[low]['bins']}
-            for duration in bins.keys():
-                if duration in jsondata[high]['bins'] and duration in jsondata[low]['bins']:
-                    bins[duration] = jsondata[high]['bins'][duration] + \
-                            jsondata[low]['bins'][duration]
+            counter = Counter()
+            for prio in jsondata['prios']:
+                counter.update(prio[obj]['bins'])
+
+            bins = dict(counter)
  
              if len(bins) != len(jsondata[combined]['bins']):
-                print("Number of combined high/low bins does not match number of overall bins")
+                print("Number of merged bins %d does not match number of overall bins %d" %
+                      (len(bins), len(jsondata[combined]['bins'])))
                  return False
              elif self.debug:
-                print("Number of bins from merged high/low data matches number of overall bins")
+                print("Number of merged bins %d matches number of overall bins %d" %
+                      (len(bins), len(jsondata[combined]['bins'])))
  
              for duration in bins.keys():
                  if bins[duration] != jsondata[combined]['bins'][duration]:
-                    print("Merged high/low count does not match overall count for duration %d" \
-                            % duration)
+                    print("Merged per prio count does not match overall count for duration %s" %
+                          duration)  # bin keys are JSON object keys (str); %d would raise TypeError
                      return False
  
-        print("Merged high/low priority latency data match combined latency data")
+        print("Merged per priority latency data match combined latency data")
          return True
  
      def check(self):
@@ -602,7 +603,7 @@ class Test001(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, slat=False)
@@ -626,7 +627,7 @@ class Test002(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['write'], 1, slat=False, clat=False)
@@ -650,7 +651,7 @@ class Test003(FioLatTest):
              print("Unexpected write data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['trim'], 2, slat=False, tlat=False)
@@ -674,7 +675,7 @@ class Test004(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, plus=True)
@@ -698,7 +699,7 @@ class Test005(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
@@ -722,7 +723,7 @@ class Test006(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
@@ -743,7 +744,7 @@ class Test007(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True)
@@ -761,11 +762,11 @@ class Test008(FioLatTest):
          job = self.json_data['jobs'][0]
  
          retval = True
-        if 'read' in job or 'write'in job or 'trim' in job:
+        if 'read' in job or 'write' in job or 'trim' in job:
              print("Unexpected data direction found in fio output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True)
@@ -792,7 +793,7 @@ class Test009(FioLatTest):
              print("Error checking fsync latency data")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
@@ -813,7 +814,7 @@ class Test010(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, plus=True)
@@ -839,7 +840,7 @@ class Test011(FioLatTest):
              print("Unexpected trim data found in output")
              retval = False
          if not self.check_nocmdprio_lat(job):
-            print("Unexpected high/low priority latencies found")
+            print("Unexpected per priority latencies found")
              retval = False
  
          retval &= self.check_latencies(job['read'], 0, slat=False, clat=False, plus=True)
@@ -953,7 +954,7 @@ class Test019(FioLatTest):
          job = self.json_data['jobs'][0]
  
          retval = True
-        if 'read' in job or 'write'in job or 'trim' in job:
+        if 'read' in job or 'write' in job or 'trim' in job:
              print("Unexpected data direction found in fio output")
              retval = False
  
@@ -963,6 +964,27 @@ class Test019(FioLatTest):
          return retval
  
  
+class Test021(FioLatTest):
+    """Test object for Tests 21 and 22 (bssplit with cmdprio_bssplit)."""
+
+    def check(self):
+        """Check clat and per priority clat data for reads and writes."""
+
+        job = self.json_data['jobs'][0]
+
+        retval = True
+        if not self.check_empty(job['trim']):
+            print("Unexpected trim data found in output")
+            retval = False
+
+        retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
+        retval &= self.check_latencies(job['write'], 1, slat=False, tlat=False, plus=True)
+        retval &= self.check_prio_latencies(job['read'], clat=True, plus=True)
+        retval &= self.check_prio_latencies(job['write'], clat=True, plus=True)
+
+        return retval
+
+
  def parse_args():
      """Parse command-line arguments."""
  
@@ -1007,7 +1029,7 @@ def main():
              # randread, null
              # enable slat, clat, lat
              # only clat and lat will appear because
-            # because the null ioengine is syncrhonous
+            # the null ioengine is synchronous
              "test_id": 1,
              "runtime": 2,
              "output-format": "json",
@@ -1047,7 +1069,7 @@ def main():
          {
              # randread, aio
              # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
+            # all will appear because libaio is asynchronous
              "test_id": 4,
              "runtime": 5,
              "output-format": "json+",
@@ -1153,9 +1175,9 @@ def main():
              # randread, null
              # enable slat, clat, lat
              # only clat and lat will appear because
-            # because the null ioengine is syncrhonous
-            # same as Test 1 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # the null ioengine is synchronous
+            # same as Test 1 except add numjobs = 4 to test
+            # sum_thread_stats() changes
              "test_id": 12,
              "runtime": 2,
              "output-format": "json",
@@ -1170,9 +1192,9 @@ def main():
          {
              # randread, aio
              # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
-            # same as Test 4 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # all will appear because libaio is asynchronous
+            # same as Test 4 except add numjobs = 4 to test
+            # sum_thread_stats() changes
              "test_id": 13,
              "runtime": 5,
              "output-format": "json+",
@@ -1187,8 +1209,8 @@ def main():
          {
              # 50/50 r/w, aio, unified_rw_reporting
              # enable slat, clat, lata
-            # same as Test 8 except
-            # numjobs = 4 to test sum_thread_stats() changes
+            # same as Test 8 except add numjobs = 4 to test
+            # sum_thread_stats() changes
              "test_id": 14,
              "runtime": 5,
              "output-format": "json+",
@@ -1204,7 +1226,7 @@ def main():
          {
              # randread, aio
              # enable slat, clat, lat
-            # all will appear because liaio is asynchronous
+            # all will appear because libaio is asynchronous
              # same as Test 4 except add cmdprio_percentage
              "test_id": 15,
              "runtime": 5,
@@ -1278,8 +1300,8 @@ def main():
          {
              # 50/50 r/w, aio, unified_rw_reporting
              # enable slat, clat, lat
-            # same as Test 19 except
-            # add numjobs = 4 to test sum_thread_stats() changes
+            # same as Test 19 except add numjobs = 4 to test
+            # sum_thread_stats() changes
              "test_id": 20,
              "runtime": 5,
              "output-format": "json+",
@@ -1293,6 +1315,40 @@ def main():
              'numjobs': 4,
              "test_obj": Test019,
          },
+        {
+            # r/w, aio
+            # enable only clat
+            # test bssplit and cmdprio_bssplit
+            "test_id": 21,
+            "runtime": 5,
+            "output-format": "json+",
+            "slat_percentiles": 0,
+            "clat_percentiles": 1,
+            "lat_percentiles": 0,
+            "ioengine": aio,
+            'rw': 'randrw',
+            'bssplit': '64k/40:1024k/60',
+            'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+            "test_obj": Test021,
+        },
+        {
+            # r/w, aio
+            # enable only clat
+            # same as Test 21 except add numjobs = 4 to test
+            # sum_thread_stats() changes
+            "test_id": 22,
+            "runtime": 5,
+            "output-format": "json+",
+            "slat_percentiles": 0,
+            "clat_percentiles": 1,
+            "lat_percentiles": 0,
+            "ioengine": aio,
+            'rw': 'randrw',
+            'bssplit': '64k/40:1024k/60',
+            'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+            'numjobs': 4,
+            "test_obj": Test021,
+        },
      ]
  
      passed = 0
@@ -1304,9 +1360,10 @@ def main():
             (args.run_only and test['test_id'] not in args.run_only):
              skipped = skipped + 1
              outcome = 'SKIPPED (User request)'
-        elif (platform.system() != 'Linux' or os.geteuid() != 0) and 'cmdprio_percentage' in test:
+        elif (platform.system() != 'Linux' or os.geteuid() != 0) and \
+             ('cmdprio_percentage' in test or 'cmdprio_bssplit' in test):
              skipped = skipped + 1
-            outcome = 'SKIPPED (Linux root required for cmdprio_percentage tests)'
+            outcome = 'SKIPPED (Linux root required for cmdprio tests)'
          else:
              test_obj = test['test_obj'](artifact_root, test, args.debug)
              status = test_obj.run_fio(fio)
diff --git a/t/zbd/functions b/t/zbd/functions

index e4e248b9ff26a0c99fe564598455ab4a86d5ac5f..7cff18fd18c0b50ae7ccf961fa665402d258ca71 100644 (file)
--- a/t/zbd/functions
+++ b/t/zbd/functions
@@ -72,9 +72,11 @@ zone_cap_bs() {
         local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p'
         local cap bs="$zone_size"
  
-       # When blkzone is not available or blkzone does not report capacity,
+       # When blkzone command is neither available nor relevant to the
+       # test device, or when blkzone command does not report capacity,
         # assume that zone capacity is same as zone size for all zones.
-       if [ -z "${blkzone}" ] || ! blkzone_reports_capacity "${dev}"; then
+       if [ -z "${blkzone}" ] || [ -z "$is_zbd" ] || [ -c "$dev" ] ||
+                  ! blkzone_reports_capacity "${dev}"; then
                 echo "$zone_size"
                 return
         fi
diff --git a/thread_options.h b/thread_options.h

index 450e7ddeee25d2ac471e79da637bbf8e168d4502..4162c42faf731db6ec1d434656032d31e8350777 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -50,6 +50,12 @@ struct split {
         unsigned long long val2[ZONESPLIT_MAX];
  };
  
+struct split_prio {
+       uint64_t bs;
+       int32_t prio;
+       uint32_t perc;
+};
+
  struct bssplit {
         uint64_t bs;
         uint32_t perc;
@@ -706,4 +712,8 @@ extern int str_split_parse(struct thread_data *td, char *str,
  extern int split_parse_ddir(struct thread_options *o, struct split *split,
                             char *str, bool absolute, unsigned int max_splits);
  
+extern int split_parse_prio_ddir(struct thread_options *o,
+                                struct split_prio **entries, int *nr_entries,
+                                char *str);
+
  #endif
diff --git a/zbd.c b/zbd.c

index c18998c46f5428c106f0f910e3901e279ca63dd0..b1fd6b4bb0ae23cda7dba13a3c0bf3577139600f 100644 (file)
--- a/zbd.c
+++ b/zbd.c
@@ -22,13 +22,126 @@
  #include "pshared.h"
  #include "zbd.h"
  
+static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
+{
+       return (uint64_t)(offset - f->file_offset) < f->io_size;
+}
+
+static inline unsigned int zbd_zone_idx(const struct fio_file *f,
+                                       struct fio_zone_info *zone)
+{
+       return zone - f->zbd_info->zone_info;
+}
+
+/**
+ * zbd_offset_to_zone_idx - convert an offset into a zone number
+ * @f: file pointer.
+ * @offset: offset in bytes. If this offset is in the first zone_size bytes
+ *         past the disk size then the index of the sentinel is returned.
+ */
+static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
+                                          uint64_t offset)
+{
+       uint32_t zone_idx;
+
+       if (f->zbd_info->zone_size_log2 > 0)
+               zone_idx = offset >> f->zbd_info->zone_size_log2;
+       else
+               zone_idx = offset / f->zbd_info->zone_size;
+
+       return min(zone_idx, f->zbd_info->nr_zones);
+}
+
+/**
+ * zbd_zone_end - Return zone end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
+{
+       return (z+1)->start;
+}
+
+/**
+ * zbd_zone_capacity_end - Return zone capacity limit end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+{
+       return z->start + z->capacity;
+}
+
+/**
+ * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
+ * @f: file pointer.
+ * @z: zone info pointer.
+ * @required: minimum number of bytes that must remain in a zone.
+ *
+ * The caller must hold z->mutex.
+ */
+static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
+                         uint64_t required)
+{
+       assert((required & 511) == 0);
+
+       return z->has_wp &&
+               z->wp + required > zbd_zone_capacity_end(z);
+}
+
+static void zone_lock(struct thread_data *td, const struct fio_file *f,
+                     struct fio_zone_info *z)
+{
+       struct zoned_block_device_info *zbd = f->zbd_info;
+       uint32_t nz = z - zbd->zone_info;
+
+       /* A thread should never lock zones outside its working area. */
+       assert(f->min_zone <= nz && nz < f->max_zone);
+
+       assert(z->has_wp);
+
+       /*
+        * Lock the io_u target zone. The zone will be unlocked if io_u offset
+        * is changed or when io_u completes and zbd_put_io() is executed.
+        * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
+        * other waiting for zone locks when building an io_u batch, first
+        * only trylock the zone. If the zone is already locked by another job,
+        * process the currently queued I/Os so that I/O progress is made and
+        * zones unlocked.
+        */
+       if (pthread_mutex_trylock(&z->mutex) != 0) {
+               if (!td_ioengine_flagged(td, FIO_SYNCIO))
+                       io_u_quiesce(td);
+               pthread_mutex_lock(&z->mutex);
+       }
+}
+
+static inline void zone_unlock(struct fio_zone_info *z)
+{
+       int ret;
+
+       assert(z->has_wp);
+       ret = pthread_mutex_unlock(&z->mutex);
+       assert(!ret);
+}
+
+static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
+                                                unsigned int zone_idx)
+{
+       return &f->zbd_info->zone_info[zone_idx];
+}
+
+static inline struct fio_zone_info *
+zbd_offset_to_zone(const struct fio_file *f,  uint64_t offset)
+{
+       return zbd_get_zone(f, zbd_offset_to_zone_idx(f, offset));
+}
+
  /**
   * zbd_get_zoned_model - Get a device zoned model
   * @td: FIO thread data
   * @f: FIO file for which to get model information
   */
-int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
-                       enum zbd_zoned_model *model)
+static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
+                              enum zbd_zoned_model *model)
  {
         int ret;
  
@@ -71,9 +184,9 @@ int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
   * upon failure. If the zone report is empty, always assume an error (device
   * problem) and return -EIO.
   */
-int zbd_report_zones(struct thread_data *td, struct fio_file *f,
-                    uint64_t offset, struct zbd_zone *zones,
-                    unsigned int nr_zones)
+static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
+                           uint64_t offset, struct zbd_zone *zones,
+                           unsigned int nr_zones)
  {
         int ret;
  
@@ -105,8 +218,8 @@ int zbd_report_zones(struct thread_data *td, struct fio_file *f,
   * Reset the write pointer of all zones in the range @offset...@offset+@length.
   * Returns 0 upon success and a negative error code upon failure.
   */
-int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
-                uint64_t offset, uint64_t length)
+static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
+                       uint64_t offset, uint64_t length)
  {
         int ret;
  
@@ -124,131 +237,233 @@ int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
  }
  
  /**
- * zbd_get_max_open_zones - Get the maximum number of open zones
- * @td: FIO thread data
- * @f: FIO file for which to get max open zones
- * @max_open_zones: Upon success, result will be stored here.
- *
- * A @max_open_zones value set to zero means no limit.
+ * zbd_reset_zone - reset the write pointer of a single zone
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to reset.
   *
   * Returns 0 upon success and a negative error code upon failure.
+ *
+ * The caller must hold z->mutex.
   */
-int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
-                          unsigned int *max_open_zones)
+static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
+                         struct fio_zone_info *z)
  {
-       int ret;
+       uint64_t offset = z->start;
+       uint64_t length = (z+1)->start - offset;
+       uint64_t data_in_zone = z->wp - z->start;
+       int ret = 0;
  
-       if (td->io_ops && td->io_ops->get_max_open_zones)
-               ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
-       else
-               ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
-       if (ret < 0) {
-               td_verror(td, errno, "get max open zones failed");
-               log_err("%s: get max open zones failed (%d).\n",
-                       f->file_name, errno);
+       if (!data_in_zone)
+               return 0;
+
+       assert(is_valid_offset(f, offset + length - 1));
+
+       dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
+              f->file_name, zbd_zone_idx(f, z));
+
+       switch (f->zbd_info->model) {
+       case ZBD_HOST_AWARE:
+       case ZBD_HOST_MANAGED:
+               ret = zbd_reset_wp(td, f, offset, length);
+               if (ret < 0)
+                       return ret;
+               break;
+       default:
+               break;
         }
  
+       pthread_mutex_lock(&f->zbd_info->mutex);
+       f->zbd_info->sectors_with_data -= data_in_zone;
+       f->zbd_info->wp_sectors_with_data -= data_in_zone;
+       pthread_mutex_unlock(&f->zbd_info->mutex);
+
+       z->wp = z->start;
+       z->verify_block = 0;
+
+       td->ts.nr_zone_resets++;
+
         return ret;
  }
  
  /**
- * zbd_zone_idx - convert an offset into a zone number
- * @f: file pointer.
- * @offset: offset in bytes. If this offset is in the first zone_size bytes
- *         past the disk size then the index of the sentinel is returned.
+ * zbd_close_zone - Remove a zone from the open zones array.
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to remove.
+ *
+ * The caller must hold f->zbd_info->mutex.
   */
-static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset)
+static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
+                          struct fio_zone_info *z)
  {
-       uint32_t zone_idx;
+       uint32_t ozi;
  
-       if (f->zbd_info->zone_size_log2 > 0)
-               zone_idx = offset >> f->zbd_info->zone_size_log2;
-       else
-               zone_idx = offset / f->zbd_info->zone_size;
+       if (!z->open)
+               return;
  
-       return min(zone_idx, f->zbd_info->nr_zones);
-}
+       for (ozi = 0; ozi < f->zbd_info->num_open_zones; ozi++) {
+               if (zbd_get_zone(f, f->zbd_info->open_zones[ozi]) == z)
+                       break;
+       }
+       if (ozi == f->zbd_info->num_open_zones)
+               return;
  
-/**
- * zbd_zone_end - Return zone end location
- * @z: zone info pointer.
- */
-static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
-{
-       return (z+1)->start;
+       dprint(FD_ZBD, "%s: closing zone %u\n",
+              f->file_name, zbd_zone_idx(f, z));
+
+       memmove(f->zbd_info->open_zones + ozi,
+               f->zbd_info->open_zones + ozi + 1,
+               (ZBD_MAX_OPEN_ZONES - (ozi + 1)) *
+               sizeof(f->zbd_info->open_zones[0]));
+
+       f->zbd_info->num_open_zones--;
+       td->num_open_zones--;
+       z->open = 0;
  }
  
  /**
- * zbd_zone_capacity_end - Return zone capacity limit end location
- * @z: zone info pointer.
+ * zbd_reset_zones - Reset a range of zones.
+ * @td: fio thread data.
+ * @f: fio file for which to reset zones
+ * @zb: first zone to reset.
+ * @ze: first zone not to reset.
+ *
+ * Returns 0 upon success and 1 upon failure.
   */
-static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
+                          struct fio_zone_info *const zb,
+                          struct fio_zone_info *const ze)
  {
-       return z->start + z->capacity;
+       struct fio_zone_info *z;
+       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+       int res = 0;
+
+       assert(min_bs);
+
+       dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
+              f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));
+
+       for (z = zb; z < ze; z++) {
+               if (!z->has_wp)
+                       continue;
+
+               zone_lock(td, f, z);
+               pthread_mutex_lock(&f->zbd_info->mutex);
+               zbd_close_zone(td, f, z);
+               pthread_mutex_unlock(&f->zbd_info->mutex);
+
+               if (z->wp != z->start) {
+                       dprint(FD_ZBD, "%s: resetting zone %u\n",
+                              f->file_name, zbd_zone_idx(f, z));
+                       if (zbd_reset_zone(td, f, z) < 0)
+                               res = 1;
+               }
+
+               zone_unlock(z);
+       }
+
+       return res;
  }
  
  /**
- * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
- * @f: file pointer.
- * @z: zone info pointer.
- * @required: minimum number of bytes that must remain in a zone.
+ * zbd_get_max_open_zones - Get the maximum number of open zones
+ * @td: FIO thread data
+ * @f: FIO file for which to get max open zones
+ * @max_open_zones: Upon success, result will be stored here.
   *
- * The caller must hold z->mutex.
+ * A @max_open_zones value set to zero means no limit.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
   */
-static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
-                         uint64_t required)
+static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+                                 unsigned int *max_open_zones)
  {
-       assert((required & 511) == 0);
+       int ret;
  
-       return z->has_wp &&
-               z->wp + required > zbd_zone_capacity_end(z);
+       if (td->io_ops && td->io_ops->get_max_open_zones)
+               ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
+       else
+               ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
+       if (ret < 0) {
+               td_verror(td, errno, "get max open zones failed");
+               log_err("%s: get max open zones failed (%d).\n",
+                       f->file_name, errno);
+       }
+
+       return ret;
  }
  
-static void zone_lock(struct thread_data *td, const struct fio_file *f,
-                     struct fio_zone_info *z)
+/**
+ * zbd_open_zone - Add a zone to the array of open zones.
+ * @td: fio thread data.
+ * @f: fio file that has the open zones to add.
+ * @z: Zone to add.
+ *
+ * Open a ZBD zone if it is not already open. Returns true if either the zone
+ * was already open or if the zone was successfully added to the array of open
+ * zones without exceeding the maximum number of open zones. Returns false if
+ * the zone was not already open and opening the zone would cause the zone limit
+ * to be exceeded.
+ */
+static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
+                         struct fio_zone_info *z)
  {
-       struct zoned_block_device_info *zbd = f->zbd_info;
-       uint32_t nz = z - zbd->zone_info;
+       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+       struct zoned_block_device_info *zbdi = f->zbd_info;
+       uint32_t zone_idx = zbd_zone_idx(f, z);
+       bool res = true;
  
-       /* A thread should never lock zones outside its working area. */
-       assert(f->min_zone <= nz && nz < f->max_zone);
+       if (z->cond == ZBD_ZONE_COND_OFFLINE)
+               return false;
  
-       assert(z->has_wp);
+       /*
+        * Skip full zones with data verification enabled because resetting a
+        * zone causes data loss and hence causes verification to fail.
+        */
+       if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
+               return false;
  
         /*
-        * Lock the io_u target zone. The zone will be unlocked if io_u offset
-        * is changed or when io_u completes and zbd_put_io() executed.
-        * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
-        * other waiting for zone locks when building an io_u batch, first
-        * only trylock the zone. If the zone is already locked by another job,
-        * process the currently queued I/Os so that I/O progress is made and
-        * zones unlocked.
+        * zbdi->max_open_zones == 0 means that there is no limit on the maximum
+        * number of open zones. In this case, do not track open zones in
+        * zbdi->open_zones array.
          */
-       if (pthread_mutex_trylock(&z->mutex) != 0) {
-               if (!td_ioengine_flagged(td, FIO_SYNCIO))
-                       io_u_quiesce(td);
-               pthread_mutex_lock(&z->mutex);
+       if (!zbdi->max_open_zones)
+               return true;
+
+       pthread_mutex_lock(&zbdi->mutex);
+
+       if (z->open) {
+               /*
+                * If the zone is going to be completely filled by writes
+                * already in-flight, handle it as a full zone instead of an
+                * open zone.
+                */
+               if (z->wp >= zbd_zone_capacity_end(z))
+                       res = false;
+               goto out;
         }
-}
  
-static inline void zone_unlock(struct fio_zone_info *z)
-{
-       int ret;
+       res = false;
+       /* Zero means no limit */
+       if (td->o.job_max_open_zones > 0 &&
+           td->num_open_zones >= td->o.job_max_open_zones)
+               goto out;
+       if (zbdi->num_open_zones >= zbdi->max_open_zones)
+               goto out;
  
-       assert(z->has_wp);
-       ret = pthread_mutex_unlock(&z->mutex);
-       assert(!ret);
-}
+       dprint(FD_ZBD, "%s: opening zone %u\n",
+              f->file_name, zone_idx);
  
-static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
-{
-       return (uint64_t)(offset - f->file_offset) < f->io_size;
-}
+       zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
+       td->num_open_zones++;
+       z->open = 1;
+       res = true;
  
-static inline struct fio_zone_info *get_zone(const struct fio_file *f,
-                                            unsigned int zone_nr)
-{
-       return &f->zbd_info->zone_info[zone_nr];
+out:
+       pthread_mutex_unlock(&zbdi->mutex);
+       return res;
  }
  
  /* Verify whether direct I/O is used for all host-managed zoned drives. */
@@ -277,15 +492,91 @@ static bool zbd_is_seq_job(struct fio_file *f)
         uint32_t zone_idx, zone_idx_b, zone_idx_e;
  
         assert(f->zbd_info);
+
         if (f->io_size == 0)
                 return false;
-       zone_idx_b = zbd_zone_idx(f, f->file_offset);
-       zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1);
+
+       zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
+       zone_idx_e =
+               zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
         for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
-               if (get_zone(f, zone_idx)->has_wp)
+               if (zbd_get_zone(f, zone_idx)->has_wp)
                         return true;
  
-       return false;
+       return false;
+}
+
+/*
+ * Verify whether the file offset and size parameters are aligned with zone
+ * boundaries. For jobs that are not read-only, an unaligned file offset is
+ * rounded up to the next zone start and the I/O range end is rounded down.
+ */
+static bool zbd_zone_align_file_sizes(struct thread_data *td,
+                                     struct fio_file *f)
+{
+       const struct fio_zone_info *z;
+       uint64_t new_offset, new_end;
+
+       if (!f->zbd_info)
+               return true;
+       if (f->file_offset >= f->real_file_size)
+               return true;
+       if (!zbd_is_seq_job(f))
+               return true;
+
+       if (!td->o.zone_size) {
+               td->o.zone_size = f->zbd_info->zone_size;
+               if (!td->o.zone_size) {
+                       log_err("%s: invalid 0 zone size\n",
+                               f->file_name);
+                       return false;
+               }
+       } else if (td->o.zone_size != f->zbd_info->zone_size) {
+               log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
+                       f->file_name, td->o.zone_size,
+                       f->zbd_info->zone_size);
+               return false;
+       }
+
+       if (td->o.zone_skip % td->o.zone_size) {
+               log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
+                       f->file_name, td->o.zone_skip,
+                       td->o.zone_size);
+               return false;
+       }
+
+       z = zbd_offset_to_zone(f, f->file_offset);
+       if ((f->file_offset != z->start) &&
+           (td->o.td_ddir != TD_DDIR_READ)) {
+               new_offset = zbd_zone_end(z);
+               if (new_offset >= f->file_offset + f->io_size) {
+                       log_info("%s: io_size must be at least one zone\n",
+                                f->file_name);
+                       return false;
+               }
+               log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
+                        f->file_name, f->file_offset,
+                        new_offset);
+               f->io_size -= (new_offset - f->file_offset);
+               f->file_offset = new_offset;
+       }
+
+       z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
+       new_end = z->start;
+       if ((td->o.td_ddir != TD_DDIR_READ) &&
+           (f->file_offset + f->io_size != new_end)) {
+               if (new_end <= f->file_offset) {
+                       log_info("%s: io_size must be at least one zone\n",
+                                f->file_name);
+                       return false;
+               }
+               log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
+                        f->file_name, f->io_size,
+                        new_end - f->file_offset);
+               f->io_size = new_end - f->file_offset;
+       }
+
+       return true;
  }
  
  /*
@@ -293,74 +584,14 @@ static bool zbd_is_seq_job(struct fio_file *f)
   */
  static bool zbd_verify_sizes(void)
  {
-       const struct fio_zone_info *z;
         struct thread_data *td;
         struct fio_file *f;
-       uint64_t new_offset, new_end;
-       uint32_t zone_idx;
         int i, j;
  
         for_each_td(td, i) {
                 for_each_file(td, f, j) {
-                       if (!f->zbd_info)
-                               continue;
-                       if (f->file_offset >= f->real_file_size)
-                               continue;
-                       if (!zbd_is_seq_job(f))
-                               continue;
-
-                       if (!td->o.zone_size) {
-                               td->o.zone_size = f->zbd_info->zone_size;
-                               if (!td->o.zone_size) {
-                                       log_err("%s: invalid 0 zone size\n",
-                                               f->file_name);
-                                       return false;
-                               }
-                       } else if (td->o.zone_size != f->zbd_info->zone_size) {
-                               log_err("%s: job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
-                                       f->file_name, td->o.zone_size,
-                                       f->zbd_info->zone_size);
-                               return false;
-                       }
-
-                       if (td->o.zone_skip % td->o.zone_size) {
-                               log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
-                                       f->file_name, td->o.zone_skip,
-                                       td->o.zone_size);
+                       if (!zbd_zone_align_file_sizes(td, f))
                                 return false;
-                       }
-
-                       zone_idx = zbd_zone_idx(f, f->file_offset);
-                       z = get_zone(f, zone_idx);
-                       if ((f->file_offset != z->start) &&
-                           (td->o.td_ddir != TD_DDIR_READ)) {
-                               new_offset = zbd_zone_end(z);
-                               if (new_offset >= f->file_offset + f->io_size) {
-                                       log_info("%s: io_size must be at least one zone\n",
-                                                f->file_name);
-                                       return false;
-                               }
-                               log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
-                                        f->file_name, f->file_offset,
-                                        new_offset);
-                               f->io_size -= (new_offset - f->file_offset);
-                               f->file_offset = new_offset;
-                       }
-                       zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size);
-                       z = get_zone(f, zone_idx);
-                       new_end = z->start;
-                       if ((td->o.td_ddir != TD_DDIR_READ) &&
-                           (f->file_offset + f->io_size != new_end)) {
-                               if (new_end <= f->file_offset) {
-                                       log_info("%s: io_size must be at least one zone\n",
-                                                f->file_name);
-                                       return false;
-                               }
-                               log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
-                                        f->file_name, f->io_size,
-                                        new_end - f->file_offset);
-                               f->io_size = new_end - f->file_offset;
-                       }
                 }
         }
  
@@ -385,6 +616,7 @@ static bool zbd_verify_bs(void)
  
                         if (!f->zbd_info)
                                 continue;
+
                         zone_size = f->zbd_info->zone_size;
                         if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
                                 log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
@@ -529,8 +761,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
                 goto out;
         }
  
-       dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n", f->file_name,
-              nr_zones, zone_size / 1024);
+       dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
+              f->file_name, nr_zones, zone_size / 1024);
  
         zbd_info = scalloc(1, sizeof(*zbd_info) +
                            (nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
@@ -546,6 +778,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
                                                      PTHREAD_MUTEX_RECURSIVE);
                         p->start = z->start;
                         p->capacity = z->capacity;
+
                         switch (z->cond) {
                         case ZBD_ZONE_COND_NOT_WP:
                         case ZBD_ZONE_COND_FULL:
@@ -579,6 +812,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
                 offset = z->start + z->len;
                 if (j >= nr_zones)
                         break;
+
                 nrz = zbd_report_zones(td, f, offset, zones,
                                        min((uint32_t)(nr_zones - j),
                                            ZBD_REPORT_MAX_ZONES));
@@ -646,7 +880,8 @@ out:
         /* Ensure that the limit is not larger than FIO's internal limit */
         if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
                 td_verror(td, EINVAL, "'max_open_zones' value is too large");
-               log_err("'max_open_zones' value is larger than %u\n", ZBD_MAX_OPEN_ZONES);
+               log_err("'max_open_zones' value is larger than %u\n",
+                       ZBD_MAX_OPEN_ZONES);
                 return -EINVAL;
         }
  
@@ -748,14 +983,10 @@ static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
         ret = zbd_create_zone_info(td, file);
         if (ret < 0)
                 td_verror(td, -ret, "zbd_create_zone_info() failed");
+
         return ret;
  }
  
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
-                         uint32_t zone_idx);
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
-                         struct fio_zone_info *z);
-
  int zbd_init_files(struct thread_data *td)
  {
         struct fio_file *f;
@@ -765,6 +996,7 @@ int zbd_init_files(struct thread_data *td)
                 if (zbd_init_zone_info(td, f))
                         return 1;
         }
+
         return 0;
  }
  
@@ -775,27 +1007,24 @@ void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
  
         for_each_file(td, f, i) {
                 struct zoned_block_device_info *zbd = f->zbd_info;
-               // zonemode=strided doesn't get per-file zone size.
-               uint64_t zone_size = zbd ? zbd->zone_size : td->o.zone_size;
+               uint64_t zone_size;
  
+               /* zonemode=strided doesn't get per-file zone size. */
+               zone_size = zbd ? zbd->zone_size : td->o.zone_size;
                 if (zone_size == 0)
                         continue;
  
-               if (td->o.size_nz > 0) {
+               if (td->o.size_nz > 0)
                         td->o.size = td->o.size_nz * zone_size;
-               }
-               if (td->o.io_size_nz > 0) {
+               if (td->o.io_size_nz > 0)
                         td->o.io_size = td->o.io_size_nz * zone_size;
-               }
-               if (td->o.start_offset_nz > 0) {
+               if (td->o.start_offset_nz > 0)
                         td->o.start_offset = td->o.start_offset_nz * zone_size;
-               }
-               if (td->o.offset_increment_nz > 0) {
-                       td->o.offset_increment = td->o.offset_increment_nz * zone_size;
-               }
-               if (td->o.zone_skip_nz > 0) {
+               if (td->o.offset_increment_nz > 0)
+                       td->o.offset_increment =
+                               td->o.offset_increment_nz * zone_size;
+               if (td->o.zone_skip_nz > 0)
                         td->o.zone_skip = td->o.zone_skip_nz * zone_size;
-               }
         }
  }
  
@@ -822,8 +1051,9 @@ int zbd_setup_files(struct thread_data *td)
  
                 assert(zbd);
  
-               f->min_zone = zbd_zone_idx(f, f->file_offset);
-               f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
+               f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
+               f->max_zone =
+                       zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
  
                 /*
                  * When all zones in the I/O range are conventional, io_size
@@ -863,7 +1093,7 @@ int zbd_setup_files(struct thread_data *td)
                         if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
                             z->cond != ZBD_ZONE_COND_EXP_OPEN)
                                 continue;
-                       if (zbd_open_zone(td, f, zi))
+                       if (zbd_open_zone(td, f, z))
                                 continue;
                         /*
                          * If the number of open zones exceeds specified limits,
@@ -879,123 +1109,6 @@ int zbd_setup_files(struct thread_data *td)
         return 0;
  }
  
-static inline unsigned int zbd_zone_nr(const struct fio_file *f,
-                                      struct fio_zone_info *zone)
-{
-       return zone - f->zbd_info->zone_info;
-}
-
-/**
- * zbd_reset_zone - reset the write pointer of a single zone
- * @td: FIO thread data.
- * @f: FIO file associated with the disk for which to reset a write pointer.
- * @z: Zone to reset.
- *
- * Returns 0 upon success and a negative error code upon failure.
- *
- * The caller must hold z->mutex.
- */
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
-                         struct fio_zone_info *z)
-{
-       uint64_t offset = z->start;
-       uint64_t length = (z+1)->start - offset;
-       uint64_t data_in_zone = z->wp - z->start;
-       int ret = 0;
-
-       if (!data_in_zone)
-               return 0;
-
-       assert(is_valid_offset(f, offset + length - 1));
-
-       dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
-               zbd_zone_nr(f, z));
-       switch (f->zbd_info->model) {
-       case ZBD_HOST_AWARE:
-       case ZBD_HOST_MANAGED:
-               ret = zbd_reset_wp(td, f, offset, length);
-               if (ret < 0)
-                       return ret;
-               break;
-       default:
-               break;
-       }
-
-       pthread_mutex_lock(&f->zbd_info->mutex);
-       f->zbd_info->sectors_with_data -= data_in_zone;
-       f->zbd_info->wp_sectors_with_data -= data_in_zone;
-       pthread_mutex_unlock(&f->zbd_info->mutex);
-       z->wp = z->start;
-       z->verify_block = 0;
-
-       td->ts.nr_zone_resets++;
-
-       return ret;
-}
-
-/* The caller must hold f->zbd_info->mutex */
-static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
-                          unsigned int zone_idx)
-{
-       uint32_t open_zone_idx = 0;
-
-       for (; open_zone_idx < f->zbd_info->num_open_zones; open_zone_idx++) {
-               if (f->zbd_info->open_zones[open_zone_idx] == zone_idx)
-                       break;
-       }
-       if (open_zone_idx == f->zbd_info->num_open_zones)
-               return;
-
-       dprint(FD_ZBD, "%s: closing zone %d\n", f->file_name, zone_idx);
-       memmove(f->zbd_info->open_zones + open_zone_idx,
-               f->zbd_info->open_zones + open_zone_idx + 1,
-               (ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) *
-               sizeof(f->zbd_info->open_zones[0]));
-       f->zbd_info->num_open_zones--;
-       td->num_open_zones--;
-       get_zone(f, zone_idx)->open = 0;
-}
-
-/*
- * Reset a range of zones. Returns 0 upon success and 1 upon failure.
- * @td: fio thread data.
- * @f: fio file for which to reset zones
- * @zb: first zone to reset.
- * @ze: first zone not to reset.
- */
-static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
-                          struct fio_zone_info *const zb,
-                          struct fio_zone_info *const ze)
-{
-       struct fio_zone_info *z;
-       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
-       int res = 0;
-
-       assert(min_bs);
-
-       dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name,
-               zbd_zone_nr(f, zb), zbd_zone_nr(f, ze));
-       for (z = zb; z < ze; z++) {
-               uint32_t nz = zbd_zone_nr(f, z);
-
-               if (!z->has_wp)
-                       continue;
-               zone_lock(td, f, z);
-               pthread_mutex_lock(&f->zbd_info->mutex);
-               zbd_close_zone(td, f, nz);
-               pthread_mutex_unlock(&f->zbd_info->mutex);
-               if (z->wp != z->start) {
-                       dprint(FD_ZBD, "%s: resetting zone %u\n",
-                              f->file_name, zbd_zone_nr(f, z));
-                       if (zbd_reset_zone(td, f, z) < 0)
-                               res = 1;
-               }
-               zone_unlock(z);
-       }
-
-       return res;
-}
-
  /*
   * Reset zbd_info.write_cnt, the counter that counts down towards the next
   * zone reset.
@@ -1046,8 +1159,8 @@ static uint64_t zbd_process_swd(struct thread_data *td,
         uint64_t swd = 0;
         uint64_t wp_swd = 0;
  
-       zb = get_zone(f, f->min_zone);
-       ze = get_zone(f, f->max_zone);
+       zb = zbd_get_zone(f, f->min_zone);
+       ze = zbd_get_zone(f, f->max_zone);
         for (z = zb; z < ze; z++) {
                 if (z->has_wp) {
                         zone_lock(td, f, z);
@@ -1055,6 +1168,7 @@ static uint64_t zbd_process_swd(struct thread_data *td,
                 }
                 swd += z->wp - z->start;
         }
+
         pthread_mutex_lock(&f->zbd_info->mutex);
         switch (a) {
         case CHECK_SWD:
@@ -1067,6 +1181,7 @@ static uint64_t zbd_process_swd(struct thread_data *td,
                 break;
         }
         pthread_mutex_unlock(&f->zbd_info->mutex);
+
         for (z = zb; z < ze; z++)
                 if (z->has_wp)
                         zone_unlock(z);
@@ -1097,11 +1212,13 @@ void zbd_file_reset(struct thread_data *td, struct fio_file *f)
         if (!f->zbd_info || !td_write(td))
                 return;
  
-       zb = get_zone(f, f->min_zone);
-       ze = get_zone(f, f->max_zone);
+       zb = zbd_get_zone(f, f->min_zone);
+       ze = zbd_get_zone(f, f->max_zone);
         swd = zbd_process_swd(td, f, SET_SWD);
-       dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name,
-              swd);
+
+       dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n",
+              __func__, f->file_name, swd);
+
         /*
          * If data verification is enabled reset the affected zones before
          * writing any data to avoid that a zone reset has to be issued while
@@ -1112,92 +1229,12 @@ void zbd_file_reset(struct thread_data *td, struct fio_file *f)
         zbd_reset_write_cnt(td, f);
  }
  
-/* The caller must hold f->zbd_info->mutex. */
-static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
-                        unsigned int zone_idx)
-{
-       struct zoned_block_device_info *zbdi = f->zbd_info;
-       int i;
-
-       /* This function should never be called when zbdi->max_open_zones == 0 */
-       assert(zbdi->max_open_zones);
-       assert(td->o.job_max_open_zones == 0 || td->num_open_zones <= td->o.job_max_open_zones);
-       assert(td->o.job_max_open_zones <= zbdi->max_open_zones);
-       assert(zbdi->num_open_zones <= zbdi->max_open_zones);
-
-       for (i = 0; i < zbdi->num_open_zones; i++)
-               if (zbdi->open_zones[i] == zone_idx)
-                       return true;
-
-       return false;
-}
-
-/*
- * Open a ZBD zone if it was not yet open. Returns true if either the zone was
- * already open or if opening a new zone is allowed. Returns false if the zone
- * was not yet open and opening a new zone would cause the zone limit to be
- * exceeded.
- */
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
-                         uint32_t zone_idx)
-{
-       const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
-       struct zoned_block_device_info *zbdi = f->zbd_info;
-       struct fio_zone_info *z = get_zone(f, zone_idx);
-       bool res = true;
-
-       if (z->cond == ZBD_ZONE_COND_OFFLINE)
-               return false;
-
-       /*
-        * Skip full zones with data verification enabled because resetting a
-        * zone causes data loss and hence causes verification to fail.
-        */
-       if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
-               return false;
-
-       /*
-        * zbdi->max_open_zones == 0 means that there is no limit on the maximum
-        * number of open zones. In this case, do no track open zones in
-        * zbdi->open_zones array.
-        */
-       if (!zbdi->max_open_zones)
-               return true;
-
-       pthread_mutex_lock(&zbdi->mutex);
-       if (is_zone_open(td, f, zone_idx)) {
-               /*
-                * If the zone is already open and going to be full by writes
-                * in-flight, handle it as a full zone instead of an open zone.
-                */
-               if (z->wp >= zbd_zone_capacity_end(z))
-                       res = false;
-               goto out;
-       }
-       res = false;
-       /* Zero means no limit */
-       if (td->o.job_max_open_zones > 0 &&
-           td->num_open_zones >= td->o.job_max_open_zones)
-               goto out;
-       if (zbdi->num_open_zones >= zbdi->max_open_zones)
-               goto out;
-       dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
-       zbdi->open_zones[zbdi->num_open_zones++] = zone_idx;
-       td->num_open_zones++;
-       z->open = 1;
-       res = true;
-
-out:
-       pthread_mutex_unlock(&zbdi->mutex);
-       return res;
-}
-
  /* Return random zone index for one of the open zones. */
  static uint32_t pick_random_zone_idx(const struct fio_file *f,
                                      const struct io_u *io_u)
  {
-       return (io_u->offset - f->file_offset) * f->zbd_info->num_open_zones /
-               f->io_size;
+       return (io_u->offset - f->file_offset) *
+               f->zbd_info->num_open_zones / f->io_size;
  }
  
  static bool any_io_in_flight(void)
@@ -1244,13 +1281,15 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                  */
                 zone_idx = zbdi->open_zones[pick_random_zone_idx(f, io_u)];
         } else {
-               zone_idx = zbd_zone_idx(f, io_u->offset);
+               zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
         }
         if (zone_idx < f->min_zone)
                 zone_idx = f->min_zone;
         else if (zone_idx >= f->max_zone)
                 zone_idx = f->max_zone - 1;
-       dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
+
+       dprint(FD_ZBD,
+              "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
                __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
  
         /*
@@ -1262,13 +1301,16 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
         for (;;) {
                 uint32_t tmp_idx;
  
-               z = get_zone(f, zone_idx);
+               z = zbd_get_zone(f, zone_idx);
                 if (z->has_wp)
                         zone_lock(td, f, z);
+
                 pthread_mutex_lock(&zbdi->mutex);
+
                 if (z->has_wp) {
                         if (z->cond != ZBD_ZONE_COND_OFFLINE &&
-                           zbdi->max_open_zones == 0 && td->o.job_max_open_zones == 0)
+                           zbdi->max_open_zones == 0 &&
+                           td->o.job_max_open_zones == 0)
                                 goto examine_zone;
                         if (zbdi->num_open_zones == 0) {
                                 dprint(FD_ZBD, "%s(%s): no zones are open\n",
@@ -1278,14 +1320,15 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
                 }
  
                 /*
-                * List of opened zones is per-device, shared across all threads.
-                * Start with quasi-random candidate zone.
-                * Ignore zones which don't belong to thread's offset/size area.
+                * List of opened zones is per-device, shared across all
+                * threads. Start with quasi-random candidate zone. Ignore
+                * zones which don't belong to thread's offset/size area.
                  */
                 open_zone_idx = pick_random_zone_idx(f, io_u);
                 assert(!open_zone_idx ||
                        open_zone_idx < zbdi->num_open_zones);
                 tmp_idx = open_zone_idx;
+
                 for (i = 0; i < zbdi->num_open_zones; i++) {
                         uint32_t tmpz;
  
@@ -1302,9 +1345,12 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
  
                 dprint(FD_ZBD, "%s(%s): no candidate zone\n",
                         __func__, f->file_name);
+
                 pthread_mutex_unlock(&zbdi->mutex);
+
                 if (z->has_wp)
                         zone_unlock(z);
+
                 return NULL;
  
  found_candidate_zone:
@@ -1312,7 +1358,9 @@ found_candidate_zone:
                 if (new_zone_idx == zone_idx)
                         break;
                 zone_idx = new_zone_idx;
+
                 pthread_mutex_unlock(&zbdi->mutex);
+
                 if (z->has_wp)
                         zone_unlock(z);
         }
@@ -1343,7 +1391,8 @@ open_other_zone:
          * zone close before opening a new zone.
          */
         if (wait_zone_close) {
-               dprint(FD_ZBD, "%s(%s): quiesce to allow open zones to close\n",
+               dprint(FD_ZBD,
+                      "%s(%s): quiesce to allow open zones to close\n",
                        __func__, f->file_name);
                 io_u_quiesce(td);
         }
@@ -1358,7 +1407,7 @@ retry:
                 if (!is_valid_offset(f, z->start)) {
                         /* Wrap-around. */
                         zone_idx = f->min_zone;
-                       z = get_zone(f, zone_idx);
+                       z = zbd_get_zone(f, zone_idx);
                 }
                 assert(is_valid_offset(f, z->start));
                 if (!z->has_wp)
@@ -1366,7 +1415,7 @@ retry:
                 zone_lock(td, f, z);
                 if (z->open)
                         continue;
-               if (zbd_open_zone(td, f, zone_idx))
+               if (zbd_open_zone(td, f, z))
                         goto out;
         }
  
@@ -1381,7 +1430,7 @@ retry:
                 pthread_mutex_unlock(&zbdi->mutex);
                 zone_unlock(z);
  
-               z = get_zone(f, zone_idx);
+               z = zbd_get_zone(f, zone_idx);
  
                 zone_lock(td, f, z);
                 if (z->wp + min_bs <= zbd_zone_capacity_end(z))
@@ -1396,7 +1445,8 @@ retry:
          */
         in_flight = any_io_in_flight();
         if (in_flight || should_retry) {
-               dprint(FD_ZBD, "%s(%s): wait zone close and retry open zones\n",
+               dprint(FD_ZBD,
+                      "%s(%s): wait zone close and retry open zones\n",
                        __func__, f->file_name);
                 pthread_mutex_unlock(&zbdi->mutex);
                 zone_unlock(z);
@@ -1407,17 +1457,22 @@ retry:
         }
  
         pthread_mutex_unlock(&zbdi->mutex);
+
         zone_unlock(z);
-       dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__,
-              f->file_name);
+
+       dprint(FD_ZBD, "%s(%s): did not open another zone\n",
+              __func__, f->file_name);
+
         return NULL;
  
  out:
-       dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name,
-              zone_idx);
+       dprint(FD_ZBD, "%s(%s): returning zone %d\n",
+              __func__, f->file_name, zone_idx);
+
         io_u->offset = z->start;
         assert(z->has_wp);
         assert(z->cond != ZBD_ZONE_COND_OFFLINE);
+
         return z;
  }
  
@@ -1429,25 +1484,27 @@ static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
         const struct fio_file *f = io_u->file;
         const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
  
-       if (!zbd_open_zone(td, f, zbd_zone_nr(f, z))) {
+       if (!zbd_open_zone(td, f, z)) {
                 zone_unlock(z);
                 z = zbd_convert_to_open_zone(td, io_u);
                 assert(z);
         }
  
         if (z->verify_block * min_bs >= z->capacity) {
-               log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n", f->file_name, z->verify_block,
-                       min_bs, z->capacity);
+               log_err("%s: %d * %"PRIu64" >= %"PRIu64"\n",
+                       f->file_name, z->verify_block, min_bs, z->capacity);
                 /*
                  * If the assertion below fails during a test run, adding
                  * "--experimental_verify=1" to the command line may help.
                  */
                 assert(false);
         }
+
         io_u->offset = z->start + z->verify_block * min_bs;
         if (io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
-               log_err("%s: %llu + %llu >= %"PRIu64"\n", f->file_name, io_u->offset,
-                       io_u->buflen, zbd_zone_capacity_end(z));
+               log_err("%s: %llu + %llu >= %"PRIu64"\n",
+                       f->file_name, io_u->offset, io_u->buflen,
+                       zbd_zone_capacity_end(z));
                 assert(false);
         }
         z->verify_block += io_u->buflen / min_bs;
@@ -1468,7 +1525,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
  {
         struct fio_file *f = io_u->file;
         struct fio_zone_info *z1, *z2;
-       const struct fio_zone_info *const zf = get_zone(f, f->min_zone);
+       const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
  
         /*
          * Skip to the next non-empty zone in case of sequential I/O and to
@@ -1485,6 +1542,7 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
                 } else if (!td_random(td)) {
                         break;
                 }
+
                 if (td_random(td) && z2 >= zf &&
                     z2->cond != ZBD_ZONE_COND_OFFLINE) {
                         if (z2->has_wp)
@@ -1495,8 +1553,11 @@ zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
                                 zone_unlock(z2);
                 }
         }
-       dprint(FD_ZBD, "%s: no zone has %"PRIu64" bytes of readable data\n",
+
+       dprint(FD_ZBD,
+              "%s: no zone has %"PRIu64" bytes of readable data\n",
                f->file_name, min_bytes);
+
         return NULL;
  }
  
@@ -1517,7 +1578,7 @@ static void zbd_end_zone_io(struct thread_data *td, const struct io_u *io_u,
         if (io_u->ddir == DDIR_WRITE &&
             io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
                 pthread_mutex_lock(&f->zbd_info->mutex);
-               zbd_close_zone(td, f, zbd_zone_nr(f, z));
+               zbd_close_zone(td, f, z);
                 pthread_mutex_unlock(&f->zbd_info->mutex);
         }
  }
@@ -1537,15 +1598,11 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
         const struct fio_file *f = io_u->file;
         struct zoned_block_device_info *zbd_info = f->zbd_info;
         struct fio_zone_info *z;
-       uint32_t zone_idx;
         uint64_t zone_end;
  
         assert(zbd_info);
  
-       zone_idx = zbd_zone_idx(f, io_u->offset);
-       assert(zone_idx < zbd_info->nr_zones);
-       z = get_zone(f, zone_idx);
-
+       z = zbd_offset_to_zone(f, io_u->offset);
         assert(z->has_wp);
  
         if (!success)
@@ -1553,17 +1610,18 @@ static void zbd_queue_io(struct thread_data *td, struct io_u *io_u, int q,
  
         dprint(FD_ZBD,
                "%s: queued I/O (%lld, %llu) for zone %u\n",
-              f->file_name, io_u->offset, io_u->buflen, zone_idx);
+              f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
  
         switch (io_u->ddir) {
         case DDIR_WRITE:
                 zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
                                zbd_zone_capacity_end(z));
-               pthread_mutex_lock(&zbd_info->mutex);
+
                 /*
                  * z->wp > zone_end means that one or more I/O errors
                  * have occurred.
                  */
+               pthread_mutex_lock(&zbd_info->mutex);
                 if (z->wp <= zone_end) {
                         zbd_info->sectors_with_data += zone_end - z->wp;
                         zbd_info->wp_sectors_with_data += zone_end - z->wp;
@@ -1595,19 +1653,15 @@ static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
         const struct fio_file *f = io_u->file;
         struct zoned_block_device_info *zbd_info = f->zbd_info;
         struct fio_zone_info *z;
-       uint32_t zone_idx;
  
         assert(zbd_info);
  
-       zone_idx = zbd_zone_idx(f, io_u->offset);
-       assert(zone_idx < zbd_info->nr_zones);
-       z = get_zone(f, zone_idx);
-
+       z = zbd_offset_to_zone(f, io_u->offset);
         assert(z->has_wp);
  
         dprint(FD_ZBD,
                "%s: terminate I/O (%lld, %llu) for zone %u\n",
-              f->file_name, io_u->offset, io_u->buflen, zone_idx);
+              f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
  
         zbd_end_zone_io(td, io_u, z);
  
@@ -1649,28 +1703,26 @@ void setup_zbd_zone_mode(struct thread_data *td, struct io_u *io_u)
         struct fio_file *f = io_u->file;
         enum fio_ddir ddir = io_u->ddir;
         struct fio_zone_info *z;
-       uint32_t zone_idx;
  
         assert(td->o.zone_mode == ZONE_MODE_ZBD);
         assert(td->o.zone_size);
         assert(f->zbd_info);
  
-       zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
-       z = get_zone(f, zone_idx);
+       z = zbd_offset_to_zone(f, f->last_pos[ddir]);
  
         /*
          * When the zone capacity is smaller than the zone size and the I/O is
          * sequential write, skip to zone end if the latest position is at the
          * zone capacity limit.
          */
-       if (z->capacity < f->zbd_info->zone_size && !td_random(td) &&
-           ddir == DDIR_WRITE &&
+       if (z->capacity < f->zbd_info->zone_size &&
+           !td_random(td) && ddir == DDIR_WRITE &&
             f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
                 dprint(FD_ZBD,
                        "%s: Jump from zone capacity limit to zone end:"
                        " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
                        f->file_name, f->last_pos[ddir],
-                      zbd_zone_end(z), zone_idx, z->capacity);
+                      zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
                 td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
                 f->last_pos[ddir] = zbd_zone_end(z);
         }
@@ -1751,7 +1803,6 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
  {
         struct fio_file *f = io_u->file;
         struct zoned_block_device_info *zbdi = f->zbd_info;
-       uint32_t zone_idx_b;
         struct fio_zone_info *zb, *zl, *orig_zb;
         uint32_t orig_len = io_u->buflen;
         uint64_t min_bs = td->o.min_bs[io_u->ddir];
@@ -1762,14 +1813,15 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
         assert(min_bs);
         assert(is_valid_offset(f, io_u->offset));
         assert(io_u->buflen);
-       zone_idx_b = zbd_zone_idx(f, io_u->offset);
-       zb = get_zone(f, zone_idx_b);
+
+       zb = zbd_offset_to_zone(f, io_u->offset);
         orig_zb = zb;
  
         if (!zb->has_wp) {
                 /* Accept non-write I/Os for conventional zones. */
                 if (io_u->ddir != DDIR_WRITE)
                         return io_u_accept;
+
                 /*
                  * Make sure that writes to conventional zones
                  * don't cross over to any sequential zones.
@@ -1783,12 +1835,16 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
                                f->file_name, io_u->offset,
                                min_bs, (zb + 1)->start);
-                       io_u->offset = zb->start + (zb + 1)->start - io_u->offset;
-                       new_len = min(io_u->buflen, (zb + 1)->start - io_u->offset);
+                       io_u->offset =
+                               zb->start + (zb + 1)->start - io_u->offset;
+                       new_len = min(io_u->buflen,
+                                     (zb + 1)->start - io_u->offset);
                 } else {
                         new_len = (zb + 1)->start - io_u->offset;
                 }
+
                 io_u->buflen = new_len / min_bs * min_bs;
+
                 return io_u_accept;
         }
  
@@ -1810,6 +1866,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                         zb = zbd_replay_write_order(td, io_u, zb);
                         goto accept;
                 }
+
                 /*
                  * Check that there is enough written data in the zone to do an
                  * I/O of at least min_bs B. If there isn't, find a new zone for
@@ -1820,7 +1877,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                 if (range < min_bs ||
                     ((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
                         zone_unlock(zb);
-                       zl = get_zone(f, f->max_zone);
+                       zl = zbd_get_zone(f, f->max_zone);
                         zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
                         if (!zb) {
                                 dprint(FD_ZBD,
@@ -1839,6 +1896,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                         if (!td_random(td))
                                 io_u->offset = zb->start;
                 }
+
                 /*
                  * Make sure the I/O is within the zone valid data range while
                  * maximizing the I/O size and preserving randomness.
@@ -1849,12 +1907,14 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                         io_u->offset = zb->start +
                                 ((io_u->offset - orig_zb->start) %
                                  (range - io_u->buflen)) / min_bs * min_bs;
+
                 /*
                  * When zbd_find_zone() returns a conventional zone,
                  * we can simply accept the new i/o offset here.
                  */
                 if (!zb->has_wp)
                         return io_u_accept;
+
                 /*
                  * Make sure the I/O does not cross over the zone wp position.
                  */
@@ -1866,9 +1926,12 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                         dprint(FD_IO, "Changed length from %u into %llu\n",
                                orig_len, io_u->buflen);
                 }
+
                 assert(zb->start <= io_u->offset);
                 assert(io_u->offset + io_u->buflen <= zb->wp);
+
                 goto accept;
+
         case DDIR_WRITE:
                 if (io_u->buflen > zbdi->zone_size) {
                         td_verror(td, EINVAL, "I/O buflen exceeds zone size");
@@ -1877,7 +1940,8 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                f->file_name, io_u->buflen, zbdi->zone_size);
                         goto eof;
                 }
-               if (!zbd_open_zone(td, f, zone_idx_b)) {
+
+               if (!zbd_open_zone(td, f, zb)) {
                         zone_unlock(zb);
                         zb = zbd_convert_to_open_zone(td, io_u);
                         if (!zb) {
@@ -1886,14 +1950,14 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                 goto eof;
                         }
                 }
+
                 /* Check whether the zone reset threshold has been exceeded */
                 if (td->o.zrf.u.f) {
-                       if (zbdi->wp_sectors_with_data >=
-                           f->io_size * td->o.zrt.u.f &&
-                           zbd_dec_and_reset_write_cnt(td, f)) {
+                       if (zbdi->wp_sectors_with_data >= f->io_size * td->o.zrt.u.f &&
+                           zbd_dec_and_reset_write_cnt(td, f))
                                 zb->reset_zone = 1;
-                       }
                 }
+
                 /* Reset the zone pointer if necessary */
                 if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
                         assert(td->o.verify == VERIFY_NONE);
@@ -1916,6 +1980,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                 goto eof;
                         }
                 }
+
                 /* Make writes occur at the write pointer */
                 assert(!zbd_zone_full(f, zb, min_bs));
                 io_u->offset = zb->wp;
@@ -1925,6 +1990,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                f->file_name, io_u->offset);
                         goto eof;
                 }
+
                 /*
                  * Make sure that the buflen is a multiple of the minimal
                  * block size. Give up if shrinking would make the request too
@@ -1941,10 +2007,13 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                orig_len, io_u->buflen);
                         goto accept;
                 }
+
                 td_verror(td, EIO, "zone remainder too small");
                 log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
                         (zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
+
                 goto eof;
+
         case DDIR_TRIM:
                 /* Check random trim targets a non-empty zone */
                 if (!td_random(td) || zb->wp > zb->start)
@@ -1952,7 +2021,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
  
                 /* Find out a non-empty zone to trim */
                 zone_unlock(zb);
-               zl = get_zone(f, f->max_zone);
+               zl = zbd_get_zone(f, f->max_zone);
                 zb = zbd_find_zone(td, io_u, 1, zb, zl);
                 if (zb) {
                         io_u->offset = zb->start;
@@ -1960,7 +2029,9 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
                                f->file_name, io_u->offset);
                         goto accept;
                 }
+
                 goto eof;
+
         case DDIR_SYNC:
                 /* fall-through */
         case DDIR_DATASYNC:
@@ -1978,19 +2049,23 @@ accept:
         assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
         assert(!io_u->zbd_queue_io);
         assert(!io_u->zbd_put_io);
+
         io_u->zbd_queue_io = zbd_queue_io;
         io_u->zbd_put_io = zbd_put_io;
+
         /*
          * Since we return with the zone lock still held,
          * add an annotation to let Coverity know that it
          * is intentional.
          */
         /* coverity[missing_unlock] */
+
         return io_u_accept;
  
  eof:
         if (zb && zb->has_wp)
                 zone_unlock(zb);
+
         return io_u_eof;
  }
  
@@ -2018,17 +2093,15 @@ int zbd_do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
  {
         struct fio_file *f = io_u->file;
         struct fio_zone_info *z;
-       uint32_t zone_idx;
         int ret;
  
-       zone_idx = zbd_zone_idx(f, io_u->offset);
-       z = get_zone(f, zone_idx);
-
+       z = zbd_offset_to_zone(f, io_u->offset);
         if (!z->has_wp)
                 return 0;
  
         if (io_u->offset != z->start) {
-               log_err("Trim offset not at zone start (%lld)\n", io_u->offset);
+               log_err("Trim offset not at zone start (%lld)\n",
+                       io_u->offset);
                 return -EINVAL;
         }
author	Jens Axboe <axboe@kernel.dk>
	Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)
committer	Jens Axboe <axboe@kernel.dk>
	Thu, 3 Feb 2022 22:34:40 +0000 (15:34 -0700)
.github/workflows/ci.yml		patch \| blob \| blame \| history
.github/workflows/cifuzz.yml	[new file with mode: 0644]	patch \| blob
FIO-VERSION-GEN		patch \| blob \| blame \| history
HOWTO		patch \| blob \| blame \| history
Makefile		patch \| blob \| blame \| history
README	[deleted file]	patch \| blob \| blame \| history
README.rst	[new file with mode: 0644]	patch \| blob
backend.c		patch \| blob \| blame \| history
blktrace.c		patch \| blob \| blame \| history
blktrace.h		patch \| blob \| blame \| history
ci/actions-install.sh		patch \| blob \| blame \| history
client.c		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
doc/fio_doc.rst		patch \| blob \| blame \| history
doc/fio_man.rst		patch \| blob \| blame \| history
engines/cmdprio.c		patch \| blob \| blame \| history
engines/cmdprio.h		patch \| blob \| blame \| history
engines/filecreate.c		patch \| blob \| blame \| history
engines/filedelete.c		patch \| blob \| blame \| history
engines/filestat.c		patch \| blob \| blame \| history
engines/io_uring.c		patch \| blob \| blame \| history
engines/librpma_fio.c		patch \| blob \| blame \| history
engines/librpma_fio.h		patch \| blob \| blame \| history
engines/sg.c		patch \| blob \| blame \| history
engines/windowsaio.c		patch \| blob \| blame \| history
examples/cmdprio-bssplit.fio		patch \| blob \| blame \| history
examples/sg_verify-fail.fio	[new file with mode: 0644]	patch \| blob
examples/sg_verify.fio	[new file with mode: 0644]	patch \| blob
examples/sg_write_same_ndob.fio	[new file with mode: 0644]	patch \| blob
fio.1		patch \| blob \| blame \| history
fio.h		patch \| blob \| blame \| history
gclient.c		patch \| blob \| blame \| history
init.c		patch \| blob \| blame \| history
io_u.c		patch \| blob \| blame \| history
io_u.h		patch \| blob \| blame \| history
iolog.c		patch \| blob \| blame \| history
iolog.h		patch \| blob \| blame \| history
optgroup.h		patch \| blob \| blame \| history
options.c		patch \| blob \| blame \| history
os/os-windows.h		patch \| blob \| blame \| history
os/os.h		patch \| blob \| blame \| history
os/windows/posix.c		patch \| blob \| blame \| history
oslib/linux-dev-lookup.c		patch \| blob \| blame \| history
rate-submit.c		patch \| blob \| blame \| history
server.c		patch \| blob \| blame \| history
server.h		patch \| blob \| blame \| history
stat.c		patch \| blob \| blame \| history
stat.h		patch \| blob \| blame \| history
t/io_uring.c		patch \| blob \| blame \| history
t/latency_percentiles.py		patch \| blob \| blame \| history
t/zbd/functions		patch \| blob \| blame \| history
thread_options.h		patch \| blob \| blame \| history
zbd.c		patch \| blob \| blame \| history