Merge branch 'zbd-no-parallel-init' of https://github.com/floatious/fio master
author Jens Axboe <axboe@kernel.dk>
Thu, 22 Apr 2021 17:18:23 +0000 (11:18 -0600)
committer Jens Axboe <axboe@kernel.dk>
Thu, 22 Apr 2021 17:18:23 +0000 (11:18 -0600)
* 'zbd-no-parallel-init' of https://github.com/floatious/fio:
  init: zonemode=zbd does not work with create_serialize=0

52 files changed:
.gitignore
FIO-VERSION-GEN
HOWTO
Makefile
backend.c
cconv.c
ci/travis-install-librpma.sh [new file with mode: 0755]
ci/travis-install-pmdk.sh [new file with mode: 0755]
ci/travis-install.sh
configure
engines/dfs.c [new file with mode: 0644]
engines/falloc.c
engines/filecreate.c
engines/filedelete.c [new file with mode: 0644]
engines/io_uring.c
engines/librpma_apm.c [new file with mode: 0644]
engines/librpma_fio.c [new file with mode: 0644]
engines/librpma_fio.h [new file with mode: 0644]
engines/librpma_gpspm.c [new file with mode: 0644]
engines/librpma_gpspm_flush.pb-c.c [new file with mode: 0644]
engines/librpma_gpspm_flush.pb-c.h [new file with mode: 0644]
engines/librpma_gpspm_flush.proto [new file with mode: 0644]
engines/rados.c
eta.c
examples/dfs.fio [new file with mode: 0644]
examples/filedelete-ioengine.fio [new file with mode: 0644]
examples/librpma_apm-client.fio [new file with mode: 0644]
examples/librpma_apm-server.fio [new file with mode: 0644]
examples/librpma_gpspm-client.fio [new file with mode: 0644]
examples/librpma_gpspm-server.fio [new file with mode: 0644]
filesetup.c
fio.1
gettime.c
init.c
io_u.c
iolog.c
optgroup.c
optgroup.h
options.c
options.h
os/os-linux.h
parse.c
parse.h
server.c
server.h
stat.c
stat.h
t/io_uring.c
t/zbd/test-zbd-support
thread_options.h
zbd.c
zbd.h

index 0aa4a3611c031024f631418fee0fad1ba94d0cae..6651f96edc72ea3295c75cc9f9628eea9e267386 100644 (file)
@@ -30,3 +30,4 @@ doc/output
 /tags
 /TAGS
 /t/zbd/test-zbd-support.log.*
+/t/fuzz/fuzz_parseini
index 81a6355b981b1694bb79c3f073de9e693778e3b4..294860716cb75dc6e5dd099c7bc2cbb41cbae609 100755 (executable)
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.25
+DEF_VER=fio-3.26
 
 LF='
 '
diff --git a/HOWTO b/HOWTO
index 52812cc7de37e7d5d95c34c45e757c8126643709..e6078c5f1e16e1143d4057b9a1e03bad21954d1f 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -1146,11 +1146,31 @@ I/O type
        behaves in a similar fashion, except it sends the same offset 8 times
        before generating a new offset.
 
-.. option:: unified_rw_reporting=bool
+.. option:: unified_rw_reporting=str
 
        Fio normally reports statistics on a per data direction basis, meaning that
-       reads, writes, and trims are accounted and reported separately. If this
-       option is set fio sums the results and report them as "mixed" instead.
+       reads, writes, and trims are accounted and reported separately. This option
+       determines whether fio reports the results normally, summed together, or
+       both.
+       Accepted values are:
+
+               **none**
+                       Normal statistics reporting.
+
+               **mixed**
+                       Statistics are summed per data direction and reported together.
+
+               **both**
+                       Statistics are reported normally, followed by the mixed statistics.
+
+               **0**
+                       Backward-compatible alias for **none**.
+
+               **1**
+                       Backward-compatible alias for **mixed**.
+
+               **2**
+                       Alias for **both**.
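+
+       For example, ``unified_rw_reporting=both`` in a job file (or
+       ``--unified_rw_reporting=both`` on the command line) prints the
+       per-direction statistics followed by the mixed summary.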
 
 .. option:: randrepeat=bool
 
@@ -2035,6 +2055,11 @@ I/O engine
                        and 'nrfiles', so that files will be created.
                        This engine is used to measure file lookup and metadata access.
 
+               **filedelete**
+                       Simply deletes files with unlink() and does no I/O to them. You need
+                       to set 'filesize' and 'nrfiles', so that the files will be created.
+                       This engine is used to measure file deletion latency.
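+
+                       An illustrative invocation (parameter values are examples
+                       only)::
+
+                               fio --name=del --ioengine=filedelete --nrfiles=100 --filesize=4k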
+
                **libpmem**
                        Read and write using mmap I/O to a file on a filesystem
                        mounted with DAX on a persistent memory device through the PMDK
@@ -2067,6 +2092,9 @@ I/O engine
                        unless :option:`verify` is set or :option:`cuda_io` is `posix`.
                        :option:`iomem` must not be `cudamalloc`. This ioengine defines
                        engine specific options.
+               **dfs**
+                       I/O engine supporting asynchronous read and write operations to the
+                       DAOS File System (DFS) via libdfs.
 
 I/O engine specific parameters
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2189,7 +2217,7 @@ with the caveat that when used on the command line, they must come after the
                this will be the starting port number since fio will use a range of
                ports.
 
-   [rdma]
+   [rdma], [librpma_*]
 
                The port to use for RDMA-CM communication. This should be the same value
                on the client and the server side.
@@ -2200,6 +2228,15 @@ with the caveat that when used on the command line, they must come after the
        is a TCP listener or UDP reader, the hostname is not used and must be omitted
        unless it is a valid UDP multicast address.
 
+.. option:: serverip=str : [librpma_*]
+
+       The IP address to be used for RDMA-CM based I/O.
+
+.. option:: direct_write_to_pmem=bool : [librpma_*]
+
+       Set to 1 only when Direct Write to PMem from the remote host is possible.
+       Otherwise, set to 0.
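+
+       A hypothetical way to run the example jobs shipped with this change
+       (the IP address is a placeholder)::
+
+               fio examples/librpma_apm-server.fio --serverip=192.168.0.1
+               fio examples/librpma_apm-client.fio --serverip=192.168.0.1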
+
 .. option:: interface=str : [netsplice] [net]
 
        The IP address of the network interface used to send or receive UDP
@@ -2296,6 +2333,12 @@ with the caveat that when used on the command line, they must come after the
         Poll store instead of waiting for completion. Usually this provides better
        throughput at the cost of higher (up to 100%) CPU utilization.
 
+.. option:: touch_objects=bool : [rados]
+
+        During initialization, touch (create if they do not exist) all objects
+        (files). Touching all objects affects Ceph caches and likely impacts
+        test results. Enabled by default.
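+
+        For example, passing ``touch_objects=0`` skips the initial
+        object-creation pass, avoiding the cache effects described above.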
+
 .. option:: skip_bad=bool : [mtd]
 
        Skip operations against known bad blocks.
@@ -2452,6 +2495,24 @@ with the caveat that when used on the command line, they must come after the
                GPU to RAM before a write and copied from RAM to GPU after a
                read. :option:`verify` does not affect use of cudaMemcpy.
 
+.. option:: pool=str : [dfs]
+
+       Specify the UUID of the DAOS pool to connect to.
+
+.. option:: cont=str : [dfs]
+
+       Specify the UUID of the DAOS container to open.
+
+.. option:: chunk_size=int : [dfs]
+
+       Specify a different chunk size (in bytes) for the dfs file.
+       The DAOS container's chunk size is used by default.
+
+.. option:: object_class=str : [dfs]
+
+       Specify a different object class for the dfs file.
+       The DAOS container's object class is used by default.
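+
+       A sketch of a dfs job fragment (the UUIDs and object class below are
+       placeholders; see examples/dfs.fio for a complete job file)::
+
+               ioengine=dfs
+               pool=<pool uuid>
+               cont=<container uuid>
+               chunk_size=1048576
+               object_class=SX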
+
 I/O depth
 ~~~~~~~~~
 
@@ -2663,11 +2724,12 @@ I/O latency
        true, fio will continue running and try to meet :option:`latency_target`
        by adjusting queue depth.
 
-.. option:: max_latency=time
+.. option:: max_latency=time[,time][,time]
 
        If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
        maximum latency. When the unit is omitted, the value is interpreted in
-       microseconds.
+       microseconds. Comma-separated values may be specified for reads, writes,
+       and trims as described in :option:`blocksize`.
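+
+       For example, ``max_latency=250ms,500ms,500ms`` (illustrative values)
+       makes the job exit with ETIMEDOUT once a read exceeds 250ms or a write
+       or trim exceeds 500ms.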
 
 .. option:: rate_cycle=int
 
index 612344d154093f33d165c81746972dae90eebd43..ba027b2e1a7110ee58591a874a61119507da1bf6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ SOURCE :=     $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
                pshared.c options.c \
                smalloc.c filehash.c profile.c debug.c engines/cpu.c \
                engines/mmap.c engines/sync.c engines/null.c engines/net.c \
-               engines/ftruncate.c engines/filecreate.c engines/filestat.c \
+               engines/ftruncate.c engines/filecreate.c engines/filestat.c engines/filedelete.c \
                server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
                gettime-thread.c helpers.c json.c idletime.c td_error.c \
                profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
@@ -94,6 +94,21 @@ ifdef CONFIG_RDMA
   rdma_LIBS = -libverbs -lrdmacm
   ENGINES += rdma
 endif
+ifdef CONFIG_LIBRPMA_APM
+  librpma_apm_SRCS = engines/librpma_apm.c
+  librpma_fio_SRCS = engines/librpma_fio.c
+  librpma_apm_LIBS = -lrpma -lpmem
+  ENGINES += librpma_apm
+endif
+ifdef CONFIG_LIBRPMA_GPSPM
+  librpma_gpspm_SRCS = engines/librpma_gpspm.c engines/librpma_gpspm_flush.pb-c.c
+  librpma_fio_SRCS = engines/librpma_fio.c
+  librpma_gpspm_LIBS = -lrpma -lpmem -lprotobuf-c
+  ENGINES += librpma_gpspm
+endif
+ifdef librpma_fio_SRCS
+  SOURCE += $(librpma_fio_SRCS)
+endif
 ifdef CONFIG_POSIXAIO
   SOURCE += engines/posixaio.c
 endif
@@ -130,6 +145,11 @@ ifdef CONFIG_HTTP
   http_LIBS = -lcurl -lssl -lcrypto
   ENGINES += http
 endif
+ifdef CONFIG_DFS
+  dfs_SRCS = engines/dfs.c
+  dfs_LIBS = -luuid -ldaos -ldfs
+  ENGINES += dfs
+endif
 SOURCE += oslib/asprintf.c
 ifndef CONFIG_STRSEP
   SOURCE += oslib/strsep.c
index f2efddd67d365dc9c4300892275caaa88796a2e1..399c299e14aa6f518502e02cf94f1def7e713403 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -1341,22 +1341,19 @@ int init_io_u_buffers(struct thread_data *td)
        return 0;
 }
 
+#ifdef FIO_HAVE_IOSCHED_SWITCH
 /*
- * This function is Linux specific.
+ * These functions are Linux specific.
  * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux.
  */
-static int switch_ioscheduler(struct thread_data *td)
+static int set_ioscheduler(struct thread_data *td, struct fio_file *file)
 {
-#ifdef FIO_HAVE_IOSCHED_SWITCH
        char tmp[256], tmp2[128], *p;
        FILE *f;
        int ret;
 
-       if (td_ioengine_flagged(td, FIO_DISKLESSIO))
-               return 0;
-
-       assert(td->files && td->files[0]);
-       sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root);
+       assert(file->du && file->du->sysfs_root);
+       sprintf(tmp, "%s/queue/scheduler", file->du->sysfs_root);
 
        f = fopen(tmp, "r+");
        if (!f) {
@@ -1417,11 +1414,55 @@ static int switch_ioscheduler(struct thread_data *td)
 
        fclose(f);
        return 0;
+}
+
+static int switch_ioscheduler(struct thread_data *td)
+{
+       struct fio_file *f;
+       unsigned int i;
+       int ret = 0;
+
+       if (td_ioengine_flagged(td, FIO_DISKLESSIO))
+               return 0;
+
+       assert(td->files && td->files[0]);
+
+       for_each_file(td, f, i) {
+
+               /* Only consider regular files and block device files */
+               switch (f->filetype) {
+               case FIO_TYPE_FILE:
+               case FIO_TYPE_BLOCK:
+                       /*
+                        * Make sure that the device hosting the file could
+                        * be determined.
+                        */
+                       if (!f->du)
+                               continue;
+                       break;
+               case FIO_TYPE_CHAR:
+               case FIO_TYPE_PIPE:
+               default:
+                       continue;
+               }
+
+               ret = set_ioscheduler(td, f);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 #else
+
+static int switch_ioscheduler(struct thread_data *td)
+{
        return 0;
-#endif
 }
 
+#endif /* FIO_HAVE_IOSCHED_SWITCH */
+
 static bool keep_running(struct thread_data *td)
 {
        unsigned long long limit;
@@ -2537,6 +2578,7 @@ int fio_backend(struct sk_out *sk_out)
        for_each_td(td, i) {
                steadystate_free(td);
                fio_options_free(td);
+               fio_dump_options_free(td);
                if (td->rusage_sem) {
                        fio_sem_remove(td->rusage_sem);
                        td->rusage_sem = NULL;
diff --git a/cconv.c b/cconv.c
index b10868fb3de6b2dd0844799ab36fd145ff68448e..aa06e3ea6ee7004cc4b65d99839a83f492766d9c 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -143,6 +143,8 @@ void convert_thread_options_to_cpu(struct thread_options *o,
                o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]);
 
                o->perc_rand[i] = le32_to_cpu(top->perc_rand[i]);
+
+               o->max_latency[i] = le64_to_cpu(top->max_latency[i]);
        }
 
        o->ratecycle = le32_to_cpu(top->ratecycle);
@@ -289,7 +291,6 @@ void convert_thread_options_to_cpu(struct thread_options *o,
        o->sync_file_range = le32_to_cpu(top->sync_file_range);
        o->latency_target = le64_to_cpu(top->latency_target);
        o->latency_window = le64_to_cpu(top->latency_window);
-       o->max_latency = le64_to_cpu(top->max_latency);
        o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i));
        o->latency_run = le32_to_cpu(top->latency_run);
        o->compress_percentage = le32_to_cpu(top->compress_percentage);
@@ -491,7 +492,6 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
        top->sync_file_range = cpu_to_le32(o->sync_file_range);
        top->latency_target = __cpu_to_le64(o->latency_target);
        top->latency_window = __cpu_to_le64(o->latency_window);
-       top->max_latency = __cpu_to_le64(o->max_latency);
        top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
        top->latency_run = __cpu_to_le32(o->latency_run);
        top->compress_percentage = cpu_to_le32(o->compress_percentage);
@@ -550,6 +550,8 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
                top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]);
 
                top->perc_rand[i] = cpu_to_le32(o->perc_rand[i]);
+
+               top->max_latency[i] = __cpu_to_le64(o->max_latency[i]);
        }
 
        memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE);
diff --git a/ci/travis-install-librpma.sh b/ci/travis-install-librpma.sh
new file mode 100755 (executable)
index 0000000..b127f3f
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/bash -e
+
+# 11.02.2021 Merge pull request #866 from ldorau/rpma-mmap-memory-for-rpma_mr_reg-in-rpma_flush_apm_new
+LIBRPMA_VERSION=fbac593917e98f3f26abf14f4fad5a832b330f5c
+ZIP_FILE=rpma.zip
+
+WORKDIR=$(pwd)
+
+# install librpma
+wget -O $ZIP_FILE https://github.com/pmem/rpma/archive/${LIBRPMA_VERSION}.zip
+unzip $ZIP_FILE
+mkdir -p rpma-${LIBRPMA_VERSION}/build
+cd rpma-${LIBRPMA_VERSION}/build
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+       -DCMAKE_INSTALL_PREFIX=/usr \
+       -DBUILD_DOC=OFF \
+       -DBUILD_EXAMPLES=OFF \
+       -DBUILD_TESTS=OFF
+make -j$(nproc)
+sudo make -j$(nproc) install
+cd $WORKDIR
+rm -rf $ZIP_FILE rpma-${LIBRPMA_VERSION}
diff --git a/ci/travis-install-pmdk.sh b/ci/travis-install-pmdk.sh
new file mode 100755 (executable)
index 0000000..803438f
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/bash -e
+
+# pmdk v1.9.1 release
+PMDK_VERSION=1.9.1
+
+WORKDIR=$(pwd)
+
+#
+# The '/bin/sh' shell used by PMDK's 'make install'
+# does not know the exact location of clang
+# and fails with:
+#    /bin/sh: 1: clang: not found
+# if CC is not set to the full path of clang.
+#
+export CC=$(which $CC)
+
+# Install PMDK libraries, because PMDK's libpmem
+# is a dependency of the librpma fio engine.
+# Install it from a release package
+# with already generated documentation,
+# in order to avoid installing 'pandoc'.
+wget https://github.com/pmem/pmdk/releases/download/${PMDK_VERSION}/pmdk-${PMDK_VERSION}.tar.gz
+tar -xzf pmdk-${PMDK_VERSION}.tar.gz
+cd pmdk-${PMDK_VERSION}
+make -j$(nproc) NDCTL_ENABLE=n
+sudo make -j$(nproc) install prefix=/usr NDCTL_ENABLE=n
+cd $WORKDIR
+rm -rf pmdk-${PMDK_VERSION}
index 103695dc6d6391f132d90681a6be4e8009b76f87..4c4c04c5d6ae4a781c52270a0f62fcbc337951e5 100755 (executable)
@@ -43,6 +43,16 @@ case "$TRAVIS_OS_NAME" in
        )
        sudo apt-get -qq update
        sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}"
+       # librpma is supported on the amd64 (x86_64) architecture for now
+       if [[ $CI_TARGET_ARCH == "amd64" ]]; then
+               # install libprotobuf-c-dev required by librpma_gpspm
+               sudo apt-get install --no-install-recommends -qq -y libprotobuf-c-dev
+               # PMDK libraries have to be installed, because
+               # libpmem is a dependency of the librpma fio engine
+               ci/travis-install-pmdk.sh
+               # install librpma from sources from GitHub
+               ci/travis-install-librpma.sh
+       fi
        ;;
     "osx")
        brew update >/dev/null 2>&1
index 748f7014c63541b26ef66dd8b72fcd1a8a27f777..a7d82be06b3c886550a77c34636c539895bc6b16 100755 (executable)
--- a/configure
+++ b/configure
@@ -171,6 +171,7 @@ march_set="no"
 libiscsi="no"
 libnbd="no"
 libzbc=""
+dfs=""
 dynamic_engines="no"
 prefix=/usr/local
 
@@ -242,6 +243,8 @@ for opt do
   ;;
   --dynamic-libengines) dynamic_engines="yes"
   ;;
+  --disable-dfs) dfs="no"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -284,6 +287,7 @@ if test "$show_help" = "yes" ; then
   echo "--disable-libzbc        Disable libzbc even if found"
   echo "--disable-tcmalloc     Disable tcmalloc support"
   echo "--dynamic-libengines   Lib-based ioengines as dynamic libraries"
+  echo "--disable-dfs          Disable DAOS File System support even if found"
   exit $exit_val
 fi
 
@@ -413,6 +417,8 @@ CYGWIN*)
   clock_gettime="yes" # clock_monotonic probe has dependency on this
   clock_monotonic="yes"
   sched_idle="yes"
+  pthread_condattr_setclock="no"
+  pthread_getaffinity="no"
   ;;
 esac
 
@@ -758,10 +764,8 @@ print_config "POSIX pshared support" "$posix_pshared"
 
 ##########################################
 # POSIX pthread_condattr_setclock() probe
-if test "$pthread_condattr_setclock" != "yes" ; then
-  pthread_condattr_setclock="no"
-fi
-cat > $TMPC <<EOF
+if test "$pthread_condattr_setclock" != "no" ; then
+  cat > $TMPC <<EOF
 #include <pthread.h>
 int main(void)
 {
@@ -770,11 +774,12 @@ int main(void)
   return 0;
 }
 EOF
-if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
-  pthread_condattr_setclock=yes
-elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
-  pthread_condattr_setclock=yes
-  LIBS="$LIBS -lpthread"
+  if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
+    pthread_condattr_setclock=yes
+  elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
+    pthread_condattr_setclock=yes
+    LIBS="$LIBS -lpthread"
+  fi
 fi
 print_config "pthread_condattr_setclock()" "$pthread_condattr_setclock"
 
@@ -799,6 +804,29 @@ elif compile_prog "" "$LIBS -lpthread" "pthread_sigmask" ; then
 fi
 print_config "pthread_sigmask()" "$pthread_sigmask"
 
+##########################################
+# pthread_getaffinity_np() probe
+if test "$pthread_getaffinity" != "yes" ; then
+  pthread_getaffinity="no"
+fi
+cat > $TMPC <<EOF
+#include <stddef.h> /* NULL */
+#include <signal.h>
+#include <pthread.h>
+int main(void)
+{
+  cpu_set_t set;
+  return pthread_getaffinity_np(pthread_self(), sizeof(set), &set);
+}
+EOF
+if compile_prog "" "$LIBS" "pthread_getaffinity" ; then
+  pthread_getaffinity="yes"
+elif compile_prog "" "$LIBS -lpthread" "pthread_getaffinity" ; then
+  pthread_getaffinity="yes"
+  LIBS="$LIBS -lpthread"
+fi
+print_config "pthread_getaffinity_np()" "$pthread_getaffinity"
+
 ##########################################
 # solaris aio probe
 if test "$solaris_aio" != "yes" ; then
@@ -920,6 +948,49 @@ if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then
 fi
 print_config "rdmacm" "$rdmacm"
 
+##########################################
+# librpma probe
+if test "$librpma" != "yes" ; then
+  librpma="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <librpma.h>
+int main(int argc, char **argv)
+{
+  enum rpma_conn_event event = RPMA_CONN_REJECTED;
+  (void) event; /* unused */
+  rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+  return 0;
+}
+EOF
+if test "$disable_rdma" != "yes" && compile_prog "" "-lrpma" "rpma"; then
+    librpma="yes"
+fi
+print_config "librpma" "$librpma"
+
+##########################################
+# libprotobuf-c probe
+if test "$libprotobuf_c" != "yes" ; then
+  libprotobuf_c="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <protobuf-c/protobuf-c.h>
+#if !defined(PROTOBUF_C_VERSION_NUMBER)
+# error PROTOBUF_C_VERSION_NUMBER is not defined!
+#endif
+int main(int argc, char **argv)
+{
+  (void)protobuf_c_message_check(NULL);
+  return 0;
+}
+EOF
+if compile_prog "" "-lprotobuf-c" "protobuf_c"; then
+    libprotobuf_c="yes"
+fi
+print_config "libprotobuf_c" "$libprotobuf_c"
+
 ##########################################
 # asprintf() and vasprintf() probes
 if test "$have_asprintf" != "yes" ; then
@@ -2179,6 +2250,33 @@ if test "$libnbd" != "no" ; then
 fi
 print_config "NBD engine" "$libnbd"
 
+##########################################
+# check for dfs (DAOS File System)
+if test "$dfs" != "no" ; then
+  cat > $TMPC << EOF
+#include <fcntl.h>
+#include <daos.h>
+#include <daos_fs.h>
+
+int main(int argc, char **argv)
+{
+  daos_handle_t        poh;
+  daos_handle_t        coh;
+  dfs_t                *dfs;
+
+  (void) dfs_mount(poh, coh, O_RDWR, &dfs);
+
+  return 0;
+}
+EOF
+  if compile_prog "" "-luuid -ldfs -ldaos" "dfs"; then
+    dfs="yes"
+  else
+    dfs="no"
+  fi
+fi
+print_config "DAOS File System (dfs) Engine" "$dfs"
+
 ##########################################
 # Check if we have lex/yacc available
 yacc="no"
@@ -2749,6 +2847,9 @@ fi
 if test "$pthread_sigmask" = "yes" ; then
   output_sym "CONFIG_PTHREAD_SIGMASK"
 fi
+if test "$pthread_getaffinity" = "yes" ; then
+  output_sym "CONFIG_PTHREAD_GETAFFINITY"
+fi
 if test "$have_asprintf" = "yes" ; then
     output_sym "CONFIG_HAVE_ASPRINTF"
 fi
@@ -2788,18 +2889,21 @@ fi
 if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then
   output_sym "CONFIG_RDMA"
 fi
+# librpma is supported on the 'x86_64' architecture for now
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+    -a "$librpma" = "yes" -a "$libpmem" = "yes" ; then
+  output_sym "CONFIG_LIBRPMA_APM"
+fi
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+    -a "$librpma" = "yes" -a "$libpmem" = "yes" -a "$libprotobuf_c" = "yes" ; then
+  output_sym "CONFIG_LIBRPMA_GPSPM"
+fi
 if test "$clock_gettime" = "yes" ; then
   output_sym "CONFIG_CLOCK_GETTIME"
 fi
 if test "$clock_monotonic" = "yes" ; then
   output_sym "CONFIG_CLOCK_MONOTONIC"
 fi
-if test "$clock_monotonic_raw" = "yes" ; then
-  output_sym "CONFIG_CLOCK_MONOTONIC_RAW"
-fi
-if test "$clock_monotonic_precise" = "yes" ; then
-  output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE"
-fi
 if test "$clockid_t" = "yes"; then
   output_sym "CONFIG_CLOCKID_T"
 fi
@@ -2994,6 +3098,9 @@ fi
 if test "$libcufile" = "yes" ; then
   output_sym "CONFIG_LIBCUFILE"
 fi
+if test "$dfs" = "yes" ; then
+  output_sym "CONFIG_DFS"
+fi
 if test "$march_set" = "no" && test "$build_native" = "yes" ; then
   output_sym "CONFIG_BUILD_NATIVE"
 fi
diff --git a/engines/dfs.c b/engines/dfs.c
new file mode 100644 (file)
index 0000000..0343b10
--- /dev/null
@@ -0,0 +1,583 @@
+/**
+ * FIO engine for DAOS File System (dfs).
+ *
+ * (C) Copyright 2020-2021 Intel Corporation.
+ */
+
+#include <fio.h>
+#include <optgroup.h>
+
+#include <daos.h>
+#include <daos_fs.h>
+
+static bool            daos_initialized;
+static int             num_threads;
+static pthread_mutex_t daos_mutex = PTHREAD_MUTEX_INITIALIZER;
+daos_handle_t          poh;  /* pool handle */
+daos_handle_t          coh;  /* container handle */
+daos_oclass_id_t       cid = OC_UNKNOWN;  /* object class */
+dfs_t                  *dfs; /* dfs mount reference */
+
+struct daos_iou {
+       struct io_u     *io_u;
+       daos_event_t    ev;
+       d_sg_list_t     sgl;
+       d_iov_t         iov;
+       daos_size_t     size;
+       bool            complete;
+};
+
+struct daos_data {
+       daos_handle_t   eqh;
+       dfs_obj_t       *obj;
+       struct io_u     **io_us;
+       int             queued;
+       int             num_ios;
+};
+
+struct daos_fio_options {
+       void            *pad;
+       char            *pool;   /* Pool UUID */
+       char            *cont;   /* Container UUID */
+       daos_size_t     chsz;    /* Chunk size */
+       char            *oclass; /* object class */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+       char            *svcl;   /* service replica list, deprecated */
+#endif
+};
+
+static struct fio_option options[] = {
+       {
+               .name           = "pool",
+               .lname          = "pool uuid",
+               .type           = FIO_OPT_STR_STORE,
+               .off1           = offsetof(struct daos_fio_options, pool),
+               .help           = "DAOS pool uuid",
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_DFS,
+       },
+       {
+               .name           = "cont",
+               .lname          = "container uuid",
+               .type           = FIO_OPT_STR_STORE,
+               .off1           = offsetof(struct daos_fio_options, cont),
+               .help           = "DAOS container uuid",
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_DFS,
+       },
+       {
+               .name           = "chunk_size",
+               .lname          = "DFS chunk size",
+               .type           = FIO_OPT_ULL,
+               .off1           = offsetof(struct daos_fio_options, chsz),
+               .help           = "DFS chunk size in bytes",
+               .def            = "0", /* use container default */
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_DFS,
+       },
+       {
+               .name           = "object_class",
+               .lname          = "object class",
+               .type           = FIO_OPT_STR_STORE,
+               .off1           = offsetof(struct daos_fio_options, oclass),
+               .help           = "DAOS object class",
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_DFS,
+       },
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+       {
+               .name           = "svcl",
+               .lname          = "List of service ranks",
+               .type           = FIO_OPT_STR_STORE,
+               .off1           = offsetof(struct daos_fio_options, svcl),
+               .help           = "List of pool replicated service ranks",
+               .category       = FIO_OPT_C_ENGINE,
+               .group          = FIO_OPT_G_DFS,
+       },
+#endif
+       {
+               .name           = NULL,
+       },
+};
+
+static int daos_fio_global_init(struct thread_data *td)
+{
+       struct daos_fio_options *eo = td->eo;
+       uuid_t                  pool_uuid, co_uuid;
+       daos_pool_info_t        pool_info;
+       daos_cont_info_t        co_info;
+       int                     rc = 0;
+
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+       if (!eo->pool || !eo->cont || !eo->svcl) {
+#else
+       if (!eo->pool || !eo->cont) {
+#endif
+               log_err("Missing required DAOS options\n");
+               return EINVAL;
+       }
+
+       rc = daos_init();
+       if (rc != -DER_ALREADY && rc) {
+               log_err("Failed to initialize daos %d\n", rc);
+               td_verror(td, rc, "daos_init");
+               return rc;
+       }
+
+       rc = uuid_parse(eo->pool, pool_uuid);
+       if (rc) {
+               log_err("Failed to parse 'Pool uuid': %s\n", eo->pool);
+               td_verror(td, EINVAL, "uuid_parse(eo->pool)");
+               return EINVAL;
+       }
+
+       rc = uuid_parse(eo->cont, co_uuid);
+       if (rc) {
+               log_err("Failed to parse 'Cont uuid': %s\n", eo->cont);
+               td_verror(td, EINVAL, "uuid_parse(eo->cont)");
+               return EINVAL;
+       }
+
+       /* Connect to the DAOS pool */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+       d_rank_list_t *svcl = NULL;
+
+       svcl = daos_rank_list_parse(eo->svcl, ":");
+       if (svcl == NULL) {
+               log_err("Failed to parse svcl\n");
+               td_verror(td, EINVAL, "daos_rank_list_parse");
+               return EINVAL;
+       }
+
+       rc = daos_pool_connect(pool_uuid, NULL, svcl, DAOS_PC_RW,
+                       &poh, &pool_info, NULL);
+       d_rank_list_free(svcl);
+#else
+       rc = daos_pool_connect(pool_uuid, NULL, DAOS_PC_RW, &poh, &pool_info,
+                              NULL);
+#endif
+       if (rc) {
+               log_err("Failed to connect to pool %d\n", rc);
+               td_verror(td, rc, "daos_pool_connect");
+               return rc;
+       }
+
+       /* Open the DAOS container */
+       rc = daos_cont_open(poh, co_uuid, DAOS_COO_RW, &coh, &co_info, NULL);
+       if (rc) {
+               log_err("Failed to open container: %d\n", rc);
+               td_verror(td, rc, "daos_cont_open");
+               (void)daos_pool_disconnect(poh, NULL);
+               return rc;
+       }
+
+       /* Mount encapsulated filesystem */
+       rc = dfs_mount(poh, coh, O_RDWR, &dfs);
+       if (rc) {
+               log_err("Failed to mount DFS namespace: %d\n", rc);
+               td_verror(td, rc, "dfs_mount");
+               (void)daos_pool_disconnect(poh, NULL);
+               (void)daos_cont_close(coh, NULL);
+               return rc;
+       }
+
+       /* Retrieve object class to use, if specified */
+       if (eo->oclass)
+               cid = daos_oclass_name2id(eo->oclass);
+
+       return 0;
+}
+
+static int daos_fio_global_cleanup(void)
+{
+       int rc;
+       int ret = 0;
+
+       rc = dfs_umount(dfs);
+       if (rc) {
+               log_err("failed to umount dfs: %d\n", rc);
+               ret = rc;
+       }
+       rc = daos_cont_close(coh, NULL);
+       if (rc) {
+               log_err("failed to close container: %d\n", rc);
+               if (ret == 0)
+                       ret = rc;
+       }
+       rc = daos_pool_disconnect(poh, NULL);
+       if (rc) {
+               log_err("failed to disconnect pool: %d\n", rc);
+               if (ret == 0)
+                       ret = rc;
+       }
+       rc = daos_fini();
+       if (rc) {
+               log_err("failed to finalize daos: %d\n", rc);
+               if (ret == 0)
+                       ret = rc;
+       }
+
+       return ret;
+}
+
+static int daos_fio_setup(struct thread_data *td)
+{
+       return 0;
+}
+
+static int daos_fio_init(struct thread_data *td)
+{
+       struct daos_data        *dd;
+       int                     rc = 0;
+
+       pthread_mutex_lock(&daos_mutex);
+
+       dd = malloc(sizeof(*dd));
+       if (dd == NULL) {
+               log_err("Failed to allocate DAOS-private data\n");
+               rc = ENOMEM;
+               goto out;
+       }
+
+       dd->queued      = 0;
+       dd->num_ios     = td->o.iodepth;
+       dd->io_us       = calloc(dd->num_ios, sizeof(struct io_u *));
+       if (dd->io_us == NULL) {
+               log_err("Failed to allocate IO queue\n");
+               rc = ENOMEM;
+               goto out;
+       }
+
+       /* initialize DAOS stack if not already up */
+       if (!daos_initialized) {
+               rc = daos_fio_global_init(td);
+               if (rc)
+                       goto out;
+               daos_initialized = true;
+       }
+
+       rc = daos_eq_create(&dd->eqh);
+       if (rc) {
+               log_err("Failed to create event queue: %d\n", rc);
+               td_verror(td, rc, "daos_eq_create");
+               goto out;
+       }
+
+       td->io_ops_data = dd;
+       num_threads++;
+out:
+       if (rc) {
+               if (dd) {
+                       free(dd->io_us);
+                       free(dd);
+               }
+               if (num_threads == 0 && daos_initialized) {
+                       /* don't clobber error return value */
+                       (void)daos_fio_global_cleanup();
+                       daos_initialized = false;
+               }
+       }
+       pthread_mutex_unlock(&daos_mutex);
+       return rc;
+}
+
+static void daos_fio_cleanup(struct thread_data *td)
+{
+       struct daos_data        *dd = td->io_ops_data;
+       int                     rc;
+
+       if (dd == NULL)
+               return;
+
+       rc = daos_eq_destroy(dd->eqh, DAOS_EQ_DESTROY_FORCE);
+       if (rc < 0) {
+               log_err("failed to destroy event queue: %d\n", rc);
+               td_verror(td, rc, "daos_eq_destroy");
+       }
+
+       free(dd->io_us);
+       free(dd);
+
+       pthread_mutex_lock(&daos_mutex);
+       num_threads--;
+       if (daos_initialized && num_threads == 0) {
+               int ret;
+
+               ret = daos_fio_global_cleanup();
+               if (ret < 0 && rc == 0) {
+                       log_err("failed to clean up: %d\n", ret);
+                       td_verror(td, ret, "daos_fio_global_cleanup");
+               }
+               daos_initialized = false;
+       }
+       pthread_mutex_unlock(&daos_mutex);
+}
+
+static int daos_fio_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+       char            *file_name = f->file_name;
+       struct stat     stbuf = {0};
+       int             rc;
+
+       dprint(FD_FILE, "dfs stat %s\n", f->file_name);
+
+       if (!daos_initialized)
+               return 0;
+
+       rc = dfs_stat(dfs, NULL, file_name, &stbuf);
+       if (rc) {
+               log_err("Failed to stat %s: %d\n", f->file_name, rc);
+               td_verror(td, rc, "dfs_stat");
+               return rc;
+       }
+
+       f->real_file_size = stbuf.st_size;
+       return 0;
+}
+
+static int daos_fio_close(struct thread_data *td, struct fio_file *f)
+{
+       struct daos_data        *dd = td->io_ops_data;
+       int                     rc;
+
+       dprint(FD_FILE, "dfs release %s\n", f->file_name);
+
+       rc = dfs_release(dd->obj);
+       if (rc) {
+               log_err("Failed to release %s: %d\n", f->file_name, rc);
+               td_verror(td, rc, "dfs_release");
+               return rc;
+       }
+
+       return 0;
+}
+
+static int daos_fio_open(struct thread_data *td, struct fio_file *f)
+{
+       struct daos_data        *dd = td->io_ops_data;
+       struct daos_fio_options *eo = td->eo;
+       int                     flags = 0;
+       int                     rc;
+
+       dprint(FD_FILE, "dfs open %s (%s/%d/%d)\n",
+              f->file_name, td_write(td) && !read_only ? "rw" : "r",
+              td->o.create_on_open, td->o.allow_create);
+
+       if (td->o.create_on_open && td->o.allow_create)
+               flags |= O_CREAT;
+
+       if (td_write(td)) {
+               if (!read_only)
+                       flags |= O_RDWR;
+               if (td->o.allow_create)
+                       flags |= O_CREAT;
+       } else if (td_read(td)) {
+               flags |= O_RDONLY;
+       }
+
+       rc = dfs_open(dfs, NULL, f->file_name,
+                     S_IFREG | S_IRUSR | S_IWUSR,
+                     flags, cid, eo->chsz, NULL, &dd->obj);
+       if (rc) {
+               log_err("Failed to open %s: %d\n", f->file_name, rc);
+               td_verror(td, rc, "dfs_open");
+               return rc;
+       }
+
+       return 0;
+}
+
+static int daos_fio_unlink(struct thread_data *td, struct fio_file *f)
+{
+       int rc;
+
+       dprint(FD_FILE, "dfs remove %s\n", f->file_name);
+
+       rc = dfs_remove(dfs, NULL, f->file_name, false, NULL);
+       if (rc) {
+               log_err("Failed to remove %s: %d\n", f->file_name, rc);
+               td_verror(td, rc, "dfs_remove");
+               return rc;
+       }
+
+       return 0;
+}
+
+static int daos_fio_invalidate(struct thread_data *td, struct fio_file *f)
+{
+       dprint(FD_FILE, "dfs invalidate %s\n", f->file_name);
+       return 0;
+}
+
+static void daos_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+       struct daos_iou *io = io_u->engine_data;
+
+       if (io) {
+               io_u->engine_data = NULL;
+               free(io);
+       }
+}
+
+static int daos_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+       struct daos_iou *io;
+
+       io = malloc(sizeof(struct daos_iou));
+       if (!io) {
+               td_verror(td, ENOMEM, "malloc");
+               return ENOMEM;
+       }
+       io->io_u = io_u;
+       io_u->engine_data = io;
+       return 0;
+}
+
+static struct io_u * daos_fio_event(struct thread_data *td, int event)
+{
+       struct daos_data *dd = td->io_ops_data;
+
+       return dd->io_us[event];
+}
+
+static int daos_fio_getevents(struct thread_data *td, unsigned int min,
+                             unsigned int max, const struct timespec *t)
+{
+       struct daos_data        *dd = td->io_ops_data;
+       daos_event_t            *evp[max];
+       unsigned int            events = 0;
+       int                     i;
+       int                     rc;
+
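+       /* poll the event queue until at least "min" I/Os have completed */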
+       while (events < min) {
+               rc = daos_eq_poll(dd->eqh, 0, DAOS_EQ_NOWAIT, max, evp);
+               if (rc < 0) {
+                       log_err("Event poll failed: %d\n", rc);
+                       td_verror(td, rc, "daos_eq_poll");
+                       return events;
+               }
+
+               for (i = 0; i < rc; i++) {
+                       struct daos_iou *io;
+                       struct io_u     *io_u;
+
+                       io = container_of(evp[i], struct daos_iou, ev);
+                       if (io->complete)
+                               log_err("Completion on already completed I/O\n");
+
+                       io_u = io->io_u;
+                       if (io->ev.ev_error)
+                               io_u->error = io->ev.ev_error;
+                       else
+                               io_u->resid = 0;
+
+                       dd->io_us[events] = io_u;
+                       dd->queued--;
+                       daos_event_fini(&io->ev);
+                       io->complete = true;
+                       events++;
+               }
+       }
+
+       dprint(FD_IO, "dfs eq_poll returning %d (%u/%u)\n", events, min, max);
+
+       return events;
+}
+
+static enum fio_q_status daos_fio_queue(struct thread_data *td,
+                                       struct io_u *io_u)
+{
+       struct daos_data        *dd = td->io_ops_data;
+       struct daos_iou         *io = io_u->engine_data;
+       daos_off_t              offset = io_u->offset;
+       int                     rc;
+
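+       /* the engine can have at most iodepth I/Os in flight */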
+       if (dd->queued == td->o.iodepth)
+               return FIO_Q_BUSY;
+
+       io->sgl.sg_nr = 1;
+       io->sgl.sg_nr_out = 0;
+       d_iov_set(&io->iov, io_u->xfer_buf, io_u->xfer_buflen);
+       io->sgl.sg_iovs = &io->iov;
+       io->size = io_u->xfer_buflen;
+
+       io->complete = false;
+       rc = daos_event_init(&io->ev, dd->eqh, NULL);
+       if (rc) {
+               log_err("Event init failed: %d\n", rc);
+               io_u->error = rc;
+               return FIO_Q_COMPLETED;
+       }
+
+       switch (io_u->ddir) {
+       case DDIR_WRITE:
+               rc = dfs_write(dfs, dd->obj, &io->sgl, offset, &io->ev);
+               if (rc) {
+                       log_err("dfs_write failed: %d\n", rc);
+                       io_u->error = rc;
+                       return FIO_Q_COMPLETED;
+               }
+               break;
+       case DDIR_READ:
+               rc = dfs_read(dfs, dd->obj, &io->sgl, offset, &io->size,
+                             &io->ev);
+               if (rc) {
+                       log_err("dfs_read failed: %d\n", rc);
+                       io_u->error = rc;
+                       return FIO_Q_COMPLETED;
+               }
+               break;
+       case DDIR_SYNC:
+               io_u->error = 0;
+               return FIO_Q_COMPLETED;
+       default:
+               dprint(FD_IO, "Invalid IO type: %d\n", io_u->ddir);
+               io_u->error = -DER_INVAL;
+               return FIO_Q_COMPLETED;
+       }
+
+       dd->queued++;
+       return FIO_Q_QUEUED;
+}
+
+static int daos_fio_prep(struct thread_data fio_unused *td, struct io_u *io_u)
+{
+       return 0;
+}
+
+/* ioengine_ops for get_ioengine() */
+FIO_STATIC struct ioengine_ops ioengine = {
+       .name                   = "dfs",
+       .version                = FIO_IOOPS_VERSION,
+       .flags                  = FIO_DISKLESSIO | FIO_NODISKUTIL,
+
+       .setup                  = daos_fio_setup,
+       .init                   = daos_fio_init,
+       .prep                   = daos_fio_prep,
+       .cleanup                = daos_fio_cleanup,
+
+       .open_file              = daos_fio_open,
+       .invalidate             = daos_fio_invalidate,
+       .get_file_size          = daos_fio_get_file_size,
+       .close_file             = daos_fio_close,
+       .unlink_file            = daos_fio_unlink,
+
+       .queue                  = daos_fio_queue,
+       .getevents              = daos_fio_getevents,
+       .event                  = daos_fio_event,
+       .io_u_init              = daos_fio_io_u_init,
+       .io_u_free              = daos_fio_io_u_free,
+
+       .option_struct_size     = sizeof(struct daos_fio_options),
+       .options                = options,
+};
+
+static void fio_init fio_dfs_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_dfs_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
index 6382569b9a92c42c1ad5d76a75d9441808de6cc8..4b05ed68fb467263ddbebabe7e79dd5231f92b63 100644 (file)
@@ -25,8 +25,8 @@ static int open_file(struct thread_data *td, struct fio_file *f)
 
        dprint(FD_FILE, "fd open %s\n", f->file_name);
 
-       if (f->filetype != FIO_TYPE_FILE) {
-               log_err("fio: only files are supported fallocate \n");
+       if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK) {
+               log_err("fio: fallocate only supports files and block devices\n");
                return 1;
        }
        if (!strcmp(f->file_name, "-")) {
index 5fec8544a6a79898fed4e6c4854ffd416234c66a..16c64928162654ad76c6697958b846eb5bb0d985 100644 (file)
@@ -22,7 +22,7 @@ static int open_file(struct thread_data *td, struct fio_file *f)
        dprint(FD_FILE, "fd open %s\n", f->file_name);
 
        if (f->filetype != FIO_TYPE_FILE) {
-               log_err("fio: only files are supported fallocate \n");
+               log_err("fio: only files are supported\n");
                return 1;
        }
        if (!strcmp(f->file_name, "-")) {
diff --git a/engines/filedelete.c b/engines/filedelete.c
new file mode 100644 (file)
index 0000000..64c5863
--- /dev/null
@@ -0,0 +1,115 @@
+/*
+ * file delete engine
+ *
+ * IO engine that doesn't do any IO, just delete files and track the latency
+ * of the file deletion.
+ */
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../fio.h"
+
+struct fc_data {
+       enum fio_ddir stat_ddir;
+};
+
+static int delete_file(struct thread_data *td, struct fio_file *f)
+{
+       struct timespec start;
+       int do_lat = !td->o.disable_lat;
+       int ret;
+
+       dprint(FD_FILE, "fd delete %s\n", f->file_name);
+
+       if (f->filetype != FIO_TYPE_FILE) {
+               log_err("fio: only files are supported\n");
+               return 1;
+       }
+       if (!strcmp(f->file_name, "-")) {
+               log_err("fio: can't read/write to stdin/out\n");
+               return 1;
+       }
+
+       if (do_lat)
+               fio_gettime(&start, NULL);
+
+       ret = unlink(f->file_name);
+
+       if (ret == -1) {
+               char buf[FIO_VERROR_SIZE];
+               int e = errno;
+
+               snprintf(buf, sizeof(buf), "delete(%s)", f->file_name);
+               td_verror(td, e, buf);
+               return 1;
+       }
+
+       if (do_lat) {
+               struct fc_data *data = td->io_ops_data;
+               uint64_t nsec;
+
+               nsec = ntime_since_now(&start);
+               add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
+       }
+
+       return 0;
+}
+
+static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
+{
+       return FIO_Q_COMPLETED;
+}
+
+static int init(struct thread_data *td)
+{
+       struct fc_data *data;
+
+       data = calloc(1, sizeof(*data));
+
+       if (td_read(td))
+               data->stat_ddir = DDIR_READ;
+       else if (td_write(td))
+               data->stat_ddir = DDIR_WRITE;
+
+       td->io_ops_data = data;
+       return 0;
+}
+
+static int delete_invalidate(struct thread_data *td, struct fio_file *f)
+{
+       /* do nothing: the file was never opened */
+       return 0;
+}
+
+static void cleanup(struct thread_data *td)
+{
+       struct fc_data *data = td->io_ops_data;
+
+       free(data);
+}
+
+static struct ioengine_ops ioengine = {
+       .name           = "filedelete",
+       .version        = FIO_IOOPS_VERSION,
+       .init           = init,
+       .invalidate     = delete_invalidate,
+       .cleanup        = cleanup,
+       .queue          = queue_io,
+       .get_file_size  = generic_get_file_size,
+       .open_file      = delete_file,
+       .flags          = FIO_SYNCIO | FIO_FAKEIO |
+                               FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+static void fio_init fio_filedelete_register(void)
+{
+       register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_filedelete_unregister(void)
+{
+       unregister_ioengine(&ioengine);
+}
index c9036ba079b808b65f15a7d3ea920424018471ca..b962e8041b6f8d113669b4b2a31224a68d19aa0f 100644 (file)
@@ -696,11 +696,11 @@ static int fio_ioring_post_init(struct thread_data *td)
 
        err = fio_ioring_queue_init(td);
        if (err) {
-               int __errno = errno;
+               int init_err = errno;
 
-               if (__errno == ENOSYS)
+               if (init_err == ENOSYS)
                        log_err("fio: your kernel doesn't support io_uring\n");
-               td_verror(td, __errno, "io_queue_init");
+               td_verror(td, init_err, "io_queue_init");
                return 1;
        }
 
diff --git a/engines/librpma_apm.c b/engines/librpma_apm.c
new file mode 100644 (file)
index 0000000..ffa3769
--- /dev/null
@@ -0,0 +1,256 @@
+/*
+ * librpma_apm: IO engine that uses PMDK librpma to read and write data,
+ *             based on Appliance Persistency Method
+ *
+ * Copyright 2020-2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+/* client side implementation */
+
+static inline int client_io_flush(struct thread_data *td,
+               struct io_u *first_io_u, struct io_u *last_io_u,
+               unsigned long long int len);
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+               unsigned int *io_u_index);
+
+static int client_init(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd;
+       unsigned int sq_size;
+       uint32_t cq_size;
+       struct rpma_conn_cfg *cfg = NULL;
+       struct rpma_peer_cfg *pcfg = NULL;
+       int ret;
+
+       /* readwrite = trim / randtrim / trimwrite is not supported */
+       if (td_trim(td)) {
+               td_verror(td, EINVAL, "Not supported mode.");
+               return -1;
+       }
+
+       /*
+        * Calculate the required queue sizes where:
+        * - the send queue (SQ) has to be big enough to accommodate
+        *   all io_us (WRITEs) and all flush requests (FLUSHes)
+        * - the completion queue (CQ) has to be big enough to accommodate all
+        *   success and error completions (cq_size = sq_size)
+        */
+       if (td_random(td) || td_rw(td)) {
+               /*
+                * sq_size = max(rand_read_sq_size, rand_write_sq_size)
+                * where rand_read_sq_size < rand_write_sq_size because read
+                * does not require flush afterwards
+                * rand_write_sq_size = N * (WRITE + FLUSH)
+                *
+                * Note: rw is no different from random write since having
+                * interleaved reads with writes in extreme forces you to flush
+                * as often as when the writes are random.
+                */
+               sq_size = 2 * td->o.iodepth;
+       } else if (td_write(td)) {
+               /* sequential TD_DDIR_WRITE only */
+               if (td->o.sync_io) {
+                       sq_size = 2; /* WRITE + FLUSH */
+               } else {
+                       /*
+                        * N * WRITE + B * FLUSH where:
+                        * - B == ceil(iodepth / iodepth_batch)
+                        *   which is the number of batches for N writes
+                        */
+                       sq_size = td->o.iodepth + LIBRPMA_FIO_CEIL(td->o.iodepth,
+                                       td->o.iodepth_batch);
+               }
+       } else {
+               /* TD_DDIR_READ only */
+               if (td->o.sync_io) {
+                       sq_size = 1; /* READ */
+               } else {
+                       sq_size = td->o.iodepth; /* N x READ */
+               }
+       }
+       cq_size = sq_size;
+
+       /* create a connection configuration object */
+       if ((ret = rpma_conn_cfg_new(&cfg))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+               return -1;
+       }
+
+       /* apply queue sizes */
+       if ((ret = rpma_conn_cfg_set_sq_size(cfg, sq_size))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+               goto err_cfg_delete;
+       }
+       if ((ret = rpma_conn_cfg_set_cq_size(cfg, cq_size))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+               goto err_cfg_delete;
+       }
+
+       if (librpma_fio_client_init(td, cfg))
+               goto err_cfg_delete;
+
+       ccd = td->io_ops_data;
+
+       if (ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT) {
+               if (!ccd->ws->direct_write_to_pmem) {
+                       if (td->thread_number == 1)
+                               log_err(
+                                       "Fio librpma engine will not work until the Direct Write to PMem on the server side is possible (direct_write_to_pmem)\n");
+                       goto err_cleanup_common;
+               }
+
+               /* configure peer's direct write to pmem support */
+               if ((ret = rpma_peer_cfg_new(&pcfg))) {
+                       librpma_td_verror(td, ret, "rpma_peer_cfg_new");
+                       goto err_cleanup_common;
+               }
+
+               if ((ret = rpma_peer_cfg_set_direct_write_to_pmem(pcfg, true))) {
+                       librpma_td_verror(td, ret,
+                               "rpma_peer_cfg_set_direct_write_to_pmem");
+                       (void) rpma_peer_cfg_delete(&pcfg);
+                       goto err_cleanup_common;
+               }
+
+               if ((ret = rpma_conn_apply_remote_peer_cfg(ccd->conn, pcfg))) {
+                       librpma_td_verror(td, ret,
+                               "rpma_conn_apply_remote_peer_cfg");
+                       (void) rpma_peer_cfg_delete(&pcfg);
+                       goto err_cleanup_common;
+               }
+
+               (void) rpma_peer_cfg_delete(&pcfg);
+       } else if (td->thread_number == 1) {
+               /* XXX log_info mixes with the JSON output */
+               log_err(
+                       "Note: Direct Write to PMem is not supported by default, nor is it required if you use DRAM instead of PMem on the server side (direct_write_to_pmem).\n"
+                       "Remember that flushing to DRAM does not make your data persistent and may be used only for experimental purposes.\n");
+       }
+
+       if ((ret = rpma_conn_cfg_delete(&cfg))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+               /* non fatal error - continue */
+       }
+
+       ccd->flush = client_io_flush;
+       ccd->get_io_u_index = client_get_io_u_index;
+
+       return 0;
+
+err_cleanup_common:
+       librpma_fio_client_cleanup(td);
+
+err_cfg_delete:
+       (void) rpma_conn_cfg_delete(&cfg);
+
+       return -1;
+}
+
+static void client_cleanup(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+       if (ccd == NULL)
+               return;
+
+       free(ccd->client_data);
+
+       librpma_fio_client_cleanup(td);
+}
+
+static inline int client_io_flush(struct thread_data *td,
+               struct io_u *first_io_u, struct io_u *last_io_u,
+               unsigned long long int len)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       size_t dst_offset = first_io_u->offset;
+       int ret;
+
+       if ((ret = rpma_flush(ccd->conn, ccd->server_mr, dst_offset, len,
+                       ccd->server_mr_flush_type, RPMA_F_COMPLETION_ALWAYS,
+                       (void *)(uintptr_t)last_io_u->index))) {
+               librpma_td_verror(td, ret, "rpma_flush");
+               return -1;
+       }
+
+       return 0;
+}
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+               unsigned int *io_u_index)
+{
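+       /*
+        * The io_u index was stashed in the completion's op_context when the
+        * operation was posted (see client_io_flush() above).
+        */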
+       memcpy(io_u_index, &cmpl->op_context, sizeof(*io_u_index));
+
+       return 1;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_client = {
+       .name                   = "librpma_apm_client",
+       .version                = FIO_IOOPS_VERSION,
+       .init                   = client_init,
+       .post_init              = librpma_fio_client_post_init,
+       .get_file_size          = librpma_fio_client_get_file_size,
+       .open_file              = librpma_fio_file_nop,
+       .queue                  = librpma_fio_client_queue,
+       .commit                 = librpma_fio_client_commit,
+       .getevents              = librpma_fio_client_getevents,
+       .event                  = librpma_fio_client_event,
+       .errdetails             = librpma_fio_client_errdetails,
+       .close_file             = librpma_fio_file_nop,
+       .cleanup                = client_cleanup,
+       .flags                  = FIO_DISKLESSIO,
+       .options                = librpma_fio_options,
+       .option_struct_size     = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+       return librpma_fio_server_open_file(td, f, NULL);
+}
+
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+       return FIO_Q_COMPLETED;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_server = {
+       .name                   = "librpma_apm_server",
+       .version                = FIO_IOOPS_VERSION,
+       .init                   = librpma_fio_server_init,
+       .open_file              = server_open_file,
+       .close_file             = librpma_fio_server_close_file,
+       .queue                  = server_queue,
+       .invalidate             = librpma_fio_file_nop,
+       .cleanup                = librpma_fio_server_cleanup,
+       .flags                  = FIO_SYNCIO,
+       .options                = librpma_fio_options,
+       .option_struct_size     = sizeof(struct librpma_fio_options_values),
+};
+
+/* register both engines */
+
+static void fio_init fio_librpma_apm_register(void)
+{
+       register_ioengine(&ioengine_client);
+       register_ioengine(&ioengine_server);
+}
+
+static void fio_exit fio_librpma_apm_unregister(void)
+{
+       unregister_ioengine(&ioengine_client);
+       unregister_ioengine(&ioengine_server);
+}
diff --git a/engines/librpma_fio.c b/engines/librpma_fio.c
new file mode 100644 (file)
index 0000000..810b55e
--- /dev/null
@@ -0,0 +1,1051 @@
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common part.
+ *
+ * Copyright 2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+#include <libpmem.h>
+
+struct fio_option librpma_fio_options[] = {
+       {
+               .name   = "serverip",
+               .lname  = "rpma_server_ip",
+               .type   = FIO_OPT_STR_STORE,
+               .off1   = offsetof(struct librpma_fio_options_values, server_ip),
+               .help   = "IP address the server is listening on",
+               .def    = "",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBRPMA,
+       },
+       {
+               .name   = "port",
+               .lname  = "rpma_server_port",
+               .type   = FIO_OPT_STR_STORE,
+               .off1   = offsetof(struct librpma_fio_options_values, port),
+               .help   = "port the server is listening on",
+               .def    = "7204",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBRPMA,
+       },
+       {
+               .name   = "direct_write_to_pmem",
+               .lname  = "Direct Write to PMem (via RDMA) from the remote host is possible",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct librpma_fio_options_values,
+                                       direct_write_to_pmem),
+               .help   = "Set to true ONLY when Direct Write to PMem from the remote host is possible (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)",
+               .def    = "",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_LIBRPMA,
+       },
+       {
+               .name   = NULL,
+       },
+};
+
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+               char *port_out)
+{
+       unsigned long int port_ul = strtoul(port_base_str, NULL, 10);
+       unsigned int port_new;
+
+       port_out[0] = '\0';
+
+       if (port_ul == ULONG_MAX) {
+               td_verror(td, errno, "strtoul");
+               return -1;
+       }
+       port_ul += td->thread_number - 1;
+       if (port_ul >= UINT_MAX) {
+               log_err("[%u] port number (%lu) bigger than UINT_MAX\n",
+                       td->thread_number, port_ul);
+               return -1;
+       }
+
+       port_new = port_ul;
+       snprintf(port_out, LIBRPMA_FIO_PORT_STR_LEN_MAX - 1, "%u", port_new);
+
+       return 0;
+}
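+
+/*
+ * For illustration: with the default port=7204, fio thread #1 derives
+ * "7204", thread #2 "7205", and so on, so each thread talks to (or
+ * listens on) its own port.
+ */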
+
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+       struct librpma_fio_mem *mem)
+{
+       char *mem_ptr = NULL;
+       int ret;
+
+       if ((ret = posix_memalign((void **)&mem_ptr, page_size, size))) {
+               log_err("fio: posix_memalign() failed\n");
+               td_verror(td, ret, "posix_memalign");
+               return NULL;
+       }
+
+       mem->mem_ptr = mem_ptr;
+       mem->size_mmap = 0;
+
+       return mem_ptr;
+}
+
+char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+               size_t size, struct librpma_fio_mem *mem)
+{
+       size_t size_mmap = 0;
+       char *mem_ptr = NULL;
+       int is_pmem = 0;
+       size_t ws_offset;
+
+       if (size % page_size) {
+               log_err("fio: size (%zu) is not aligned to page size (%zu)\n",
+                       size, page_size);
+               return NULL;
+       }
+
+       ws_offset = (td->thread_number - 1) * size;
+
+       if (!filename) {
+               log_err("fio: filename is not set\n");
+               return NULL;
+       }
+
+       /* map the file */
+       mem_ptr = pmem_map_file(filename, 0 /* len */, 0 /* flags */,
+                       0 /* mode */, &size_mmap, &is_pmem);
+       if (mem_ptr == NULL) {
+               log_err("fio: pmem_map_file(%s) failed\n", filename);
+               /* pmem_map_file() sets errno on failure */
+               td_verror(td, errno, "pmem_map_file");
+               return NULL;
+       }
+
+       /* pmem is expected */
+       if (!is_pmem) {
+               log_err("fio: %s is not located in persistent memory\n",
+                       filename);
+               goto err_unmap;
+       }
+
+       /* check size of allocated persistent memory */
+       if (size_mmap < ws_offset + size) {
+               log_err(
+                       "fio: %s is too small to handle so many threads (%zu < %zu)\n",
+                       filename, size_mmap, ws_offset + size);
+               goto err_unmap;
+       }
+
+       log_info("fio: size of memory mapped from the file %s: %zu\n",
+               filename, size_mmap);
+
+       mem->mem_ptr = mem_ptr;
+       mem->size_mmap = size_mmap;
+
+       return mem_ptr + ws_offset;
+
+err_unmap:
+       (void) pmem_unmap(mem_ptr, size_mmap);
+       return NULL;
+}
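+
+/*
+ * For illustration: each thread claims its own size-byte slice of the
+ * mapped file at offset (thread_number - 1) * size, so e.g. 4 threads
+ * with size=1GiB require the PMem file to map at least 4GiB.
+ */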
+
+void librpma_fio_free(struct librpma_fio_mem *mem)
+{
+       if (mem->size_mmap)
+               (void) pmem_unmap(mem->mem_ptr, mem->size_mmap);
+       else
+               free(mem->mem_ptr);
+}
+
+#define LIBRPMA_FIO_RETRY_MAX_NO       10
+#define LIBRPMA_FIO_RETRY_DELAY_S      5
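+/* worst case: 10 connection attempts with a 5 s pause after each
+ * rejection, i.e. up to ~45 s before a client gives up on a server
+ * that keeps rejecting */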
+
+int librpma_fio_client_init(struct thread_data *td,
+               struct rpma_conn_cfg *cfg)
+{
+       struct librpma_fio_client_data *ccd;
+       struct librpma_fio_options_values *o = td->eo;
+       struct ibv_context *dev = NULL;
+       char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+       struct rpma_conn_req *req = NULL;
+       enum rpma_conn_event event;
+       struct rpma_conn_private_data pdata;
+       enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
+       int remote_flush_type;
+       int retry;
+       int ret;
+
+       /* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+       if ((1UL << FD_NET) & fio_debug)
+               log_level_aux = RPMA_LOG_LEVEL_INFO;
+#endif
+
+       /* configure logging thresholds to see more details */
+       rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+       rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);
+
+       /* obtain an IBV context for a remote IP address */
+       if ((ret = rpma_utils_get_ibv_context(o->server_ip,
+                       RPMA_UTIL_IBV_CONTEXT_REMOTE, &dev))) {
+               librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+               return -1;
+       }
+
+       /* allocate client's data */
+       ccd = calloc(1, sizeof(*ccd));
+       if (ccd == NULL) {
+               td_verror(td, errno, "calloc");
+               return -1;
+       }
+
+       /* allocate all in-memory queues */
+       ccd->io_us_queued = calloc(td->o.iodepth, sizeof(*ccd->io_us_queued));
+       if (ccd->io_us_queued == NULL) {
+               td_verror(td, errno, "calloc");
+               goto err_free_ccd;
+       }
+
+       ccd->io_us_flight = calloc(td->o.iodepth, sizeof(*ccd->io_us_flight));
+       if (ccd->io_us_flight == NULL) {
+               td_verror(td, errno, "calloc");
+               goto err_free_io_u_queues;
+       }
+
+       ccd->io_us_completed = calloc(td->o.iodepth,
+                       sizeof(*ccd->io_us_completed));
+       if (ccd->io_us_completed == NULL) {
+               td_verror(td, errno, "calloc");
+               goto err_free_io_u_queues;
+       }
+
+       /* create a new peer object */
+       if ((ret = rpma_peer_new(dev, &ccd->peer))) {
+               librpma_td_verror(td, ret, "rpma_peer_new");
+               goto err_free_io_u_queues;
+       }
+
+       /* create a connection request */
+       if (librpma_fio_td_port(o->port, td, port_td))
+               goto err_peer_delete;
+
+       for (retry = 0; retry < LIBRPMA_FIO_RETRY_MAX_NO; retry++) {
+               if ((ret = rpma_conn_req_new(ccd->peer, o->server_ip, port_td,
+                               cfg, &req))) {
+                       librpma_td_verror(td, ret, "rpma_conn_req_new");
+                       goto err_peer_delete;
+               }
+
+               /*
+                * Connect the connection request
+                * and obtain the connection object.
+                */
+               if ((ret = rpma_conn_req_connect(&req, NULL, &ccd->conn))) {
+                       librpma_td_verror(td, ret, "rpma_conn_req_connect");
+                       goto err_req_delete;
+               }
+
+               /* wait for the connection to establish */
+               if ((ret = rpma_conn_next_event(ccd->conn, &event))) {
+                       librpma_td_verror(td, ret, "rpma_conn_next_event");
+                       goto err_conn_delete;
+               } else if (event == RPMA_CONN_ESTABLISHED) {
+                       break;
+               } else if (event == RPMA_CONN_REJECTED) {
+                       (void) rpma_conn_disconnect(ccd->conn);
+                       (void) rpma_conn_delete(&ccd->conn);
+                       if (retry < LIBRPMA_FIO_RETRY_MAX_NO - 1) {
+                               log_err("Thread [%d]: Retrying (#%i) ...\n",
+                                       td->thread_number, retry + 1);
+                               sleep(LIBRPMA_FIO_RETRY_DELAY_S);
+                       } else {
+                               log_err(
+                                       "Thread [%d]: The maximum number of retries exceeded. Closing.\n",
+                                       td->thread_number);
+                       }
+               } else {
+                       log_err(
+                               "rpma_conn_next_event returned an unexpected event: (%s != RPMA_CONN_ESTABLISHED)\n",
+                               rpma_utils_conn_event_2str(event));
+                       goto err_conn_delete;
+               }
+       }
+
+       if (retry > 0)
+               log_err("Thread [%d]: Connected after retry #%i\n",
+                       td->thread_number, retry);
+
+       if (ccd->conn == NULL)
+               goto err_peer_delete;
+
+       /* get the connection's private data sent from the server */
+       if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) {
+               librpma_td_verror(td, ret, "rpma_conn_get_private_data");
+               goto err_conn_delete;
+       }
+
+       /* get the server's workspace representation */
+       ccd->ws = pdata.ptr;
+
+       /* create the server's memory representation */
+       if ((ret = rpma_mr_remote_from_descriptor(&ccd->ws->descriptor[0],
+                       ccd->ws->mr_desc_size, &ccd->server_mr))) {
+               librpma_td_verror(td, ret, "rpma_mr_remote_from_descriptor");
+               goto err_conn_delete;
+       }
+
+       /* get the total size of the shared server memory */
+       if ((ret = rpma_mr_remote_get_size(ccd->server_mr, &ccd->ws_size))) {
+               librpma_td_verror(td, ret, "rpma_mr_remote_get_size");
+               goto err_conn_delete;
+       }
+
+       /* get flush type of the remote node */
+       if ((ret = rpma_mr_remote_get_flush_type(ccd->server_mr,
+                       &remote_flush_type))) {
+               librpma_td_verror(td, ret, "rpma_mr_remote_get_flush_type");
+               goto err_conn_delete;
+       }
+
+       ccd->server_mr_flush_type =
+               (remote_flush_type & RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT) ?
+               RPMA_FLUSH_TYPE_PERSISTENT : RPMA_FLUSH_TYPE_VISIBILITY;
+
+       /*
+        * Ensure the io_us buffer allocation is page-size-aligned, which is
+        * required to register the memory for RDMA. Any user-provided
+        * mem_align value is intentionally ignored.
+        */
+       td->o.mem_align = page_size;
+
+       td->io_ops_data = ccd;
+
+       return 0;
+
+err_conn_delete:
+       (void) rpma_conn_disconnect(ccd->conn);
+       (void) rpma_conn_delete(&ccd->conn);
+
+err_req_delete:
+       (void) rpma_conn_req_delete(&req);
+
+err_peer_delete:
+       (void) rpma_peer_delete(&ccd->peer);
+
+err_free_io_u_queues:
+       free(ccd->io_us_queued);
+       free(ccd->io_us_flight);
+       free(ccd->io_us_completed);
+
+err_free_ccd:
+       free(ccd);
+
+       return -1;
+}
+
+void librpma_fio_client_cleanup(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       enum rpma_conn_event ev;
+       int ret;
+
+       if (ccd == NULL)
+               return;
+
+       /* delete the io_us' memory registration */
+       if ((ret = rpma_mr_dereg(&ccd->orig_mr)))
+               librpma_td_verror(td, ret, "rpma_mr_dereg");
+       /* delete the server's memory representation */
+       if ((ret = rpma_mr_remote_delete(&ccd->server_mr)))
+               librpma_td_verror(td, ret, "rpma_mr_remote_delete");
+       /* initiate disconnection */
+       if ((ret = rpma_conn_disconnect(ccd->conn)))
+               librpma_td_verror(td, ret, "rpma_conn_disconnect");
+       /* wait for the disconnection to complete */
+       if ((ret = rpma_conn_next_event(ccd->conn, &ev))) {
+               librpma_td_verror(td, ret, "rpma_conn_next_event");
+       } else if (ev != RPMA_CONN_CLOSED) {
+               log_err(
+                       "client_cleanup received an unexpected event (%s != RPMA_CONN_CLOSED)\n",
+                       rpma_utils_conn_event_2str(ev));
+       }
+       /* delete the connection */
+       if ((ret = rpma_conn_delete(&ccd->conn)))
+               librpma_td_verror(td, ret, "rpma_conn_delete");
+       /* delete the peer */
+       if ((ret = rpma_peer_delete(&ccd->peer)))
+               librpma_td_verror(td, ret, "rpma_peer_delete");
+       /* free the software queues */
+       free(ccd->io_us_queued);
+       free(ccd->io_us_flight);
+       free(ccd->io_us_completed);
+       free(ccd);
+       td->io_ops_data = NULL; /* zero ccd */
+}
+
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f)
+{
+       /* NOP */
+       return 0;
+}
+
+int librpma_fio_client_post_init(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       size_t io_us_size;
+       int ret;
+
+       /*
+        * td->orig_buffer is not aligned. The engine requires aligned io_us,
+        * so the address is aligned up using the formula below.
+        */
+       ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+                       td->o.mem_align;
+
+       /*
+        * Beside the space actually consumed by io_us, td->orig_buffer_size
+        * includes padding which can be omitted from the memory registration.
+        */
+       io_us_size = (unsigned long long)td_max_bs(td) *
+                       (unsigned long long)td->o.iodepth;
+
+       if ((ret = rpma_mr_reg(ccd->peer, ccd->orig_buffer_aligned, io_us_size,
+                       RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+                       RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+                       RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT, &ccd->orig_mr)))
+               librpma_td_verror(td, ret, "rpma_mr_reg");
+       return ret;
+}
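+
+/*
+ * For illustration: with bs=4k and iodepth=16, io_us_size above is
+ * 4096 * 16 == 64 KiB, so only that much of td->orig_buffer gets
+ * registered and fio's trailing padding is skipped.
+ */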
+
+int librpma_fio_client_get_file_size(struct thread_data *td,
+               struct fio_file *f)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+       f->real_file_size = ccd->ws_size;
+       fio_file_set_size_known(f);
+
+       return 0;
+}
+
+static enum fio_q_status client_queue_sync(struct thread_data *td,
+               struct io_u *io_u)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct rpma_completion cmpl;
+       unsigned io_u_index;
+       int ret;
+
+       /* execute io_u */
+       if (io_u->ddir == DDIR_READ) {
+               /* post an RDMA read operation */
+               if (librpma_fio_client_io_read(td, io_u,
+                               RPMA_F_COMPLETION_ALWAYS))
+                       goto err;
+       } else if (io_u->ddir == DDIR_WRITE) {
+               /* post an RDMA write operation */
+               if (librpma_fio_client_io_write(td, io_u))
+                       goto err;
+               if (ccd->flush(td, io_u, io_u, io_u->xfer_buflen))
+                       goto err;
+       } else {
+               log_err("unsupported IO mode: %s\n", io_ddir_name(io_u->ddir));
+               goto err;
+       }
+
+       do {
+               /* get a completion */
+               ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+               if (ret == RPMA_E_NO_COMPLETION) {
+                       /* lack of completion is not an error */
+                       continue;
+               } else if (ret != 0) {
+                       /* an error occurred */
+                       librpma_td_verror(td, ret, "rpma_conn_completion_get");
+                       goto err;
+               }
+
+               /* if the io_u completed with an error */
+               if (cmpl.op_status != IBV_WC_SUCCESS)
+                       goto err;
+
+               if (cmpl.op == RPMA_OP_SEND)
+                       ++ccd->op_send_completed;
+               else {
+                       if (cmpl.op == RPMA_OP_RECV)
+                               ++ccd->op_recv_completed;
+
+                       break;
+               }
+       } while (1);
+
+       if (ccd->get_io_u_index(&cmpl, &io_u_index) != 1)
+               goto err;
+
+       if (io_u->index != io_u_index) {
+               log_err(
+                       "no matching io_u for received completion found (io_u_index=%u)\n",
+                       io_u_index);
+               goto err;
+       }
+
+       /* make sure all SENDs are completed before exit - clean up SQ */
+       if (librpma_fio_client_io_complete_all_sends(td))
+               goto err;
+
+       return FIO_Q_COMPLETED;
+
+err:
+       io_u->error = -1;
+       return FIO_Q_COMPLETED;
+}
+
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+               struct io_u *io_u)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+       if (ccd->io_u_queued_nr == (int)td->o.iodepth)
+               return FIO_Q_BUSY;
+
+       if (td->o.sync_io)
+               return client_queue_sync(td, io_u);
+
+       /* io_u -> queued[] */
+       ccd->io_us_queued[ccd->io_u_queued_nr] = io_u;
+       ccd->io_u_queued_nr++;
+
+       return FIO_Q_QUEUED;
+}
+
+int librpma_fio_client_commit(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       int flags = RPMA_F_COMPLETION_ON_ERROR;
+       struct timespec now;
+       bool fill_time;
+       int i;
+       struct io_u *flush_first_io_u = NULL;
+       unsigned long long int flush_len = 0;
+
+       if (!ccd->io_us_queued)
+               return -1;
+
+       /* execute all io_us from queued[] */
+       for (i = 0; i < ccd->io_u_queued_nr; i++) {
+               struct io_u *io_u = ccd->io_us_queued[i];
+
+               if (io_u->ddir == DDIR_READ) {
+                       if (i + 1 == ccd->io_u_queued_nr ||
+                           ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE)
+                               flags = RPMA_F_COMPLETION_ALWAYS;
+                       /* post an RDMA read operation */
+                       if (librpma_fio_client_io_read(td, io_u, flags))
+                               return -1;
+               } else if (io_u->ddir == DDIR_WRITE) {
+                       /* post an RDMA write operation */
+                       if (librpma_fio_client_io_write(td, io_u))
+                               return -1;
+
+                       /* cache the first io_u in the sequence */
+                       if (flush_first_io_u == NULL)
+                               flush_first_io_u = io_u;
+
+                       /*
+                        * the flush length is the sum of all io_u's creating
+                        * the sequence
+                        */
+                       flush_len += io_u->xfer_buflen;
+
+                       /*
+                        * if io_u's are random the rpma_flush is required
+                        * after each one of them
+                        */
+                       if (!td_random(td)) {
+                               /*
+                                * When the io_u's are sequential and
+                                * the current io_u is not the last one and
+                                * the next one is also a write operation
+                                * the flush can be postponed by one io_u and
+                                * cover all of them which build a continuous
+                                * sequence.
+                                */
+                               if ((i + 1 < ccd->io_u_queued_nr) &&
+                                   (ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE))
+                                       continue;
+                       }
+
+                       /* flush all writes which build a continuous sequence */
+                       if (ccd->flush(td, flush_first_io_u, io_u, flush_len))
+                               return -1;
+
+                       /*
+                        * reset the flush parameters in preparation for
+                        * the next one
+                        */
+                       flush_first_io_u = NULL;
+                       flush_len = 0;
+               } else {
+                       log_err("unsupported IO mode: %s\n",
+                               io_ddir_name(io_u->ddir));
+                       return -1;
+               }
+       }
+
+       if ((fill_time = fio_fill_issue_time(td)))
+               fio_gettime(&now, NULL);
+
+       /* move executed io_us from queued[] to flight[] */
+       for (i = 0; i < ccd->io_u_queued_nr; i++) {
+               struct io_u *io_u = ccd->io_us_queued[i];
+
+               /* FIO does not do this if the engine is asynchronous */
+               if (fill_time)
+                       memcpy(&io_u->issue_time, &now, sizeof(now));
+
+               /* move executed io_us from queued[] to flight[] */
+               ccd->io_us_flight[ccd->io_u_flight_nr] = io_u;
+               ccd->io_u_flight_nr++;
+
+               /*
+                * FIO says:
+                * If an engine has the commit hook
+                * it has to call io_u_queued() itself.
+                */
+               io_u_queued(td, io_u);
+       }
+
+       /* FIO does not do this if an engine has the commit hook. */
+       io_u_mark_submit(td, ccd->io_u_queued_nr);
+       ccd->io_u_queued_nr = 0;
+
+       return 0;
+}
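+
+/*
+ * An illustration of the flush coalescing above: if a sequential job's
+ * queued[] holds WRITE WRITE WRITE READ, the three writes are posted
+ * back to back and a single flush covering their summed xfer_buflen is
+ * posted before the read; a random job gets one flush per write
+ * instead.
+ */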
+
+/*
+ * RETURN VALUE
+ * - > 0  - the number of completed io_us
+ * -   0  - when no completions were received
+ * - (-1) - when an error occurred
+ */
+static int client_getevent_process(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct rpma_completion cmpl;
+       /* io_u->index of completed io_u (cmpl.op_context) */
+       unsigned int io_u_index;
+       /* # of completed io_us */
+       int cmpl_num = 0;
+       /* helpers */
+       struct io_u *io_u;
+       int i;
+       int ret;
+
+       /* get a completion */
+       if ((ret = rpma_conn_completion_get(ccd->conn, &cmpl))) {
+               if (ret == RPMA_E_NO_COMPLETION) {
+                       /* lack of completion is not an error */
+                       return 0;
+               }
+
+               /* an error occurred */
+               librpma_td_verror(td, ret, "rpma_conn_completion_get");
+               return -1;
+       }
+
+       /* if the io_u completed with an error */
+       if (cmpl.op_status != IBV_WC_SUCCESS) {
+               td->error = cmpl.op_status;
+               return -1;
+       }
+
+       if (cmpl.op == RPMA_OP_SEND)
+               ++ccd->op_send_completed;
+       else if (cmpl.op == RPMA_OP_RECV)
+               ++ccd->op_recv_completed;
+
+       if ((ret = ccd->get_io_u_index(&cmpl, &io_u_index)) != 1)
+               return ret;
+
+       /* look for an io_u being completed */
+       for (i = 0; i < ccd->io_u_flight_nr; ++i) {
+               if (ccd->io_us_flight[i]->index == io_u_index) {
+                       cmpl_num = i + 1;
+                       break;
+               }
+       }
+
+       /* if no matching io_u has been found */
+       if (cmpl_num == 0) {
+               log_err(
+                       "no matching io_u for received completion found (io_u_index=%u)\n",
+                       io_u_index);
+               return -1;
+       }
+
+       /* move completed io_us to the completed in-memory queue */
+       for (i = 0; i < cmpl_num; ++i) {
+               /* get and prepare io_u */
+               io_u = ccd->io_us_flight[i];
+
+               /* append to the queue */
+               ccd->io_us_completed[ccd->io_u_completed_nr] = io_u;
+               ccd->io_u_completed_nr++;
+       }
+
+       /* remove completed io_us from the flight queue */
+       for (i = cmpl_num; i < ccd->io_u_flight_nr; ++i)
+               ccd->io_us_flight[i - cmpl_num] = ccd->io_us_flight[i];
+       ccd->io_u_flight_nr -= cmpl_num;
+
+       return cmpl_num;
+}
+
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+               unsigned int max, const struct timespec *t)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       /* total # of completed io_us */
+       int cmpl_num_total = 0;
+       /* # of completed io_us from a single event */
+       int cmpl_num;
+
+       do {
+               cmpl_num = client_getevent_process(td);
+               if (cmpl_num > 0) {
+                       /* new completions collected */
+                       cmpl_num_total += cmpl_num;
+               } else if (cmpl_num == 0) {
+                       /*
+                        * It is required to make sure that CQEs for SENDs
+                        * will flow at least at the same pace as CQEs for RECVs.
+                        */
+                       if (cmpl_num_total >= min &&
+                           ccd->op_send_completed >= ccd->op_recv_completed)
+                               break;
+
+                       /*
+                        * To reduce CPU consumption one can use
+                        * the rpma_conn_completion_wait() function.
+                        * Note this greatly increases the latency
+                        * and makes the results less stable.
+                        * The bandwidth stays more or less the same.
+                        */
+               } else {
+                       /* an error occurred */
+                       return -1;
+               }
+
+               /*
+                * The expected max can be exceeded if CQEs for RECVs come up
+                * faster than CQEs for SENDs. But it is required to make sure
+                * CQEs for SENDs flow at least at the same pace as CQEs for RECVs.
+                */
+       } while (cmpl_num_total < max ||
+                       ccd->op_send_completed < ccd->op_recv_completed);
+
+       /*
+        * All posted SENDs are completed and RECVs for them (responses) are
+        * completed. This restores the initial state, so the counters are reset.
+        */
+       if (ccd->op_send_posted == ccd->op_send_completed &&
+                       ccd->op_send_completed == ccd->op_recv_completed) {
+               ccd->op_send_posted = 0;
+               ccd->op_send_completed = 0;
+               ccd->op_recv_completed = 0;
+       }
+
+       return cmpl_num_total;
+}
+
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct io_u *io_u;
+       int i;
+
+       /* get the first io_u from the queue */
+       io_u = ccd->io_us_completed[0];
+
+       /* remove the first io_u from the queue */
+       for (i = 1; i < ccd->io_u_completed_nr; ++i)
+               ccd->io_us_completed[i - 1] = ccd->io_us_completed[i];
+       ccd->io_u_completed_nr--;
+
+       dprint_io_u(io_u, "client_event");
+
+       return io_u;
+}
+
+char *librpma_fio_client_errdetails(struct io_u *io_u)
+{
+       /* get the string representation of an error */
+       enum ibv_wc_status status = io_u->error;
+       const char *status_str = ibv_wc_status_str(status);
+
+       char *details = strdup(status_str);
+       if (details == NULL) {
+               fprintf(stderr, "Error: %s\n", status_str);
+               fprintf(stderr, "Fatal error: out of memory. Aborting.\n");
+               abort();
+       }
+
+       /* FIO frees the returned string when it becomes obsolete */
+       return details;
+}
+
+int librpma_fio_server_init(struct thread_data *td)
+{
+       struct librpma_fio_options_values *o = td->eo;
+       struct librpma_fio_server_data *csd;
+       struct ibv_context *dev = NULL;
+       enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
+       int ret = -1;
+
+       /* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+       if ((1UL << FD_NET) & fio_debug)
+               log_level_aux = RPMA_LOG_LEVEL_INFO;
+#endif
+
+       /* configure logging thresholds to see more details */
+       rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+       rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);
+
+       /* obtain an IBV context for a remote IP address */
+       if ((ret = rpma_utils_get_ibv_context(o->server_ip,
+                       RPMA_UTIL_IBV_CONTEXT_LOCAL, &dev))) {
+               librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+               return -1;
+       }
+
+       /* allocate server's data */
+       csd = calloc(1, sizeof(*csd));
+       if (csd == NULL) {
+               td_verror(td, errno, "calloc");
+               return -1;
+       }
+
+       /* create a new peer object */
+       if ((ret = rpma_peer_new(dev, &csd->peer))) {
+               librpma_td_verror(td, ret, "rpma_peer_new");
+               goto err_free_csd;
+       }
+
+       td->io_ops_data = csd;
+
+       return 0;
+
+err_free_csd:
+       free(csd);
+
+       return -1;
+}
+
+void librpma_fio_server_cleanup(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       int ret;
+
+       if (csd == NULL)
+               return;
+
+       /* free the peer */
+       if ((ret = rpma_peer_delete(&csd->peer)))
+               librpma_td_verror(td, ret, "rpma_peer_delete");
+
+       free(csd);
+}
+
+int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
+               struct rpma_conn_cfg *cfg)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct librpma_fio_options_values *o = td->eo;
+       enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
+       struct librpma_fio_workspace ws = {0};
+       struct rpma_conn_private_data pdata;
+       uint32_t max_msg_num;
+       struct rpma_conn_req *conn_req;
+       struct rpma_conn *conn;
+       struct rpma_mr_local *mr;
+       char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+       struct rpma_ep *ep;
+       size_t mem_size = td->o.size;
+       size_t mr_desc_size;
+       void *ws_ptr;
+       int usage_mem_type;
+       int ret;
+
+       if (!f->file_name) {
+               log_err("fio: filename is not set\n");
+               return -1;
+       }
+
+       /* start a listening endpoint at addr:port */
+       if (librpma_fio_td_port(o->port, td, port_td))
+               return -1;
+
+       if ((ret = rpma_ep_listen(csd->peer, o->server_ip, port_td, &ep))) {
+               librpma_td_verror(td, ret, "rpma_ep_listen");
+               return -1;
+       }
+
+       if (strcmp(f->file_name, "malloc") == 0) {
+               /* allocation from DRAM using posix_memalign() */
+               ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
+               usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
+       } else {
+               /* allocation from PMEM using pmem_map_file() */
+               ws_ptr = librpma_fio_allocate_pmem(td, f->file_name,
+                               mem_size, &csd->mem);
+               usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
+       }
+
+       if (ws_ptr == NULL)
+               goto err_ep_shutdown;
+
+       f->real_file_size = mem_size;
+
+       if ((ret = rpma_mr_reg(csd->peer, ws_ptr, mem_size,
+                       RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+                       RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+                       usage_mem_type, &mr))) {
+               librpma_td_verror(td, ret, "rpma_mr_reg");
+               goto err_free;
+       }
+
+       /* get size of the memory region's descriptor */
+       if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
+               librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
+               goto err_mr_dereg;
+       }
+
+       /* verify size of the memory region's descriptor */
+       if (mr_desc_size > LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE) {
+               log_err(
+                       "size of the memory region's descriptor is too big (max=%i)\n",
+                       LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE);
+               goto err_mr_dereg;
+       }
+
+       /* get the memory region's descriptor */
+       if ((ret = rpma_mr_get_descriptor(mr, &ws.descriptor[0]))) {
+               librpma_td_verror(td, ret, "rpma_mr_get_descriptor");
+               goto err_mr_dereg;
+       }
+
+       if (cfg != NULL) {
+               if ((ret = rpma_conn_cfg_get_rq_size(cfg, &max_msg_num))) {
+                       librpma_td_verror(td, ret, "rpma_conn_cfg_get_rq_size");
+                       goto err_mr_dereg;
+               }
+
+               /* verify whether iodepth fits into uint16_t */
+               if (max_msg_num > UINT16_MAX) {
+                       log_err("fio: iodepth too big (%u > %u)\n",
+                               max_msg_num, UINT16_MAX);
+                       goto err_mr_dereg;
+               }
+
+               ws.max_msg_num = max_msg_num;
+       }
+
+       /* prepare a workspace description */
+       ws.direct_write_to_pmem = o->direct_write_to_pmem;
+       ws.mr_desc_size = mr_desc_size;
+       pdata.ptr = &ws;
+       pdata.len = sizeof(ws);
+
+       /* receive an incoming connection request */
+       if ((ret = rpma_ep_next_conn_req(ep, cfg, &conn_req))) {
+               librpma_td_verror(td, ret, "rpma_ep_next_conn_req");
+               goto err_mr_dereg;
+       }
+
+       if (csd->prepare_connection && csd->prepare_connection(td, conn_req))
+               goto err_req_delete;
+
+       /* accept the connection request and obtain the connection object */
+       if ((ret = rpma_conn_req_connect(&conn_req, &pdata, &conn))) {
+               librpma_td_verror(td, ret, "rpma_conn_req_connect");
+               goto err_req_delete;
+       }
+
+       /* wait for the connection to be established */
+       if ((ret = rpma_conn_next_event(conn, &conn_event))) {
+               librpma_td_verror(td, ret, "rpma_conn_next_event");
+               goto err_conn_delete;
+       } else if (conn_event != RPMA_CONN_ESTABLISHED) {
+               log_err("rpma_conn_next_event returned an unexpected event\n");
+               goto err_conn_delete;
+       }
+
+       /* end-point is no longer needed */
+       (void) rpma_ep_shutdown(&ep);
+
+       csd->ws_mr = mr;
+       csd->ws_ptr = ws_ptr;
+       csd->conn = conn;
+
+       return 0;
+
+err_conn_delete:
+       (void) rpma_conn_delete(&conn);
+
+err_req_delete:
+       (void) rpma_conn_req_delete(&conn_req);
+
+err_mr_dereg:
+       (void) rpma_mr_dereg(&mr);
+
+err_free:
+       librpma_fio_free(&csd->mem);
+
+err_ep_shutdown:
+       (void) rpma_ep_shutdown(&ep);
+
+       return -1;
+}
+
+int librpma_fio_server_close_file(struct thread_data *td, struct fio_file *f)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
+       int rv = 0;
+       int ret;
+
+       /* wait for the connection to be closed */
+       ret = rpma_conn_next_event(csd->conn, &conn_event);
+       if (!ret && conn_event != RPMA_CONN_CLOSED) {
+               log_err("rpma_conn_next_event returned an unexpected event\n");
+               rv = -1;
+       }
+
+       if ((ret = rpma_conn_disconnect(csd->conn))) {
+               librpma_td_verror(td, ret, "rpma_conn_disconnect");
+               rv = -1;
+       }
+
+       if ((ret = rpma_conn_delete(&csd->conn))) {
+               librpma_td_verror(td, ret, "rpma_conn_delete");
+               rv = -1;
+       }
+
+       if ((ret = rpma_mr_dereg(&csd->ws_mr))) {
+               librpma_td_verror(td, ret, "rpma_mr_dereg");
+               rv = -1;
+       }
+
+       librpma_fio_free(&csd->mem);
+
+       return rv;
+}
diff --git a/engines/librpma_fio.h b/engines/librpma_fio.h
new file mode 100644 (file)
index 0000000..8cfb2e2
--- /dev/null
@@ -0,0 +1,273 @@
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common header.
+ *
+ * Copyright 2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef LIBRPMA_FIO_H
+#define LIBRPMA_FIO_H 1
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#include <librpma.h>
+
+/* servers' and clients' common */
+
+#define librpma_td_verror(td, err, func) \
+       td_vmsg((td), (err), rpma_err_2str(err), (func))
+
+/* ceil(a / b) = (a + b - 1) / b */
+#define LIBRPMA_FIO_CEIL(a, b) (((a) + (b) - 1) / (b))
+
+/* common option structure for server and client */
+struct librpma_fio_options_values {
+       /*
+        * FIO considers .off1 == 0 to mean the option is absent, so the
+        * first meaningful field has to have padding ahead of it.
+        */
+       void *pad;
+       char *server_ip;
+       /* base server listening port */
+       char *port;
+       /* Direct Write to PMem is possible */
+       unsigned int direct_write_to_pmem;
+};
+
+extern struct fio_option librpma_fio_options[];
+
+/*
+ * Limited by the maximum length of the private data
+ * for rdma_connect() in case of RDMA_PS_TCP (28 bytes).
+ */
+#define LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE 24
+
+struct librpma_fio_workspace {
+       uint16_t max_msg_num;   /* # of RQ slots */
+       uint8_t direct_write_to_pmem; /* Direct Write to PMem is possible */
+       uint8_t mr_desc_size;   /* size of mr_desc in descriptor[] */
+       /* buffer containing mr_desc */
+       char descriptor[LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE];
+};
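+
+/*
+ * sizeof(struct librpma_fio_workspace) == 2 + 1 + 1 + 24 == 28 bytes,
+ * which exactly fits the RDMA_PS_TCP private-data limit quoted above.
+ */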
+
+#define LIBRPMA_FIO_PORT_STR_LEN_MAX 12
+
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+               char *port_out);
+
+struct librpma_fio_mem {
+       /* memory buffer */
+       char *mem_ptr;
+
+       /* size of the mapped persistent memory */
+       size_t size_mmap;
+};
+
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+               struct librpma_fio_mem *mem);
+
+char *librpma_fio_allocate_pmem(struct thread_data *td, const char *filename,
+               size_t size, struct librpma_fio_mem *mem);
+
+void librpma_fio_free(struct librpma_fio_mem *mem);
+
+/* clients' common */
+
+typedef int (*librpma_fio_flush_t)(struct thread_data *td,
+               struct io_u *first_io_u, struct io_u *last_io_u,
+               unsigned long long int len);
+
+/*
+ * RETURN VALUE
+ * - ( 1) - on success
+ * - ( 0) - skip
+ * - (-1) - on error
+ */
+typedef int (*librpma_fio_get_io_u_index_t)(struct rpma_completion *cmpl,
+               unsigned int *io_u_index);
+
+struct librpma_fio_client_data {
+       struct rpma_peer *peer;
+       struct rpma_conn *conn;
+
+       /* aligned td->orig_buffer */
+       char *orig_buffer_aligned;
+
+       /* io_us' base address memory registration (ccd->orig_buffer_aligned) */
+       struct rpma_mr_local *orig_mr;
+
+       struct librpma_fio_workspace *ws;
+
+       /* a server's memory representation */
+       struct rpma_mr_remote *server_mr;
+       enum rpma_flush_type server_mr_flush_type;
+
+       /* remote workspace description */
+       size_t ws_size;
+
+       /* in-memory queues */
+       struct io_u **io_us_queued;
+       int io_u_queued_nr;
+       struct io_u **io_us_flight;
+       int io_u_flight_nr;
+       struct io_u **io_us_completed;
+       int io_u_completed_nr;
+
+       /* SQ control. Note: all of them have to be kept in sync. */
+       uint32_t op_send_posted;
+       uint32_t op_send_completed;
+       uint32_t op_recv_completed;
+
+       librpma_fio_flush_t flush;
+       librpma_fio_get_io_u_index_t get_io_u_index;
+
+       /* engine-specific client data */
+       void *client_data;
+};
+
+int librpma_fio_client_init(struct thread_data *td,
+               struct rpma_conn_cfg *cfg);
+void librpma_fio_client_cleanup(struct thread_data *td);
+
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f);
+int librpma_fio_client_get_file_size(struct thread_data *td,
+               struct fio_file *f);
+
+int librpma_fio_client_post_init(struct thread_data *td);
+
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+               struct io_u *io_u);
+
+int librpma_fio_client_commit(struct thread_data *td);
+
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+               unsigned int max, const struct timespec *t);
+
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event);
+
+char *librpma_fio_client_errdetails(struct io_u *io_u);
+
+static inline int librpma_fio_client_io_read(struct thread_data *td,
+               struct io_u *io_u, int flags)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       size_t dst_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+       size_t src_offset = io_u->offset;
+       int ret;
+
+       if ((ret = rpma_read(ccd->conn, ccd->orig_mr, dst_offset,
+                       ccd->server_mr, src_offset, io_u->xfer_buflen,
+                       flags, (void *)(uintptr_t)io_u->index))) {
+               librpma_td_verror(td, ret, "rpma_read");
+               return -1;
+       }
+
+       return 0;
+}
+
+static inline int librpma_fio_client_io_write(struct thread_data *td,
+               struct io_u *io_u)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       size_t src_offset = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+       size_t dst_offset = io_u->offset;
+       int ret;
+
+       if ((ret = rpma_write(ccd->conn, ccd->server_mr, dst_offset,
+                       ccd->orig_mr, src_offset, io_u->xfer_buflen,
+                       RPMA_F_COMPLETION_ON_ERROR,
+                       (void *)(uintptr_t)io_u->index))) {
+               librpma_td_verror(td, ret, "rpma_write");
+               return -1;
+       }
+
+       return 0;
+}
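+
+/*
+ * In both helpers above, the local offset is the io_u's position inside
+ * the registered buffer (xfer_buf - orig_buffer_aligned) and the remote
+ * offset is the io_u's file offset, applied within the server's memory
+ * region.
+ */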
+
+static inline int librpma_fio_client_io_complete_all_sends(
+               struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct rpma_completion cmpl;
+       int ret;
+
+       while (ccd->op_send_posted != ccd->op_send_completed) {
+               /* get a completion */
+               ret = rpma_conn_completion_get(ccd->conn, &cmpl);
+               if (ret == RPMA_E_NO_COMPLETION) {
+                       /* lack of completion is not an error */
+                       continue;
+               } else if (ret != 0) {
+                       /* an error occurred */
+                       librpma_td_verror(td, ret, "rpma_conn_completion_get");
+                       break;
+               }
+
+               if (cmpl.op_status != IBV_WC_SUCCESS)
+                       return -1;
+
+               if (cmpl.op == RPMA_OP_SEND)
+                       ++ccd->op_send_completed;
+               else {
+                       log_err(
+                               "A completion other than RPMA_OP_SEND was received while cleaning up the CQ from SENDs\n");
+                       return -1;
+               }
+       }
+
+       /*
+        * All posted SENDs are completed and RECVs for them (responses) are
+        * completed. This restores the initial state, so the counters are reset.
+        */
+       if (ccd->op_send_posted == ccd->op_send_completed &&
+                       ccd->op_send_completed == ccd->op_recv_completed) {
+               ccd->op_send_posted = 0;
+               ccd->op_send_completed = 0;
+               ccd->op_recv_completed = 0;
+       }
+
+       return 0;
+}
+
+/* servers' common */
+
+typedef int (*librpma_fio_prepare_connection_t)(
+               struct thread_data *td,
+               struct rpma_conn_req *conn_req);
+
+struct librpma_fio_server_data {
+       struct rpma_peer *peer;
+
+       /* resources of an incoming connection */
+       struct rpma_conn *conn;
+
+       char *ws_ptr;
+       struct rpma_mr_local *ws_mr;
+       struct librpma_fio_mem mem;
+
+       /* engine-specific server data */
+       void *server_data;
+
+       librpma_fio_prepare_connection_t prepare_connection;
+};
+
+int librpma_fio_server_init(struct thread_data *td);
+
+void librpma_fio_server_cleanup(struct thread_data *td);
+
+int librpma_fio_server_open_file(struct thread_data *td,
+               struct fio_file *f, struct rpma_conn_cfg *cfg);
+
+int librpma_fio_server_close_file(struct thread_data *td,
+               struct fio_file *f);
+
+#endif /* LIBRPMA_FIO_H */
diff --git a/engines/librpma_gpspm.c b/engines/librpma_gpspm.c
new file mode 100644 (file)
index 0000000..ac614f4
--- /dev/null
@@ -0,0 +1,755 @@
+/*
+ * librpma_gpspm: IO engine that uses PMDK librpma to write data,
+ *             based on General Purpose Server Persistency Method
+ *
+ * Copyright 2020-2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+#include <libpmem.h>
+
+/* Generated by the protocol buffer compiler from: librpma_gpspm_flush.proto */
+#include "librpma_gpspm_flush.pb-c.h"
+
+#define MAX_MSG_SIZE (512)
+#define IO_U_BUF_LEN (2 * MAX_MSG_SIZE)
+#define SEND_OFFSET (0)
+#define RECV_OFFSET (SEND_OFFSET + MAX_MSG_SIZE)
+
+#define GPSPM_FLUSH_REQUEST__LAST \
+       { PROTOBUF_C_MESSAGE_INIT(&gpspm_flush_request__descriptor), 0, 0, 0 }
+
+/*
+ * 'Flush_req_last' is the last flush request
+ * the client has to send to the server to indicate
+ * that the client is done.
+ */
+static const GPSPMFlushRequest Flush_req_last = GPSPM_FLUSH_REQUEST__LAST;
+
+#define IS_NOT_THE_LAST_MESSAGE(flush_req) \
+       (flush_req->length != Flush_req_last.length || \
+       flush_req->offset != Flush_req_last.offset)
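+
+/*
+ * i.e. a request with offset == 0 and length == 0 acts as the
+ * termination notice; real flush requests always carry a non-zero
+ * length.
+ */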
+
+/* client side implementation */
+
+/* get the next io_u message buffer in round-robin fashion */
+#define IO_U_NEXT_BUF_OFF_CLIENT(cd) \
+       (IO_U_BUF_LEN * ((cd->msg_curr++) % cd->msg_num))
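+
+/*
+ * e.g. with cd->msg_num == 4, successive calls return offsets 0,
+ * IO_U_BUF_LEN, 2 * IO_U_BUF_LEN, 3 * IO_U_BUF_LEN, 0, ...
+ */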
+
+struct client_data {
+       /* memory for the send and receive buffers */
+       char *io_us_msgs;
+
+       /* resources for messaging buffer */
+       uint32_t msg_num;
+       uint32_t msg_curr;
+       struct rpma_mr_local *msg_mr;
+};
+
+static inline int client_io_flush(struct thread_data *td,
+               struct io_u *first_io_u, struct io_u *last_io_u,
+               unsigned long long int len);
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+               unsigned int *io_u_index);
+
+static int client_init(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd;
+       struct client_data *cd;
+       uint32_t write_num;
+       struct rpma_conn_cfg *cfg = NULL;
+       int ret;
+
+       /*
+        * not supported:
+        * - readwrite = read / trim / randread / randtrim /
+        *               rw / randrw / trimwrite
+        */
+       if (td_read(td) || td_trim(td)) {
+               td_verror(td, EINVAL, "Unsupported mode.");
+               return -1;
+       }
+
+       /* allocate client's data */
+       cd = calloc(1, sizeof(*cd));
+       if (cd == NULL) {
+               td_verror(td, errno, "calloc");
+               return -1;
+       }
+
+       /*
+        * Calculate the required number of WRITEs and FLUSHes.
+        *
+        * Note: Each flush is a request (SEND) and response (RECV) pair.
+        */
+       if (td_random(td)) {
+               write_num = td->o.iodepth; /* WRITE * N */
+               cd->msg_num = td->o.iodepth; /* FLUSH * N */
+       } else {
+               if (td->o.sync_io) {
+                       write_num = 1; /* WRITE */
+                       cd->msg_num = 1; /* FLUSH */
+               } else {
+                       write_num = td->o.iodepth; /* WRITE * N */
+                       /*
+                        * FLUSH * B where:
+                        * - B == ceil(iodepth / iodepth_batch)
+                        *   which is the number of batches for N writes
+                        */
+                       cd->msg_num = LIBRPMA_FIO_CEIL(td->o.iodepth,
+                                       td->o.iodepth_batch);
+               }
+       }
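+
+       /*
+        * For illustration: a sequential asynchronous job with iodepth=8
+        * and iodepth_batch=2 yields write_num == 8 and cd->msg_num ==
+        * ceil(8 / 2) == 4.
+        */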
+
+       /* create a connection configuration object */
+       if ((ret = rpma_conn_cfg_new(&cfg))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+               goto err_free_cd;
+       }
+
+       /*
+        * Calculate the required queue sizes where:
+        * - the send queue (SQ) has to be big enough to accommodate
+        *   all io_us (WRITEs) and all flush requests (SENDs)
+        * - the receive queue (RQ) has to be big enough to accommodate
+        *   all flush responses (RECVs)
+        * - the completion queue (CQ) has to be big enough to accommodate all
+        *   success and error completions (sq_size + rq_size)
+        */
+       if ((ret = rpma_conn_cfg_set_sq_size(cfg, write_num + cd->msg_num))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+               goto err_cfg_delete;
+       }
+       if ((ret = rpma_conn_cfg_set_rq_size(cfg, cd->msg_num))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+               goto err_cfg_delete;
+       }
+       if ((ret = rpma_conn_cfg_set_cq_size(cfg, write_num + cd->msg_num * 2))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+               goto err_cfg_delete;
+       }
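+
+       /*
+        * Continuing the example above: sq_size == 8 + 4 == 12,
+        * rq_size == 4 and cq_size == 8 + 2 * 4 == 16.
+        */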
+
+       if (librpma_fio_client_init(td, cfg))
+               goto err_cfg_delete;
+
+       ccd = td->io_ops_data;
+
+       if (ccd->ws->direct_write_to_pmem &&
+           ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT &&
+           td->thread_number == 1) {
+               /* XXX log_info mixes with the JSON output */
+               log_err(
+                       "Note: The server side supports Direct Write to PMem and it is equipped with PMem (direct_write_to_pmem).\n"
+                       "You can use the librpma_apm_client and librpma_apm_server engines for better performance instead of GPSPM.\n");
+       }
+
+       /* validate the server's RQ capacity */
+       if (cd->msg_num > ccd->ws->max_msg_num) {
+               log_err(
+                       "server's RQ size (iodepth) too small to handle the client's workspace requirements (%u < %u)\n",
+                       ccd->ws->max_msg_num, cd->msg_num);
+               goto err_cleanup_common;
+       }
+
+       if ((ret = rpma_conn_cfg_delete(&cfg))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+               /* non fatal error - continue */
+       }
+
+       ccd->flush = client_io_flush;
+       ccd->get_io_u_index = client_get_io_u_index;
+       ccd->client_data = cd;
+
+       return 0;
+
+err_cleanup_common:
+       librpma_fio_client_cleanup(td);
+
+err_cfg_delete:
+       (void) rpma_conn_cfg_delete(&cfg);
+
+err_free_cd:
+       free(cd);
+
+       return -1;
+}
+
+static int client_post_init(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct client_data *cd = ccd->client_data;
+       unsigned int io_us_msgs_size;
+       int ret;
+
+       /* message buffers initialization and registration */
+       io_us_msgs_size = cd->msg_num * IO_U_BUF_LEN;
+       if ((ret = posix_memalign((void **)&cd->io_us_msgs, page_size,
+                       io_us_msgs_size))) {
+               td_verror(td, ret, "posix_memalign");
+               return ret;
+       }
+       if ((ret = rpma_mr_reg(ccd->peer, cd->io_us_msgs, io_us_msgs_size,
+                       RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+                       &cd->msg_mr))) {
+               librpma_td_verror(td, ret, "rpma_mr_reg");
+               return ret;
+       }
+
+       return librpma_fio_client_post_init(td);
+}
+
+static void client_cleanup(struct thread_data *td)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct client_data *cd;
+       size_t flush_req_size;
+       size_t io_u_buf_off;
+       size_t send_offset;
+       void *send_ptr;
+       int ret;
+
+       if (ccd == NULL)
+               return;
+
+       cd = ccd->client_data;
+       if (cd == NULL) {
+               librpma_fio_client_cleanup(td);
+               return;
+       }
+
+       /*
+        * Make sure all SEND completions are collected so there are free
+        * slots in the SQ for the last SEND message.
+        *
+        * Note: even if an operation fails we can still send the termination
+        * notice.
+        */
+       (void) librpma_fio_client_io_complete_all_sends(td);
+
+       /* prepare the last flush message and pack it to the send buffer */
+       flush_req_size = gpspm_flush_request__get_packed_size(&Flush_req_last);
+       if (flush_req_size > MAX_MSG_SIZE) {
+               log_err(
+                       "Packed flush request size is bigger than available send buffer space (%zu > %d)\n",
+                       flush_req_size, MAX_MSG_SIZE);
+       } else {
+               io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+               send_offset = io_u_buf_off + SEND_OFFSET;
+               send_ptr = cd->io_us_msgs + send_offset;
+               (void) gpspm_flush_request__pack(&Flush_req_last, send_ptr);
+
+               /* send the flush message */
+               if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset,
+                               flush_req_size, RPMA_F_COMPLETION_ALWAYS,
+                               NULL)))
+                       librpma_td_verror(td, ret, "rpma_send");
+
+               ++ccd->op_send_posted;
+
+               /* Wait for the SEND to complete */
+               (void) librpma_fio_client_io_complete_all_sends(td);
+       }
+
+       /* deregister the messaging buffer memory */
+       if ((ret = rpma_mr_dereg(&cd->msg_mr)))
+               librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+       free(ccd->client_data);
+
+       librpma_fio_client_cleanup(td);
+}
+
+static inline int client_io_flush(struct thread_data *td,
+               struct io_u *first_io_u, struct io_u *last_io_u,
+               unsigned long long int len)
+{
+       struct librpma_fio_client_data *ccd = td->io_ops_data;
+       struct client_data *cd = ccd->client_data;
+       size_t io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+       size_t send_offset = io_u_buf_off + SEND_OFFSET;
+       size_t recv_offset = io_u_buf_off + RECV_OFFSET;
+       void *send_ptr = cd->io_us_msgs + send_offset;
+       void *recv_ptr = cd->io_us_msgs + recv_offset;
+       GPSPMFlushRequest flush_req = GPSPM_FLUSH_REQUEST__INIT;
+       size_t flush_req_size = 0;
+       int ret;
+
+       /* prepare a response buffer */
+       if ((ret = rpma_recv(ccd->conn, cd->msg_mr, recv_offset, MAX_MSG_SIZE,
+                       recv_ptr))) {
+               librpma_td_verror(td, ret, "rpma_recv");
+               return -1;
+       }
+
+       /* prepare a flush message and pack it to a send buffer */
+       flush_req.offset = first_io_u->offset;
+       flush_req.length = len;
+       flush_req.op_context = last_io_u->index;
+       flush_req_size = gpspm_flush_request__get_packed_size(&flush_req);
+       if (flush_req_size > MAX_MSG_SIZE) {
+               log_err(
+                       "Packed flush request size is bigger than available send buffer space (%"
+                       PRIu64 " > %d\n", flush_req_size, MAX_MSG_SIZE);
+               return -1;
+       }
+       (void) gpspm_flush_request__pack(&flush_req, send_ptr);
+
+       /* send the flush message */
+       if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset, flush_req_size,
+                       RPMA_F_COMPLETION_ALWAYS, NULL))) {
+               librpma_td_verror(td, ret, "rpma_send");
+               return -1;
+       }
+
+       ++ccd->op_send_posted;
+
+       return 0;
+}
+
+static int client_get_io_u_index(struct rpma_completion *cmpl,
+               unsigned int *io_u_index)
+{
+       GPSPMFlushResponse *flush_resp;
+
+       if (cmpl->op != RPMA_OP_RECV)
+               return 0;
+
+       /* unpack a response from the received buffer */
+       flush_resp = gpspm_flush_response__unpack(NULL,
+                       cmpl->byte_len, cmpl->op_context);
+       if (flush_resp == NULL) {
+               log_err("Cannot unpack the flush response buffer\n");
+               return -1;
+       }
+
+       memcpy(io_u_index, &flush_resp->op_context, sizeof(*io_u_index));
+
+       gpspm_flush_response__free_unpacked(flush_resp, NULL);
+
+       return 1;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_client = {
+       .name                   = "librpma_gpspm_client",
+       .version                = FIO_IOOPS_VERSION,
+       .init                   = client_init,
+       .post_init              = client_post_init,
+       .get_file_size          = librpma_fio_client_get_file_size,
+       .open_file              = librpma_fio_file_nop,
+       .queue                  = librpma_fio_client_queue,
+       .commit                 = librpma_fio_client_commit,
+       .getevents              = librpma_fio_client_getevents,
+       .event                  = librpma_fio_client_event,
+       .errdetails             = librpma_fio_client_errdetails,
+       .close_file             = librpma_fio_file_nop,
+       .cleanup                = client_cleanup,
+       .flags                  = FIO_DISKLESSIO,
+       .options                = librpma_fio_options,
+       .option_struct_size     = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+#define IO_U_BUFF_OFF_SERVER(i) ((i) * IO_U_BUF_LEN)
+
+struct server_data {
+       /* aligned td->orig_buffer */
+       char *orig_buffer_aligned;
+
+       /* resources for messaging buffer from DRAM allocated by fio */
+       struct rpma_mr_local *msg_mr;
+
+       uint32_t msg_sqe_available; /* # of free SQ slots */
+
+       /* in-memory queues */
+       struct rpma_completion *msgs_queued;
+       uint32_t msg_queued_nr;
+};
+
+static int server_init(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd;
+       struct server_data *sd;
+       int ret = -1;
+
+       if ((ret = librpma_fio_server_init(td)))
+               return ret;
+
+       csd = td->io_ops_data;
+
+       /* allocate server's data */
+       sd = calloc(1, sizeof(*sd));
+       if (sd == NULL) {
+               td_verror(td, errno, "calloc");
+               goto err_server_cleanup;
+       }
+
+       /* allocate in-memory queue */
+       sd->msgs_queued = calloc(td->o.iodepth, sizeof(*sd->msgs_queued));
+       if (sd->msgs_queued == NULL) {
+               td_verror(td, errno, "calloc");
+               goto err_free_sd;
+       }
+
+       /*
+        * Ensure a single io_u buffer can store both SEND and RECV messages,
+        * and that the io_us buffer allocation is page-size-aligned, which
+        * RDMA registration requires. User-provided values are overridden.
+        */
+       td->o.max_bs[DDIR_READ] = IO_U_BUF_LEN;
+       td->o.mem_align = page_size;
+
+       csd->server_data = sd;
+
+       return 0;
+
+err_free_sd:
+       free(sd);
+
+err_server_cleanup:
+       librpma_fio_server_cleanup(td);
+
+       return -1;
+}
+
+static int server_post_init(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd = csd->server_data;
+       size_t io_us_size;
+       size_t io_u_buflen;
+       int ret;
+
+       /*
+        * td->orig_buffer is not aligned. The engine requires aligned io_us
+        * so fio aligns the address up using the formula below.
+        */
+       sd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+                       td->o.mem_align;
+
+       /*
+        * XXX
+        * Each io_u message buffer contains recv and send messages.
+        * Aligning each of those buffers may give some performance benefit.
+        */
+       io_u_buflen = td_max_bs(td);
+
+       /* check whether io_u buffer is big enough */
+       if (io_u_buflen < IO_U_BUF_LEN) {
+               log_err(
+                       "blocksize too small to accommodate assumed maximal request/response pair size (%" PRIu64 " < %d)\n",
+                       io_u_buflen, IO_U_BUF_LEN);
+               return -1;
+       }
+
+       /*
+        * td->orig_buffer_size beside the space really consumed by io_us
+        * has paddings which can be omitted for the memory registration.
+        */
+       io_us_size = (unsigned long long)io_u_buflen *
+                       (unsigned long long)td->o.iodepth;
+
+       if ((ret = rpma_mr_reg(csd->peer, sd->orig_buffer_aligned, io_us_size,
+                       RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+                       &sd->msg_mr))) {
+               librpma_td_verror(td, ret, "rpma_mr_reg");
+               return -1;
+       }
+
+       return 0;
+}
+
+static void server_cleanup(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd;
+       int ret;
+
+       if (csd == NULL)
+               return;
+
+       sd = csd->server_data;
+
+       if (sd != NULL) {
+               /* rpma_mr_dereg(messaging buffer from DRAM) */
+               if ((ret = rpma_mr_dereg(&sd->msg_mr)))
+                       librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+               free(sd->msgs_queued);
+               free(sd);
+       }
+
+       librpma_fio_server_cleanup(td);
+}
+
+static int prepare_connection(struct thread_data *td,
+               struct rpma_conn_req *conn_req)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd = csd->server_data;
+       int ret;
+       int i;
+
+       /* prepare buffers for flush requests */
+       sd->msg_sqe_available = td->o.iodepth;
+       for (i = 0; i < td->o.iodepth; i++) {
+               size_t offset_recv_msg = IO_U_BUFF_OFF_SERVER(i) + RECV_OFFSET;
+               if ((ret = rpma_conn_req_recv(conn_req, sd->msg_mr,
+                               offset_recv_msg, MAX_MSG_SIZE,
+                               (const void *)(uintptr_t)i))) {
+                       librpma_td_verror(td, ret, "rpma_conn_req_recv");
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct rpma_conn_cfg *cfg = NULL;
+       uint16_t max_msg_num = td->o.iodepth;
+       int ret;
+
+       csd->prepare_connection = prepare_connection;
+
+       /* create a connection configuration object */
+       if ((ret = rpma_conn_cfg_new(&cfg))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+               return -1;
+       }
+
+       /*
+        * Calculate the required queue sizes where:
+        * - the send queue (SQ) has to be big enough to accommodate
+        *   all possible flush requests (SENDs)
+        * - the receive queue (RQ) has to be big enough to accommodate
+        *   all flush responses (RECVs)
+        * - the completion queue (CQ) has to be big enough to accommodate
+        *   all success and error completions (sq_size + rq_size)
+        */
+       if ((ret = rpma_conn_cfg_set_sq_size(cfg, max_msg_num))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+               goto err_cfg_delete;
+       }
+       if ((ret = rpma_conn_cfg_set_rq_size(cfg, max_msg_num))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+               goto err_cfg_delete;
+       }
+       if ((ret = rpma_conn_cfg_set_cq_size(cfg, max_msg_num * 2))) {
+               librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+               goto err_cfg_delete;
+       }
+
+       ret = librpma_fio_server_open_file(td, f, cfg);
+
+err_cfg_delete:
+       (void) rpma_conn_cfg_delete(&cfg);
+
+       return ret;
+}
+
+static int server_qe_process(struct thread_data *td,
+               struct rpma_completion *cmpl)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd = csd->server_data;
+       GPSPMFlushRequest *flush_req;
+       GPSPMFlushResponse flush_resp = GPSPM_FLUSH_RESPONSE__INIT;
+       size_t flush_resp_size = 0;
+       size_t send_buff_offset;
+       size_t recv_buff_offset;
+       size_t io_u_buff_offset;
+       void *send_buff_ptr;
+       void *recv_buff_ptr;
+       void *op_ptr;
+       int msg_index;
+       int ret;
+
+       /* calculate SEND/RECV pair parameters */
+       msg_index = (int)(uintptr_t)cmpl->op_context;
+       io_u_buff_offset = IO_U_BUFF_OFF_SERVER(msg_index);
+       send_buff_offset = io_u_buff_offset + SEND_OFFSET;
+       recv_buff_offset = io_u_buff_offset + RECV_OFFSET;
+       send_buff_ptr = sd->orig_buffer_aligned + send_buff_offset;
+       recv_buff_ptr = sd->orig_buffer_aligned + recv_buff_offset;
+
+       /* unpack a flush request from the received buffer */
+       flush_req = gpspm_flush_request__unpack(NULL, cmpl->byte_len,
+                       recv_buff_ptr);
+       if (flush_req == NULL) {
+               log_err("cannot unpack the flush request buffer\n");
+               goto err_terminate;
+       }
+
+       if (IS_NOT_THE_LAST_MESSAGE(flush_req)) {
+               op_ptr = csd->ws_ptr + flush_req->offset;
+               pmem_persist(op_ptr, flush_req->length);
+       } else {
+               /*
+                * This is the last message - the client is done.
+                */
+               gpspm_flush_request__free_unpacked(flush_req, NULL);
+               td->done = true;
+               return 0;
+       }
+
+       /* initiate the next receive operation */
+       if ((ret = rpma_recv(csd->conn, sd->msg_mr, recv_buff_offset,
+                       MAX_MSG_SIZE,
+                       (const void *)(uintptr_t)msg_index))) {
+               librpma_td_verror(td, ret, "rpma_recv");
+               goto err_free_unpacked;
+       }
+
+       /* prepare a flush response and pack it to a send buffer */
+       flush_resp.op_context = flush_req->op_context;
+       flush_resp_size = gpspm_flush_response__get_packed_size(&flush_resp);
+       if (flush_resp_size > MAX_MSG_SIZE) {
+               log_err(
+                       "Size of the packed flush response is bigger than the available space of the send buffer (%"
+                       PRIu64 " > %i\n", flush_resp_size, MAX_MSG_SIZE);
+               goto err_free_unpacked;
+       }
+
+       (void) gpspm_flush_response__pack(&flush_resp, send_buff_ptr);
+
+       /* send the flush response */
+       if ((ret = rpma_send(csd->conn, sd->msg_mr, send_buff_offset,
+                       flush_resp_size, RPMA_F_COMPLETION_ALWAYS, NULL))) {
+               librpma_td_verror(td, ret, "rpma_send");
+               goto err_free_unpacked;
+       }
+       --sd->msg_sqe_available;
+
+       gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+       return 0;
+
+err_free_unpacked:
+       gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+err_terminate:
+       td->terminate = true;
+
+       return -1;
+}
+
+static inline int server_queue_process(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd = csd->server_data;
+       int ret;
+       int i;
+
+       /* min(# of queue entries, # of SQ entries available) */
+       uint32_t qes_to_process = min(sd->msg_queued_nr, sd->msg_sqe_available);
+       if (qes_to_process == 0)
+               return 0;
+
+       /* process queued completions */
+       for (i = 0; i < qes_to_process; ++i) {
+               if ((ret = server_qe_process(td, &sd->msgs_queued[i])))
+                       return ret;
+       }
+
+       /* progress the queue */
+       for (i = 0; i < sd->msg_queued_nr - qes_to_process; ++i) {
+               memcpy(&sd->msgs_queued[i],
+                       &sd->msgs_queued[qes_to_process + i],
+                       sizeof(sd->msgs_queued[i]));
+       }
+
+       sd->msg_queued_nr -= qes_to_process;
+
+       return 0;
+}
+
+static int server_cmpl_process(struct thread_data *td)
+{
+       struct librpma_fio_server_data *csd = td->io_ops_data;
+       struct server_data *sd = csd->server_data;
+       struct rpma_completion *cmpl = &sd->msgs_queued[sd->msg_queued_nr];
+       int ret;
+
+       ret = rpma_conn_completion_get(csd->conn, cmpl);
+       if (ret == RPMA_E_NO_COMPLETION) {
+               /* lack of completion is not an error */
+               return 0;
+       } else if (ret != 0) {
+               librpma_td_verror(td, ret, "rpma_conn_completion_get");
+               goto err_terminate;
+       }
+
+       /* validate the completion */
+       if (cmpl->op_status != IBV_WC_SUCCESS)
+               goto err_terminate;
+
+       if (cmpl->op == RPMA_OP_RECV)
+               ++sd->msg_queued_nr;
+       else if (cmpl->op == RPMA_OP_SEND)
+               ++sd->msg_sqe_available;
+
+       return 0;
+
+err_terminate:
+       td->terminate = true;
+
+       return -1;
+}
+
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+       do {
+               if (server_cmpl_process(td))
+                       return FIO_Q_BUSY;
+
+               if (server_queue_process(td))
+                       return FIO_Q_BUSY;
+
+       } while (!td->done);
+
+       return FIO_Q_COMPLETED;
+}
+
+FIO_STATIC struct ioengine_ops ioengine_server = {
+       .name                   = "librpma_gpspm_server",
+       .version                = FIO_IOOPS_VERSION,
+       .init                   = server_init,
+       .post_init              = server_post_init,
+       .open_file              = server_open_file,
+       .close_file             = librpma_fio_server_close_file,
+       .queue                  = server_queue,
+       .invalidate             = librpma_fio_file_nop,
+       .cleanup                = server_cleanup,
+       .flags                  = FIO_SYNCIO,
+       .options                = librpma_fio_options,
+       .option_struct_size     = sizeof(struct librpma_fio_options_values),
+};
+
+/* register both engines */
+
+static void fio_init fio_librpma_gpspm_register(void)
+{
+       register_ioengine(&ioengine_client);
+       register_ioengine(&ioengine_server);
+}
+
+static void fio_exit fio_librpma_gpspm_unregister(void)
+{
+       unregister_ioengine(&ioengine_client);
+       unregister_ioengine(&ioengine_server);
+}
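
For orientation, the client-side flush exchange implemented above reduces to
the following round trip (a simplified sketch reusing names from the engine
code; buffer offset management, completion draining and error handling are
omitted):

	/* post a RECV for the response before sending the request */
	rpma_recv(ccd->conn, cd->msg_mr, recv_offset, MAX_MSG_SIZE, recv_ptr);

	/* pack a GPSPM flush request describing the region to persist */
	GPSPMFlushRequest req = GPSPM_FLUSH_REQUEST__INIT;
	req.offset = first_io_u->offset;
	req.length = len;
	req.op_context = last_io_u->index;	/* echoed back by the server */
	gpspm_flush_request__pack(&req, send_ptr);

	/* send it; the server pmem_persist()s the region and replies with
	 * a GPSPMFlushResponse carrying the same op_context */
	rpma_send(ccd->conn, cd->msg_mr, send_offset,
			gpspm_flush_request__get_packed_size(&req),
			RPMA_F_COMPLETION_ALWAYS, NULL);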
diff --git a/engines/librpma_gpspm_flush.pb-c.c b/engines/librpma_gpspm_flush.pb-c.c
new file mode 100644 (file)
index 0000000..3ff2475
--- /dev/null
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+/* Do not generate deprecated warnings for self */
+#ifndef PROTOBUF_C__NO_DEPRECATED
+#define PROTOBUF_C__NO_DEPRECATED
+#endif
+
+#include "librpma_gpspm_flush.pb-c.h"
+void   gpspm_flush_request__init
+                     (GPSPMFlushRequest         *message)
+{
+  static const GPSPMFlushRequest init_value = GPSPM_FLUSH_REQUEST__INIT;
+  *message = init_value;
+}
+size_t gpspm_flush_request__get_packed_size
+                     (const GPSPMFlushRequest *message)
+{
+  assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_request__pack
+                     (const GPSPMFlushRequest *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_request__pack_to_buffer
+                     (const GPSPMFlushRequest *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushRequest *
+       gpspm_flush_request__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (GPSPMFlushRequest *)
+     protobuf_c_message_unpack (&gpspm_flush_request__descriptor,
+                                allocator, len, data);
+}
+void   gpspm_flush_request__free_unpacked
+                     (GPSPMFlushRequest *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void   gpspm_flush_response__init
+                     (GPSPMFlushResponse         *message)
+{
+  static const GPSPMFlushResponse init_value = GPSPM_FLUSH_RESPONSE__INIT;
+  *message = init_value;
+}
+size_t gpspm_flush_response__get_packed_size
+                     (const GPSPMFlushResponse *message)
+{
+  assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+  return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_response__pack
+                     (const GPSPMFlushResponse *message,
+                      uint8_t       *out)
+{
+  assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+  return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_response__pack_to_buffer
+                     (const GPSPMFlushResponse *message,
+                      ProtobufCBuffer *buffer)
+{
+  assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+  return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushResponse *
+       gpspm_flush_response__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data)
+{
+  return (GPSPMFlushResponse *)
+     protobuf_c_message_unpack (&gpspm_flush_response__descriptor,
+                                allocator, len, data);
+}
+void   gpspm_flush_response__free_unpacked
+                     (GPSPMFlushResponse *message,
+                      ProtobufCAllocator *allocator)
+{
+  if(!message)
+    return;
+  assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+  protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+static const ProtobufCFieldDescriptor gpspm_flush_request__field_descriptors[3] =
+{
+  {
+    "offset",
+    1,
+    PROTOBUF_C_LABEL_REQUIRED,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(GPSPMFlushRequest, offset),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "length",
+    2,
+    PROTOBUF_C_LABEL_REQUIRED,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(GPSPMFlushRequest, length),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+  {
+    "op_context",
+    3,
+    PROTOBUF_C_LABEL_REQUIRED,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(GPSPMFlushRequest, op_context),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned gpspm_flush_request__field_indices_by_name[] = {
+  1,   /* field[1] = length */
+  0,   /* field[0] = offset */
+  2,   /* field[2] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_request__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 3 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_request__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "GPSPM_flush_request",
+  "GPSPMFlushRequest",
+  "GPSPMFlushRequest",
+  "",
+  sizeof(GPSPMFlushRequest),
+  3,
+  gpspm_flush_request__field_descriptors,
+  gpspm_flush_request__field_indices_by_name,
+  1,  gpspm_flush_request__number_ranges,
+  (ProtobufCMessageInit) gpspm_flush_request__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
+static const ProtobufCFieldDescriptor gpspm_flush_response__field_descriptors[1] =
+{
+  {
+    "op_context",
+    1,
+    PROTOBUF_C_LABEL_REQUIRED,
+    PROTOBUF_C_TYPE_FIXED64,
+    0,   /* quantifier_offset */
+    offsetof(GPSPMFlushResponse, op_context),
+    NULL,
+    NULL,
+    0,             /* flags */
+    0,NULL,NULL    /* reserved1,reserved2, etc */
+  },
+};
+static const unsigned gpspm_flush_response__field_indices_by_name[] = {
+  0,   /* field[0] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_response__number_ranges[1 + 1] =
+{
+  { 1, 0 },
+  { 0, 1 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_response__descriptor =
+{
+  PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+  "GPSPM_flush_response",
+  "GPSPMFlushResponse",
+  "GPSPMFlushResponse",
+  "",
+  sizeof(GPSPMFlushResponse),
+  1,
+  gpspm_flush_response__field_descriptors,
+  gpspm_flush_response__field_indices_by_name,
+  1,  gpspm_flush_response__number_ranges,
+  (ProtobufCMessageInit) gpspm_flush_response__init,
+  NULL,NULL,NULL    /* reserved[123] */
+};
diff --git a/engines/librpma_gpspm_flush.pb-c.h b/engines/librpma_gpspm_flush.pb-c.h
new file mode 100644 (file)
index 0000000..ad475a9
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+#ifndef PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+#define PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+
+#include <protobuf-c/protobuf-c.h>
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if PROTOBUF_C_VERSION_NUMBER < 1000000
+# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
+#elif 1003003 < PROTOBUF_C_MIN_COMPILER_VERSION
+# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
+#endif
+
+
+typedef struct _GPSPMFlushRequest GPSPMFlushRequest;
+typedef struct _GPSPMFlushResponse GPSPMFlushResponse;
+
+
+/* --- enums --- */
+
+
+/* --- messages --- */
+
+struct  _GPSPMFlushRequest
+{
+  ProtobufCMessage base;
+  uint64_t offset;
+  uint64_t length;
+  uint64_t op_context;
+};
+#define GPSPM_FLUSH_REQUEST__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_request__descriptor) \
+    , 0, 0, 0 }
+
+
+struct  _GPSPMFlushResponse
+{
+  ProtobufCMessage base;
+  uint64_t op_context;
+};
+#define GPSPM_FLUSH_RESPONSE__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_response__descriptor) \
+    , 0 }
+
+
+/* GPSPMFlushRequest methods */
+void   gpspm_flush_request__init
+                     (GPSPMFlushRequest         *message);
+size_t gpspm_flush_request__get_packed_size
+                     (const GPSPMFlushRequest   *message);
+size_t gpspm_flush_request__pack
+                     (const GPSPMFlushRequest   *message,
+                      uint8_t             *out);
+size_t gpspm_flush_request__pack_to_buffer
+                     (const GPSPMFlushRequest   *message,
+                      ProtobufCBuffer     *buffer);
+GPSPMFlushRequest *
+       gpspm_flush_request__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   gpspm_flush_request__free_unpacked
+                     (GPSPMFlushRequest *message,
+                      ProtobufCAllocator *allocator);
+/* GPSPMFlushResponse methods */
+void   gpspm_flush_response__init
+                     (GPSPMFlushResponse         *message);
+size_t gpspm_flush_response__get_packed_size
+                     (const GPSPMFlushResponse   *message);
+size_t gpspm_flush_response__pack
+                     (const GPSPMFlushResponse   *message,
+                      uint8_t             *out);
+size_t gpspm_flush_response__pack_to_buffer
+                     (const GPSPMFlushResponse   *message,
+                      ProtobufCBuffer     *buffer);
+GPSPMFlushResponse *
+       gpspm_flush_response__unpack
+                     (ProtobufCAllocator  *allocator,
+                      size_t               len,
+                      const uint8_t       *data);
+void   gpspm_flush_response__free_unpacked
+                     (GPSPMFlushResponse *message,
+                      ProtobufCAllocator *allocator);
+/* --- per-message closures --- */
+
+typedef void (*GPSPMFlushRequest_Closure)
+                 (const GPSPMFlushRequest *message,
+                  void *closure_data);
+typedef void (*GPSPMFlushResponse_Closure)
+                 (const GPSPMFlushResponse *message,
+                  void *closure_data);
+
+/* --- services --- */
+
+
+/* --- descriptors --- */
+
+extern const ProtobufCMessageDescriptor gpspm_flush_request__descriptor;
+extern const ProtobufCMessageDescriptor gpspm_flush_response__descriptor;
+
+PROTOBUF_C__END_DECLS
+
+
+#endif  /* PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED */
diff --git a/engines/librpma_gpspm_flush.proto b/engines/librpma_gpspm_flush.proto
new file mode 100644 (file)
index 0000000..91765a7
--- /dev/null
@@ -0,0 +1,15 @@
+syntax = "proto2";
+
+message GPSPM_flush_request {
+    /* an offset of a region to be flushed within its memory registration */
+    required fixed64 offset = 1;
+    /* a length of a region to be flushed */
+    required fixed64 length = 2;
+    /* a user-defined operation context */
+    required fixed64 op_context = 3;
+}
+
+message GPSPM_flush_response {
+    /* the operation context of a completed request */
+    required fixed64 op_context = 1;
+}
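
The librpma_gpspm_flush.pb-c.[ch] sources above are generated from this
definition and checked in; assuming protobuf-c and its protoc-c compiler are
installed, they can be regenerated with:

	cd engines
	protoc-c --c_out=. librpma_gpspm_flush.proto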
index 42ee48ff02b3f6371027ab4cbcbc304aefbbea10..23e62c4c45e3c5bf15a9d646636d1cfe3cf49b0c 100644 (file)
@@ -38,6 +38,7 @@ struct rados_options {
        char *pool_name;
        char *client_name;
        int busy_poll;
+       int touch_objects;
 };
 
 static struct fio_option options[] = {
@@ -78,6 +79,16 @@ static struct fio_option options[] = {
                .category = FIO_OPT_C_ENGINE,
                .group    = FIO_OPT_G_RBD,
        },
+       {
+               .name     = "touch_objects",
+               .lname    = "touch objects on start",
+               .type     = FIO_OPT_BOOL,
+               .help     = "Touch (create) objects on start",
+               .off1     = offsetof(struct rados_options, touch_objects),
+               .def      = "1",
+               .category = FIO_OPT_C_ENGINE,
+               .group    = FIO_OPT_G_RBD,
+       },
        {
                .name     = NULL,
        },
@@ -194,9 +205,11 @@ static int _fio_rados_connect(struct thread_data *td)
        for (i = 0; i < td->o.nr_files; i++) {
                f = td->files[i];
                f->real_file_size = file_size;
-               r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
-               if (r < 0) {
-                       goto failed_obj_create;
+               if (o->touch_objects) {
+                       r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
+                       if (r < 0) {
+                               goto failed_obj_create;
+                       }
                }
        }
        return 0;
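
With touch_objects=0 the object pre-creation pass can now be skipped, e.g.
(a sketch; the pool and client names are placeholders, and additional cluster
options may be required on a given setup):

	fio --name=radostest --ioengine=rados --clientname=admin \
		--pool=rbd --touch_objects=0 --rw=randwrite --bs=4k \
		--size=64m --nrfiles=16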
diff --git a/eta.c b/eta.c
index 978430120b850f8b6712a98aeae90e0101040238..db13cb18103226028ca324cf5acdfb2d03fe4507 100644 (file)
--- a/eta.c
+++ b/eta.c
@@ -331,7 +331,7 @@ static void calc_rate(int unified_rw_rep, unsigned long mtime,
                else
                        this_rate = 0;
 
-               if (unified_rw_rep) {
+               if (unified_rw_rep == UNIFIED_MIXED) {
                        rate[i] = 0;
                        rate[0] += this_rate;
                } else
@@ -356,7 +356,7 @@ static void calc_iops(int unified_rw_rep, unsigned long mtime,
                else
                        this_iops = 0;
 
-               if (unified_rw_rep) {
+               if (unified_rw_rep == UNIFIED_MIXED) {
                        iops[i] = 0;
                        iops[0] += this_iops;
                } else
diff --git a/examples/dfs.fio b/examples/dfs.fio
new file mode 100644 (file)
index 0000000..5de887d
--- /dev/null
@@ -0,0 +1,33 @@
+[global]
+ioengine=dfs
+pool=${POOL}
+cont=${CONT}
+filename_format=fio-test.$jobnum
+
+cpus_allowed_policy=split
+group_reporting=1
+time_based=0
+percentile_list=99.0:99.9:99.99:99.999:99.9999:100
+disable_slat=1
+disable_clat=1
+
+bs=1M
+size=100G
+iodepth=16
+numjobs=16
+
+[daos-seqwrite]
+rw=write
+stonewall
+
+[daos-seqread]
+rw=read
+stonewall
+
+[daos-randwrite]
+rw=randwrite
+stonewall
+
+[daos-randread]
+rw=randread
+stonewall
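
The job expands ${POOL} and ${CONT} from the environment, so a run looks
like this (the UUIDs are placeholders):

	POOL=<pool-uuid> CONT=<cont-uuid> fio examples/dfs.fio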
diff --git a/examples/filedelete-ioengine.fio b/examples/filedelete-ioengine.fio
new file mode 100644 (file)
index 0000000..3c0028f
--- /dev/null
@@ -0,0 +1,18 @@
+# Example filedelete job
+
+# The 'filedelete' engine only does 'unlink(filename)'; files are never open()ed.
+# 'filesize' must be set so that files are created at the setup stage.
+# 'unlink' should be left at 0, since the files are deleted during the measured run.
+# Options that disable completion latency output, such as 'disable_clat' and 'gtod_reduce', must not be set.
+[global]
+ioengine=filedelete
+filesize=4k
+nrfiles=200
+unlink=0
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
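
Each [tN] section inherits the global options, so this job runs six jobs
that each create 200 small files at setup and then unlink them during the
measured phase; it can be run as-is:

	fio examples/filedelete-ioengine.fio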
diff --git a/examples/librpma_apm-client.fio b/examples/librpma_apm-client.fio
new file mode 100644 (file)
index 0000000..82a5d20
--- /dev/null
@@ -0,0 +1,24 @@
+# Example of the librpma_apm_client job
+
+[global]
+ioengine=librpma_apm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] # IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # read/write/randread/randwrite/readwrite/rw
+rwmixread=70 # % of a mixed workload that should be reads
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
diff --git a/examples/librpma_apm-server.fio b/examples/librpma_apm-server.fio
new file mode 100644 (file)
index 0000000..062b521
--- /dev/null
@@ -0,0 +1,26 @@
+# Example of the librpma_apm_server job
+
+[global]
+ioengine=librpma_apm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] # IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each working thread), waits for the connection to end,
+# and then closes itself.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+
+numjobs=1 # number of expected incoming connections
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
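
Both librpma example jobs ship with a [serverip] placeholder; a minimal way
to run an APM pair is to substitute a real address first (192.168.0.1 below
is an assumption):

	# on the persistent-memory target
	sed -i 's/\[serverip\]/192.168.0.1/' examples/librpma_apm-server.fio
	fio examples/librpma_apm-server.fio

	# on the initiator, pointing at the same address
	sed -i 's/\[serverip\]/192.168.0.1/' examples/librpma_apm-client.fio
	fio examples/librpma_apm-client.fio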
diff --git a/examples/librpma_gpspm-client.fio b/examples/librpma_gpspm-client.fio
new file mode 100644 (file)
index 0000000..843382d
--- /dev/null
@@ -0,0 +1,23 @@
+# Example of the librpma_gpspm_client job
+
+[global]
+ioengine=librpma_gpspm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] # IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # write/randwrite
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
diff --git a/examples/librpma_gpspm-server.fio b/examples/librpma_gpspm-server.fio
new file mode 100644 (file)
index 0000000..d618f2d
--- /dev/null
@@ -0,0 +1,31 @@
+# Example of the librpma_gpspm_server job
+
+[global]
+ioengine=librpma_gpspm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] # IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each working thread), accepts and executes flush
+# requests, and sends back a flush response for each request.
+# When the client is done, it sends a termination notice to the server's thread.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+numjobs=1 # number of expected incoming connections
+iodepth=2 # number of parallel GPSPM requests
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
+
+# The client terminates the server when it has finished its job.
+time_based
+runtime=365d
index 661d4c2fa083e2c4b948b628a344c6f294b28ebe..e664f8b42f795f4d03675e437870382a0243b13b 100644 (file)
@@ -1118,6 +1118,13 @@ int setup_files(struct thread_data *td)
        if (o->read_iolog_file)
                goto done;
 
+       if (td->o.zone_mode == ZONE_MODE_ZBD) {
+               err = zbd_init_files(td);
+               if (err)
+                       goto err_out;
+       }
+       zbd_recalc_options_with_zone_granularity(td);
+
        /*
         * check sizes. if the files/devices do not exist and the size
         * isn't passed to fio, abort.
@@ -1395,16 +1402,17 @@ int setup_files(struct thread_data *td)
        }
 
 done:
-       if (o->create_only)
-               td->done = 1;
-
-       td_restore_runstate(td, old_state);
-
        if (td->o.zone_mode == ZONE_MODE_ZBD) {
                err = zbd_setup_files(td);
                if (err)
                        goto err_out;
        }
+
+       if (o->create_only)
+               td->done = 1;
+
+       td_restore_runstate(td, old_state);
+
        return 0;
 
 err_offset:
diff --git a/fio.1 b/fio.1
index accc6a329a12fa98b65cd7fcd443b06faaef8fcf..18dc156ad027471b8d11083158c21688c610d93d 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -348,6 +348,9 @@ us or usec means microseconds
 .PD
 .RE
 .P
+`z' suffix specifies that the value is measured in zones.
+The value is recalculated once the block device's zone size becomes known.
+.P
 If the option accepts an upper and lower range, use a colon ':' or
 minus '\-' to separate such values. See \fIirange\fR parameter type.
 If the lower value specified happens to be larger than the upper value
@@ -687,7 +690,8 @@ of how that would work.
 .TP
 .BI ioscheduler \fR=\fPstr
 Attempt to switch the device hosting the file to the specified I/O scheduler
-before running.
+before running. If the file is a pipe or a character device, or if the
+device hosting the file cannot be determined, this option is ignored.
 .TP
 .BI create_serialize \fR=\fPbool
 If true, serialize the file creation for the jobs. This may be handy to
@@ -783,7 +787,7 @@ If not specified it defaults to the zone size. If the target device is a zoned
 block device, the zone capacity is obtained from the device information and this
 option is ignored.
 .TP
-.BI zoneskip \fR=\fPint
+.BI zoneskip \fR=\fPint[z]
 For \fBzonemode\fR=strided, the number of bytes to skip after \fBzonesize\fR
 bytes of data have been transferred.
 
@@ -921,10 +925,32 @@ behaves in a similar fashion, except it sends the same offset 8 number of
 times before generating a new offset.
 .RE
 .TP
-.BI unified_rw_reporting \fR=\fPbool
+.BI unified_rw_reporting \fR=\fPstr
 Fio normally reports statistics on a per data direction basis, meaning that
-reads, writes, and trims are accounted and reported separately. If this
-option is set fio sums the results and report them as "mixed" instead.
+reads, writes, and trims are accounted and reported separately. This option
+determines whether fio reports the results normally, summed together, or as
+both options.
+Accepted values are:
+.RS
+.TP
+.B none
+Normal statistics reporting.
+.TP
+.B mixed
+Statistics are summed per data direction and reported together.
+.TP
+.B both
+Statistics are reported normally, followed by the mixed statistics.
+.TP
+.B 0
+Backward-compatible alias for \fBnone\fR.
+.TP
+.B 1
+Backward-compatible alias for \fBmixed\fR.
+.TP
+.B 2
+Alias for \fBboth\fR.
+.RE
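+.P
+For example, \fBunified_rw_reporting=both\fR prints the usual per-direction
+statistics first, followed by an additional "mixed" summary that sums reads,
+writes, and trims together.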
 .TP
 .BI randrepeat \fR=\fPbool
 Seed the random number generator used for random I/O patterns in a
@@ -1033,7 +1059,7 @@ The values are all relative to each other, and no absolute meaning
 should be associated with them.
 .RE
 .TP
-.BI offset \fR=\fPint
+.BI offset \fR=\fPint[%|z]
 Start I/O at the provided offset in the file, given as either a fixed size in
 bytes or a percentage. If a percentage is given, the generated offset will be
 aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if
@@ -1048,7 +1074,7 @@ If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR
 is aligned upwards to this value. Defaults to 0 meaning that a percentage
 offset is aligned to the minimum block size.
 .TP
-.BI offset_increment \fR=\fPint
+.BI offset_increment \fR=\fPint[%|z]
 If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
 * thread_number', where the thread number is a counter that starts at 0 and
 is incremented for each sub-job (i.e. when \fBnumjobs\fR option is
@@ -1570,7 +1596,7 @@ Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
 simulate a smaller amount of memory. The amount specified is per worker.
 .SS "I/O size"
 .TP
-.BI size \fR=\fPint
+.BI size \fR=\fPint[%|z]
 The total size of file I/O for each thread of this job. Fio will run until
 this many bytes has been transferred, unless runtime is limited by other options
 (such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
@@ -1585,7 +1611,7 @@ given, fio will use 20% of the full size of the given files or devices.
 Can be combined with \fBoffset\fR to constrain the start and end range
 that I/O will be done within.
 .TP
-.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+.BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z]
 Normally fio operates within the region set by \fBsize\fR, which means
 that the \fBsize\fR option sets both the region and size of I/O to be
 performed. Sometimes that is not what you want. With this option, it is
@@ -1822,6 +1848,11 @@ Simply do stat() and do no I/O to the file. You need to set 'filesize'
 and 'nrfiles', so that files will be created.
 This engine is to measure file lookup and meta data access.
 .TP
+.B filedelete
+Simply delete the files by unlink() and do no I/O to them. You need to set
+'filesize' and 'nrfiles' so that the files will be created.
+This engine is used to measure file deletion.
+.TP
 .B libpmem
 Read and write using mmap I/O to a file on a filesystem
 mounted with DAX on a persistent memory device through the PMDK
@@ -1853,6 +1884,10 @@ GPUDirect Storage-supported filesystem. This engine performs
 I/O without transferring buffers between user-space and the kernel,
 unless \fBverify\fR is set or \fBcuda_io\fR is \fBposix\fR. \fBiomem\fR must
 not be \fBcudamalloc\fR. This ioengine defines engine specific options.
+.TP
+.B dfs
+I/O engine supporting asynchronous read and write operations to the DAOS File
+System (DFS) via libdfs.
 .SS "I/O engine specific parameters"
 In addition, there are some parameters which are only valid when a specific
 \fBioengine\fR is in use. These are used identically to normal parameters,
@@ -1949,7 +1984,7 @@ The TCP or UDP port to bind to or connect to. If this is used with
 this will be the starting port number since fio will use a range of
 ports.
 .TP
-.BI (rdma)port
+.BI (rdma, librpma_*)port
 The port to use for RDMA-CM communication. This should be the same
 value on the client and the server side.
 .TP
@@ -1958,6 +1993,12 @@ The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O.
 If the job is a TCP listener or UDP reader, the hostname is not used
 and must be omitted unless it is a valid UDP multicast address.
 .TP
+.BI (librpma_*)serverip \fR=\fPstr
+The IP address to be used for RDMA-CM based I/O.
+.TP
+.BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool
+Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0.
+.TP
 .BI (netsplice,net)interface \fR=\fPstr
 The IP address of the network interface used to send or receive UDP
 multicast.
@@ -2052,6 +2093,11 @@ by default.
 Poll store instead of waiting for completion. Usually this provides better
 throughput at the cost of higher (up to 100%) CPU utilization.
 .TP
+.BI (rados)touch_objects \fR=\fPbool
+During initialization, touch (create if they do not exist) all objects (files).
+Touching all objects affects Ceph caches and likely impacts test results.
+Enabled by default.
+.TP
 .BI (http)http_host \fR=\fPstr
 Hostname to connect to. For S3, this could be the bucket name. Default
 is \fBlocalhost\fR
@@ -2206,6 +2252,20 @@ from RAM to GPU after a read. \fBverify\fR does not affect
 the use of cudaMemcpy.
 .RE
 .RE
+.TP
+.BI (dfs)pool
+Specify the UUID of the DAOS pool to connect to.
+.TP
+.BI (dfs)cont
+Specify the UUID of the DAOS container to open.
+.TP
+.BI (dfs)chunk_size
+Specify a different chunk size (in bytes) for the dfs file.
+The DAOS container's chunk size is used by default.
+.TP
+.BI (dfs)object_class
+Specify a different object class for the dfs file.
+The DAOS container's object class is used by default.
 .SS "I/O depth"
 .TP
 .BI iodepth \fR=\fPint
@@ -2400,10 +2460,11 @@ Used with \fBlatency_target\fR. If false (default), fio will find the highest
 queue depth that meets \fBlatency_target\fR and exit. If true, fio will continue
 running and try to meet \fBlatency_target\fR by adjusting queue depth.
 .TP
-.BI max_latency \fR=\fPtime
+.BI max_latency \fR=\fPtime[,time][,time]
 If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
 maximum latency. When the unit is omitted, the value is interpreted in
-microseconds.
+microseconds. Comma-separated values may be specified for reads, writes,
+and trims as described in \fBblocksize\fR.
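+For example, \fBmax_latency=250ms,500ms\fR makes the job exit with ETIMEDOUT
+once a read exceeds 250ms or a write exceeds 500ms of completion latency.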
 .TP
 .BI rate_cycle \fR=\fPint
 Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number
index f85da6e082c8d7cef315f644a2be90cf853ce82f..e3f483a700130a5fc6d490809b5c78bdd1934cd3 100644 (file)
--- a/gettime.c
+++ b/gettime.c
@@ -671,12 +671,21 @@ static int clock_cmp(const void *p1, const void *p2)
 int fio_monotonic_clocktest(int debug)
 {
        struct clock_thread *cthreads;
-       unsigned int nr_cpus = cpus_online();
+       unsigned int seen_cpus, nr_cpus = cpus_online();
        struct clock_entry *entries;
        unsigned long nr_entries, tentries, failed = 0;
        struct clock_entry *prev, *this;
        uint32_t seq = 0;
        unsigned int i;
+       os_cpu_mask_t mask;
+
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+       fio_get_thread_affinity(mask);
+#else
+       memset(&mask, 0, sizeof(mask));
+       for (i = 0; i < nr_cpus; i++)
+               fio_cpu_set(&mask, i);
+#endif
 
        if (debug) {
                log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no");
@@ -703,25 +712,31 @@ int fio_monotonic_clocktest(int debug)
        if (debug)
                log_info("cs: Testing %u CPUs\n", nr_cpus);
 
+       seen_cpus = 0;
        for (i = 0; i < nr_cpus; i++) {
                struct clock_thread *t = &cthreads[i];
 
+               if (!fio_cpu_isset(&mask, i))
+                       continue;
                t->cpu = i;
                t->debug = debug;
                t->seq = &seq;
                t->nr_entries = nr_entries;
-               t->entries = &entries[i * nr_entries];
+               t->entries = &entries[seen_cpus * nr_entries];
                __fio_sem_init(&t->lock, FIO_SEM_LOCKED);
                if (pthread_create(&t->thread, NULL, clock_thread_fn, t)) {
                        failed++;
                        nr_cpus = i;
                        break;
                }
+               seen_cpus++;
        }
 
        for (i = 0; i < nr_cpus; i++) {
                struct clock_thread *t = &cthreads[i];
 
+               if (!fio_cpu_isset(&mask, i))
+                       continue;
                fio_sem_up(&t->lock);
        }
 
@@ -729,6 +744,8 @@ int fio_monotonic_clocktest(int debug)
                struct clock_thread *t = &cthreads[i];
                void *ret;
 
+               if (!fio_cpu_isset(&mask, i))
+                       continue;
                pthread_join(t->thread, &ret);
                if (ret)
                        failed++;
@@ -742,6 +759,7 @@ int fio_monotonic_clocktest(int debug)
                goto err;
        }
 
+       tentries = nr_entries * seen_cpus;
        qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp);
 
        /* silence silly gcc */
diff --git a/init.c b/init.c
index eea6e54692b177036dce001134f8ed1baeb62ca8..60c7cff405d70d8e974545026e2fe659b512b7ed 100644 (file)
--- a/init.c
+++ b/init.c
@@ -448,19 +448,6 @@ static void dump_opt_list(struct thread_data *td)
        }
 }
 
-static void fio_dump_options_free(struct thread_data *td)
-{
-       while (!flist_empty(&td->opt_list)) {
-               struct print_option *p;
-
-               p = flist_first_entry(&td->opt_list, struct print_option, list);
-               flist_del_init(&p->list);
-               free(p->name);
-               free(p->value);
-               free(p);
-       }
-}
-
 static void copy_opt_list(struct thread_data *dst, struct thread_data *src)
 {
        struct flist_head *entry;
@@ -646,6 +633,11 @@ static int fixup_options(struct thread_data *td)
                ret |= 1;
        }
 
+       if (o->zone_mode == ZONE_MODE_ZBD && !o->create_serialize) {
+               log_err("fio: --zonemode=zbd and --create_serialize=0 are not compatible.\n");
+               ret |= 1;
+       }
+
        if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) {
                log_err("fio: --zonesize must be specified when using --zonemode=strided.\n");
                ret |= 1;
@@ -961,7 +953,9 @@ static int fixup_options(struct thread_data *td)
        /*
         * Fix these up to be nsec internally
         */
-       o->max_latency *= 1000ULL;
+       for_each_rw_ddir(ddir)
+               o->max_latency[ddir] *= 1000ULL;
+
        o->latency_target *= 1000ULL;
 
        return ret;
diff --git a/io_u.c b/io_u.c
index 00a219c2e85922906dd859583ef6f1ae31ad29c0..b421a579bd0a1aaa594692a21731a2774de77cea 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -1389,11 +1389,16 @@ static long set_io_u_file(struct thread_data *td, struct io_u *io_u)
        return 0;
 }
 
-static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
+static void lat_fatal(struct thread_data *td, struct io_u *io_u, struct io_completion_data *icd,
                      unsigned long long tnsec, unsigned long long max_nsec)
 {
-       if (!td->error)
-               log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec);
+       if (!td->error) {
+               log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec): %s %s %llu %llu\n",
+                                       tnsec, max_nsec,
+                                       io_u->file->file_name,
+                                       io_ddir_name(io_u->ddir),
+                                       io_u->offset, io_u->buflen);
+       }
        td_verror(td, ETIMEDOUT, "max latency exceeded");
        icd->error = ETIMEDOUT;
 }
@@ -1888,11 +1893,13 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
                                icd->error = ops->io_u_lat(td, tnsec);
                }
 
-               if (td->o.max_latency && tnsec > td->o.max_latency)
-                       lat_fatal(td, icd, tnsec, td->o.max_latency);
-               if (td->o.latency_target && tnsec > td->o.latency_target) {
-                       if (lat_target_failed(td))
-                               lat_fatal(td, icd, tnsec, td->o.latency_target);
+               if (ddir_rw(idx)) {
+                       if (td->o.max_latency[idx] && tnsec > td->o.max_latency[idx])
+                               lat_fatal(td, io_u, icd, tnsec, td->o.max_latency[idx]);
+                       if (td->o.latency_target && tnsec > td->o.latency_target) {
+                               if (lat_target_failed(td))
+                                       lat_fatal(td, io_u, icd, tnsec, td->o.latency_target);
+                       }
                }
        }
 
diff --git a/iolog.c b/iolog.c
index fa40c8572664a4e14d1e53d39eae3547dc43b672..cf264916a9ecd83f7b3079bd186573441dba65fb 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -607,12 +607,11 @@ static int open_socket(const char *path)
 /*
  * open iolog, check version, and call appropriate parser
  */
-static bool init_iolog_read(struct thread_data *td)
+static bool init_iolog_read(struct thread_data *td, char *fname)
 {
-       char buffer[256], *p, *fname;
+       char buffer[256], *p;
        FILE *f = NULL;
 
-       fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
        dprint(FD_IO, "iolog: name=%s\n", fname);
 
        if (is_socket(fname)) {
@@ -701,15 +700,16 @@ bool init_iolog(struct thread_data *td)
 
        if (td->o.read_iolog_file) {
                int need_swap;
+               char *fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
 
                /*
                 * Check if it's a blktrace file and load that if possible.
                 * Otherwise assume it's a normal log file and load that.
                 */
-               if (is_blktrace(td->o.read_iolog_file, &need_swap))
-                       ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
+               if (is_blktrace(fname, &need_swap))
+                       ret = load_blktrace(td, fname, need_swap);
                else
-                       ret = init_iolog_read(td);
+                       ret = init_iolog_read(td, fname);
        } else if (td->o.write_iolog_file)
                ret = init_iolog_write(td);
        else
index 647748963193db4db016b2befde7fa2ec4f3a758..15a16229ef9a5d02524cb478a74ba080ecc911e0 100644 (file)
@@ -141,6 +141,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
                .name   = "RDMA I/O engine", /* rdma */
                .mask   = FIO_OPT_G_RDMA,
        },
+       {
+               .name   = "librpma I/O engines", /* librpma_apm && librpma_gpspm */
+               .mask   = FIO_OPT_G_LIBRPMA,
+       },
        {
                .name   = "libaio I/O engine", /* libaio */
                .mask   = FIO_OPT_G_LIBAIO,
@@ -177,6 +181,10 @@ static const struct opt_group fio_opt_cat_groups[] = {
                .name   = "libcufile I/O engine", /* libcufile */
                .mask   = FIO_OPT_G_LIBCUFILE,
        },
+       {
+               .name   = "DAOS File System (dfs) I/O engine", /* dfs */
+               .mask   = FIO_OPT_G_DFS,
+       },
        {
                .name   = NULL,
        },
index d2f1ceb391c34fd6101cf3e4b6a867b078b178e6..ff74862968e4335a1bb064d499c08441a54b16ac 100644 (file)
@@ -52,6 +52,7 @@ enum opt_category_group {
        __FIO_OPT_G_E4DEFRAG,
        __FIO_OPT_G_NETIO,
        __FIO_OPT_G_RDMA,
+       __FIO_OPT_G_LIBRPMA,
        __FIO_OPT_G_LIBAIO,
        __FIO_OPT_G_ACT,
        __FIO_OPT_G_LATPROF,
@@ -68,6 +69,7 @@ enum opt_category_group {
        __FIO_OPT_G_FILESTAT,
        __FIO_OPT_G_NR,
        __FIO_OPT_G_LIBCUFILE,
+       __FIO_OPT_G_DFS,
 
        FIO_OPT_G_RATE          = (1ULL << __FIO_OPT_G_RATE),
        FIO_OPT_G_ZONE          = (1ULL << __FIO_OPT_G_ZONE),
@@ -94,6 +96,7 @@ enum opt_category_group {
        FIO_OPT_G_E4DEFRAG      = (1ULL << __FIO_OPT_G_E4DEFRAG),
        FIO_OPT_G_NETIO         = (1ULL << __FIO_OPT_G_NETIO),
        FIO_OPT_G_RDMA          = (1ULL << __FIO_OPT_G_RDMA),
+       FIO_OPT_G_LIBRPMA       = (1ULL << __FIO_OPT_G_LIBRPMA),
        FIO_OPT_G_LIBAIO        = (1ULL << __FIO_OPT_G_LIBAIO),
        FIO_OPT_G_ACT           = (1ULL << __FIO_OPT_G_ACT),
        FIO_OPT_G_LATPROF       = (1ULL << __FIO_OPT_G_LATPROF),
@@ -110,6 +113,7 @@ enum opt_category_group {
        FIO_OPT_G_IOURING       = (1ULL << __FIO_OPT_G_IOURING),
        FIO_OPT_G_FILESTAT      = (1ULL << __FIO_OPT_G_FILESTAT),
        FIO_OPT_G_LIBCUFILE     = (1ULL << __FIO_OPT_G_LIBCUFILE),
+       FIO_OPT_G_DFS           = (1ULL << __FIO_OPT_G_DFS),
 };
 
 extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
index e62e0cfb35413a3f59b8c8d3d141a40d6c44f8b7..ddabaa82d240202df131c0a81a0ae761cbcded66 100644 (file)
--- a/options.c
+++ b/options.c
@@ -1471,8 +1471,13 @@ static int str_offset_cb(void *data, unsigned long long *__val)
        if (parse_is_percent(v)) {
                td->o.start_offset = 0;
                td->o.start_offset_percent = -1ULL - v;
+               td->o.start_offset_nz = 0;
                dprint(FD_PARSE, "SET start_offset_percent %d\n",
                                        td->o.start_offset_percent);
+       } else if (parse_is_zone(v)) {
+               td->o.start_offset = 0;
+               td->o.start_offset_percent = 0;
+               td->o.start_offset_nz = v - ZONE_BASE_VAL;
        } else
                td->o.start_offset = v;
 
@@ -1487,8 +1492,13 @@ static int str_offset_increment_cb(void *data, unsigned long long *__val)
        if (parse_is_percent(v)) {
                td->o.offset_increment = 0;
                td->o.offset_increment_percent = -1ULL - v;
+               td->o.offset_increment_nz = 0;
                dprint(FD_PARSE, "SET offset_increment_percent %d\n",
                                        td->o.offset_increment_percent);
+       } else if (parse_is_zone(v)) {
+               td->o.offset_increment = 0;
+               td->o.offset_increment_percent = 0;
+               td->o.offset_increment_nz = v - ZONE_BASE_VAL;
        } else
                td->o.offset_increment = v;
 
@@ -1505,6 +1515,10 @@ static int str_size_cb(void *data, unsigned long long *__val)
                td->o.size_percent = -1ULL - v;
                dprint(FD_PARSE, "SET size_percent %d\n",
                                        td->o.size_percent);
+       } else if (parse_is_zone(v)) {
+               td->o.size = 0;
+               td->o.size_percent = 0;
+               td->o.size_nz = v - ZONE_BASE_VAL;
        } else
                td->o.size = v;
 
@@ -1525,12 +1539,30 @@ static int str_io_size_cb(void *data, unsigned long long *__val)
                }
                dprint(FD_PARSE, "SET io_size_percent %d\n",
                                        td->o.io_size_percent);
+       } else if (parse_is_zone(v)) {
+               td->o.io_size = 0;
+               td->o.io_size_percent = 0;
+               td->o.io_size_nz = v - ZONE_BASE_VAL;
        } else
                td->o.io_size = v;
 
        return 0;
 }
 
+static int str_zoneskip_cb(void *data, unsigned long long *__val)
+{
+       struct thread_data *td = cb_data_to_td(data);
+       unsigned long long v = *__val;
+
+       if (parse_is_zone(v)) {
+               td->o.zone_skip = 0;
+               td->o.zone_skip_nz = v - ZONE_BASE_VAL;
+       } else
+               td->o.zone_skip = v;
+
+       return 0;
+}
+
 static int str_write_bw_log_cb(void *data, const char *str)
 {
        struct thread_data *td = cb_data_to_td(data);
@@ -1913,6 +1945,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                            .help = "RDMA IO engine",
                          },
 #endif
+#ifdef CONFIG_LIBRPMA_APM
+                         { .ival = "librpma_apm",
+                           .help = "librpma IO engine in APM mode",
+                         },
+#endif
+#ifdef CONFIG_LIBRPMA_GPSPM
+                         { .ival = "librpma_gpspm",
+                           .help = "librpma IO engine in GPSPM mode",
+                         },
+#endif
 #ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
                          { .ival = "e4defrag",
                            .help = "ext4 defrag engine",
@@ -1979,6 +2021,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                          { .ival = "nbd",
                            .help = "Network Block Device (NBD) IO engine"
                          },
+#ifdef CONFIG_DFS
+                         { .ival = "dfs",
+                           .help = "DAOS File System (dfs) IO engine",
+                         },
+#endif
                },
        },
        {
@@ -2081,11 +2128,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        {
                .name   = "size",
                .lname  = "Size",
-               .type   = FIO_OPT_STR_VAL,
+               .type   = FIO_OPT_STR_VAL_ZONE,
                .cb     = str_size_cb,
                .off1   = offsetof(struct thread_options, size),
                .help   = "Total size of device or files",
-               .interval = 1024 * 1024,
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_INVALID,
        },
@@ -2093,11 +2139,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .name   = "io_size",
                .alias  = "io_limit",
                .lname  = "IO Size",
-               .type   = FIO_OPT_STR_VAL,
+               .type   = FIO_OPT_STR_VAL_ZONE,
                .cb     = str_io_size_cb,
                .off1   = offsetof(struct thread_options, io_size),
                .help   = "Total size of I/O to be performed",
-               .interval = 1024 * 1024,
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_INVALID,
        },
@@ -2138,12 +2183,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                .name   = "offset",
                .lname  = "IO offset",
                .alias  = "fileoffset",
-               .type   = FIO_OPT_STR_VAL,
+               .type   = FIO_OPT_STR_VAL_ZONE,
                .cb     = str_offset_cb,
                .off1   = offsetof(struct thread_options, start_offset),
                .help   = "Start IO from this offset",
                .def    = "0",
-               .interval = 1024 * 1024,
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_INVALID,
        },
@@ -2161,14 +2205,13 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        {
                .name   = "offset_increment",
                .lname  = "IO offset increment",
-               .type   = FIO_OPT_STR_VAL,
+               .type   = FIO_OPT_STR_VAL_ZONE,
                .cb     = str_offset_increment_cb,
                .off1   = offsetof(struct thread_options, offset_increment),
                .help   = "What is the increment from one offset to the next",
                .parent = "offset",
                .hide   = 1,
                .def    = "0",
-               .interval = 1024 * 1024,
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_INVALID,
        },
@@ -3404,11 +3447,11 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        {
                .name   = "zoneskip",
                .lname  = "Zone skip",
-               .type   = FIO_OPT_STR_VAL,
+               .type   = FIO_OPT_STR_VAL_ZONE,
+               .cb     = str_zoneskip_cb,
                .off1   = offsetof(struct thread_options, zone_skip),
                .help   = "Space between IO zones",
                .def    = "0",
-               .interval = 1024 * 1024,
                .category = FIO_OPT_C_IO,
                .group  = FIO_OPT_G_ZONE,
        },
@@ -3728,8 +3771,10 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        {
                .name   = "max_latency",
                .lname  = "Max Latency (usec)",
-               .type   = FIO_OPT_STR_VAL_TIME,
-               .off1   = offsetof(struct thread_options, max_latency),
+               .type   = FIO_OPT_ULL,
+               .off1   = offsetof(struct thread_options, max_latency[DDIR_READ]),
+               .off2   = offsetof(struct thread_options, max_latency[DDIR_WRITE]),
+               .off3   = offsetof(struct thread_options, max_latency[DDIR_TRIM]),
                .help   = "Maximum tolerated IO latency (usec)",
                .is_time = 1,
                .category = FIO_OPT_C_IO,
@@ -4588,12 +4633,39 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
        {
                .name   = "unified_rw_reporting",
                .lname  = "Unified RW Reporting",
-               .type   = FIO_OPT_BOOL,
+               .type   = FIO_OPT_STR,
                .off1   = offsetof(struct thread_options, unified_rw_rep),
                .help   = "Unify reporting across data direction",
-               .def    = "0",
+               .def    = "none",
                .category = FIO_OPT_C_GENERAL,
                .group  = FIO_OPT_G_INVALID,
+               .posval = {
+                         { .ival = "none",
+                           .oval = UNIFIED_SPLIT,
+                           .help = "Normal statistics reporting",
+                         },
+                         { .ival = "mixed",
+                           .oval = UNIFIED_MIXED,
+                           .help = "Statistics are summed per data direction and reported together",
+                         },
+                         { .ival = "both",
+                           .oval = UNIFIED_BOTH,
+                           .help = "Statistics are reported normally, followed by the mixed statistics"
+                         },
+                         /* Compatibility with former boolean values */
+                         { .ival = "0",
+                           .oval = UNIFIED_SPLIT,
+                           .help = "Alias for 'none'",
+                         },
+                         { .ival = "1",
+                           .oval = UNIFIED_MIXED,
+                           .help = "Alias for 'mixed'",
+                         },
+                         { .ival = "2",
+                           .oval = UNIFIED_BOTH,
+                           .help = "Alias for 'both'",
+                         },
+               },
        },
        {
                .name   = "continue_on_error",
@@ -5426,6 +5498,19 @@ void fio_options_free(struct thread_data *td)
        }
 }
 
+void fio_dump_options_free(struct thread_data *td)
+{
+       while (!flist_empty(&td->opt_list)) {
+               struct print_option *p;
+
+               p = flist_first_entry(&td->opt_list, struct print_option, list);
+               flist_del_init(&p->list);
+               free(p->name);
+               free(p->value);
+               free(p);
+       }
+}
+
 struct fio_option *fio_option_find(const char *name)
 {
        return find_option(fio_options, name);
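Taken together, the options.c changes can be exercised from a job file. A
hypothetical example (device path and zone layout are assumptions) combining the
new tri-state unified_rw_reporting with the zone-count values now accepted by the
FIO_OPT_STR_VAL_ZONE options:

    ; sketch only: requires a zoned device for the 'z' suffixes to be meaningful
    [global]
    filename=/dev/nvme0n1
    zonemode=zbd
    direct=1

    [zjob]
    rw=write
    ; offset/size/io_size below are in units of whole zones
    offset=1z
    size=2z
    io_size=3z
    ; report per-direction stats followed by the mixed summary
    unified_rw_reporting=both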
index 5276f31e6818673a338fcbc3ef18997263b0abd0..df80fd9864bdd3f18d84e22c9aee937f45eab8aa 100644 (file)
--- a/options.h
+++ b/options.h
@@ -16,6 +16,7 @@ void add_opt_posval(const char *, const char *, const char *);
 void del_opt_posval(const char *, const char *);
 struct thread_data;
 void fio_options_free(struct thread_data *);
+void fio_dump_options_free(struct thread_data *);
 char *get_next_str(char **ptr);
 int get_max_str_idx(char *input);
 char* get_name_by_idx(char *input, int index);
index 5562b0da93a67bd2949032e54e914ce85ae7c380..ea8d79221c06454168d39857a994ea2e88e83ba5 100644 (file)
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -74,6 +74,9 @@ typedef cpu_set_t os_cpu_mask_t;
        sched_getaffinity((pid), (ptr))
 #endif
 
+#define fio_get_thread_affinity(mask)  \
+       pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+
 #define fio_cpu_clear(mask, cpu)       (void) CPU_CLR((cpu), (mask))
 #define fio_cpu_set(mask, cpu)         (void) CPU_SET((cpu), (mask))
 #define fio_cpu_isset(mask, cpu)       (CPU_ISSET((cpu), (mask)) != 0)
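The new fio_get_thread_affinity() wrapper mirrors the process-level helper above
but queries the calling thread via pthread_getaffinity_np(), which returns 0 on
success. A minimal standalone sketch of the expanded call (error handling reduced
to a bail-out):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t mask;

            /* fill 'mask' with the CPUs the calling thread may run on */
            if (pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask))
                    return 1;

            for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
                    if (CPU_ISSET(cpu, &mask))
                            printf("allowed: cpu %d\n", cpu);
            return 0;
    }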
diff --git a/parse.c b/parse.c
index 44bf950768d9b51bcc3e43ba30c2564614a6e4c6..45f4f2d3dd6db800f4f8e11a168c8ef36e788d3b 100644 (file)
--- a/parse.c
+++ b/parse.c
@@ -37,6 +37,7 @@ static const char *opt_type_names[] = {
        "OPT_BOOL",
        "OPT_FLOAT_LIST",
        "OPT_STR_SET",
+       "OPT_STR_VAL_ZONE",
        "OPT_DEPRECATED",
        "OPT_SOFT_DEPRECATED",
        "OPT_UNSUPPORTED",
@@ -599,9 +600,35 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
                fallthrough;
        case FIO_OPT_ULL:
        case FIO_OPT_INT:
-       case FIO_OPT_STR_VAL: {
+       case FIO_OPT_STR_VAL:
+       case FIO_OPT_STR_VAL_ZONE:
+       {
                fio_opt_str_val_fn *fn = o->cb;
                char tmp[128], *p;
+               size_t len = strlen(ptr);
+
+               if (len > 0 && ptr[len - 1] == 'z') {
+                       if (o->type == FIO_OPT_STR_VAL_ZONE) {
+                               char *ep;
+                               unsigned long long val;
+
+                               errno = 0;
+                               val = strtoul(ptr, &ep, 10);
+                               if (errno == 0 && ep != ptr && *ep == 'z') {
+                                       ull = ZONE_BASE_VAL + (uint32_t)val;
+                                       ret = 0;
+                                       goto store_option_value;
+                               } else {
+                                       log_err("%s: unexpected zone value '%s'\n",
+                                               o->name, ptr);
+                                       return 1;
+                               }
+                       } else {
+                               log_err("%s: 'z' suffix isn't applicable\n",
+                                       o->name);
+                               return 1;
+                       }
+               }
 
                if (!is_time && o->is_time)
                        is_time = o->is_time;
@@ -655,6 +682,7 @@ static int __handle_option(const struct fio_option *o, const char *ptr,
                        }
                }
 
+store_option_value:
                if (fn)
                        ret = fn(data, &ull);
                else {
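The new branch above accepts values such as "10z", and only for options typed
FIO_OPT_STR_VAL_ZONE: strtoul() peels the number off, the 'z' is verified as the
next character, and the count is stored offset by ZONE_BASE_VAL so later code can
distinguish zone counts from plain byte values. A standalone sketch of the same
decoding, using the ZONE_BASE_VAL definition added to parse.h:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ZONE_BASE_VAL ((-1ULL >> 1) + 1)

    /* decode "<n>z" into ZONE_BASE_VAL + n; nonzero return means no match */
    static int parse_zone_suffix(const char *ptr, unsigned long long *out)
    {
            char *ep;
            unsigned long long val;

            errno = 0;
            val = strtoul(ptr, &ep, 10);
            if (errno == 0 && ep != ptr && *ep == 'z') {
                    *out = ZONE_BASE_VAL + (uint32_t)val;
                    return 0;
            }
            return 1;
    }

    int main(void)
    {
            unsigned long long v;

            if (!parse_zone_suffix("10z", &v))
                    printf("zones=%u\n", (unsigned)(v - ZONE_BASE_VAL)); /* 10 */
            return 0;
    }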
diff --git a/parse.h b/parse.h
index e6663ed484ed343b096ebc33a28a52560f642aea..d68484eaf0c65572352222297162ebd0d20cf7e7 100644 (file)
--- a/parse.h
+++ b/parse.h
@@ -21,6 +21,7 @@ enum fio_opt_type {
        FIO_OPT_BOOL,
        FIO_OPT_FLOAT_LIST,
        FIO_OPT_STR_SET,
+       FIO_OPT_STR_VAL_ZONE,
        FIO_OPT_DEPRECATED,
        FIO_OPT_SOFT_DEPRECATED,
        FIO_OPT_UNSUPPORTED,    /* keep this last */
@@ -130,12 +131,18 @@ static inline void *td_var(void *to, const struct fio_option *o,
 
 static inline int parse_is_percent(unsigned long long val)
 {
-       return val <= -1ULL && val >= (-1ULL - 100ULL);
+       return val >= -101ULL;
 }
 
+#define ZONE_BASE_VAL ((-1ULL >> 1) + 1)
 static inline int parse_is_percent_uncapped(unsigned long long val)
 {
-       return (long long)val <= -1;
+       return ZONE_BASE_VAL + -1U < val;
+}
+
+static inline int parse_is_zone(unsigned long long val)
+{
+       return (val - ZONE_BASE_VAL) <= -1U;
 }
 
 struct print_option {
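With ZONE_BASE_VAL at 2^63, the unsigned long long value space is partitioned:
plain byte values stay below 2^63, zone counts occupy [2^63, 2^63 + UINT_MAX], and
percentages sit at the very top as -1ULL - pct, which is why parse_is_percent()
reduces to val >= -101ULL. A small self-check of the arithmetic, reusing the
definitions above:

    #include <assert.h>

    #define ZONE_BASE_VAL ((-1ULL >> 1) + 1)        /* 2^63 */

    static int parse_is_zone(unsigned long long val)
    {
            return (val - ZONE_BASE_VAL) <= -1U;
    }

    int main(void)
    {
            unsigned long long ten_zones = ZONE_BASE_VAL + 10;  /* "10z" */
            unsigned long long pct_50 = -1ULL - 50;             /* "50%" */

            assert(parse_is_zone(ten_zones));
            assert(ten_zones - ZONE_BASE_VAL == 10);            /* decode */
            assert(!parse_is_zone(1ULL << 40));     /* ~1T plain byte value */
            assert(!parse_is_zone(pct_50));         /* percentages live higher */
            return 0;
    }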
index 1b65297ec25feb166e3f39e6b01b7c081a96fa42..8daefbabfeae93f6c260c0b74eec6fedc7bbd973 100644 (file)
--- a/server.c
+++ b/server.c
@@ -1909,7 +1909,7 @@ static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
                        break;
                }
                flist_add_tail(&entry->list, &first->next);
-       } while (ret != Z_STREAM_END);
+       }
 
        ret = deflateEnd(&stream);
        if (ret == Z_OK)
index 9256d44c5001c2daa72232407650ecef36d4fdfa..b45b319ba2013e56ecc87056e8e0b6ca1aaec6e2 100644 (file)
--- a/server.h
+++ b/server.h
@@ -48,7 +48,7 @@ struct fio_net_cmd_reply {
 };
 
 enum {
-       FIO_SERVER_VER                  = 87,
+       FIO_SERVER_VER                  = 89,
 
        FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
        FIO_SERVER_MAX_CMD_MB           = 2048,
diff --git a/stat.c b/stat.c
index b723795301cc4c9bb3e71ad31450b6a6af97366b..b7222f465f63a4c785ce080f523c89e2ec6c91f1 100644 (file)
--- a/stat.c
+++ b/stat.c
@@ -282,6 +282,46 @@ bool calc_lat(struct io_stat *is, unsigned long long *min,
        return true;
 }
 
+void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out) 
+{
+       char *io, *agg, *min, *max;
+       char *ioalt, *aggalt, *minalt, *maxalt;
+       uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0, min_run = -1, max_run = 0;
+       int i;
+       const int i2p = is_power_of_2(rs->kb_base);
+
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               if (!rs->max_run[i])
+                       continue;
+               io_mix += rs->iobytes[i];
+               agg_mix += rs->agg[i];
+               min_mix = min_mix < rs->min_bw[i] ? min_mix : rs->min_bw[i];
+               max_mix = max_mix > rs->max_bw[i] ? max_mix : rs->max_bw[i];
+               min_run = min_run < rs->min_run[i] ? min_run : rs->min_run[i];
+               max_run = max_run > rs->max_run[i] ? max_run : rs->max_run[i];
+       }
+       io = num2str(io_mix, rs->sig_figs, 1, i2p, N2S_BYTE);
+       ioalt = num2str(io_mix, rs->sig_figs, 1, !i2p, N2S_BYTE);
+       agg = num2str(agg_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+       aggalt = num2str(agg_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+       min = num2str(min_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+       minalt = num2str(min_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+       max = num2str(max_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+       maxalt = num2str(max_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+       log_buf(out, "  MIXED: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
+                       agg, aggalt, min, max, minalt, maxalt, io, ioalt,
+                       (unsigned long long) min_run,
+                       (unsigned long long) max_run);
+       free(io);
+       free(agg);
+       free(min);
+       free(max);
+       free(ioalt);
+       free(aggalt);
+       free(minalt);
+       free(maxalt);
+}
+
 void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
 {
        char *io, *agg, *min, *max;
@@ -306,7 +346,7 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
                max = num2str(rs->max_bw[i], rs->sig_figs, 1, i2p, rs->unit_base);
                maxalt = num2str(rs->max_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base);
                log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
-                               rs->unified_rw_rep ? "  MIXED" : str[i],
+                               (rs->unified_rw_rep == UNIFIED_MIXED) ? "  MIXED" : str[i],
                                agg, aggalt, min, max, minalt, maxalt, io, ioalt,
                                (unsigned long long) rs->min_run[i],
                                (unsigned long long) rs->max_run[i]);
@@ -320,6 +360,10 @@ void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
                free(minalt);
                free(maxalt);
        }
+       
+       /* Need to aggregate statistics to show mixed values */
+       if (rs->unified_rw_rep == UNIFIED_BOTH) 
+               show_mixed_group_stats(rs, out);
 }
 
 void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist)
@@ -426,6 +470,168 @@ static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, i
        return p_of_agg;
 }
 
+static void show_mixed_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
+                            struct buf_output *out)
+{
+       unsigned long runt;
+       unsigned long long min, max, bw, iops;
+       double mean, dev;
+       char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
+       struct thread_stat *ts_lcl;
+
+       int i2p;
+       int ddir = 0, i;
+
+       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+       ts_lcl = malloc(sizeof(struct thread_stat));
+       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+       }
+       ts_lcl->sync_stat.min_val = ULONG_MAX;
+
+       sum_thread_stats(ts_lcl, ts, 1);
+
+       assert(ddir_rw(ddir));
+
+       if (!ts_lcl->runtime[ddir])
+               return;
+
+       i2p = is_power_of_2(rs->kb_base);
+       runt = ts_lcl->runtime[ddir];
+
+       bw = (1000 * ts_lcl->io_bytes[ddir]) / runt;
+       io_p = num2str(ts_lcl->io_bytes[ddir], ts->sig_figs, 1, i2p, N2S_BYTE);
+       bw_p = num2str(bw, ts->sig_figs, 1, i2p, ts->unit_base);
+       bw_p_alt = num2str(bw, ts->sig_figs, 1, !i2p, ts->unit_base);
+
+       iops = (1000 * ts_lcl->total_io_u[ddir]) / runt;
+       iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE);
+
+       log_buf(out, "  mixed: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
+                       iops_p, bw_p, bw_p_alt, io_p,
+                       (unsigned long long) ts_lcl->runtime[ddir],
+                       post_st ? : "");
+
+       free(post_st);
+       free(io_p);
+       free(bw_p);
+       free(bw_p_alt);
+       free(iops_p);
+
+       if (calc_lat(&ts_lcl->slat_stat[ddir], &min, &max, &mean, &dev))
+               display_lat("slat", min, max, mean, dev, out);
+       if (calc_lat(&ts_lcl->clat_stat[ddir], &min, &max, &mean, &dev))
+               display_lat("clat", min, max, mean, dev, out);
+       if (calc_lat(&ts_lcl->lat_stat[ddir], &min, &max, &mean, &dev))
+               display_lat(" lat", min, max, mean, dev, out);
+       if (calc_lat(&ts_lcl->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
+               display_lat(ts_lcl->lat_percentiles ? "high prio_lat" : "high prio_clat",
+                               min, max, mean, dev, out);
+               if (calc_lat(&ts_lcl->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
+                       display_lat(ts_lcl->lat_percentiles ? "low prio_lat" : "low prio_clat",
+                                       min, max, mean, dev, out);
+       }
+
+       if (ts->slat_percentiles && ts_lcl->slat_stat[ddir].samples > 0)
+               show_clat_percentiles(ts_lcl->io_u_plat[FIO_SLAT][ddir],
+                               ts_lcl->slat_stat[ddir].samples,
+                               ts->percentile_list,
+                               ts->percentile_precision, "slat", out);
+       if (ts->clat_percentiles && ts_lcl->clat_stat[ddir].samples > 0)
+               show_clat_percentiles(ts_lcl->io_u_plat[FIO_CLAT][ddir],
+                               ts_lcl->clat_stat[ddir].samples,
+                               ts->percentile_list,
+                               ts->percentile_precision, "clat", out);
+       if (ts->lat_percentiles && ts_lcl->lat_stat[ddir].samples > 0)
+               show_clat_percentiles(ts_lcl->io_u_plat[FIO_LAT][ddir],
+                               ts_lcl->lat_stat[ddir].samples,
+                               ts->percentile_list,
+                               ts->percentile_precision, "lat", out);
+
+       if (ts->clat_percentiles || ts->lat_percentiles) {
+               const char *name = ts->lat_percentiles ? "lat" : "clat";
+               char prio_name[32];
+               uint64_t samples;
+
+               if (ts->lat_percentiles)
+                       samples = ts_lcl->lat_stat[ddir].samples;
+               else
+                       samples = ts_lcl->clat_stat[ddir].samples;
+
+               /* Only print this if some high and low priority stats were collected */
+               if (ts_lcl->clat_high_prio_stat[ddir].samples > 0 &&
+                               ts_lcl->clat_low_prio_stat[ddir].samples > 0)
+               {
+                       sprintf(prio_name, "high prio (%.2f%%) %s",
+                                       100. * (double) ts_lcl->clat_high_prio_stat[ddir].samples / (double) samples,
+                                       name);
+                       show_clat_percentiles(ts_lcl->io_u_plat_high_prio[ddir],
+                                       ts_lcl->clat_high_prio_stat[ddir].samples,
+                                       ts->percentile_list,
+                                       ts->percentile_precision, prio_name, out);
+
+                       sprintf(prio_name, "low prio (%.2f%%) %s",
+                                       100. * (double) ts_lcl->clat_low_prio_stat[ddir].samples / (double) samples,
+                                       name);
+                       show_clat_percentiles(ts_lcl->io_u_plat_low_prio[ddir],
+                                       ts_lcl->clat_low_prio_stat[ddir].samples,
+                                       ts->percentile_list,
+                                       ts->percentile_precision, prio_name, out);
+               }
+       }
+
+       if (calc_lat(&ts_lcl->bw_stat[ddir], &min, &max, &mean, &dev)) {
+               double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
+               const char *bw_str;
+
+               if ((rs->unit_base == 1) && i2p)
+                       bw_str = "Kibit";
+               else if (rs->unit_base == 1)
+                       bw_str = "kbit";
+               else if (i2p)
+                       bw_str = "KiB";
+               else
+                       bw_str = "kB";
+
+               p_of_agg = convert_agg_kbytes_percent(rs, ddir, mean);
+
+               if (rs->unit_base == 1) {
+                       min *= 8.0;
+                       max *= 8.0;
+                       mean *= 8.0;
+                       dev *= 8.0;
+               }
+
+               if (mean > fkb_base * fkb_base) {
+                       min /= fkb_base;
+                       max /= fkb_base;
+                       mean /= fkb_base;
+                       dev /= fkb_base;
+                       bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
+               }
+
+               log_buf(out, "   bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
+                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+                       bw_str, min, max, p_of_agg, mean, dev,
+                       (&ts_lcl->bw_stat[ddir])->samples);
+       }
+       if (calc_lat(&ts_lcl->iops_stat[ddir], &min, &max, &mean, &dev)) {
+               log_buf(out, "   iops        : min=%5llu, max=%5llu, "
+                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+                       min, max, mean, dev, (&ts_lcl->iops_stat[ddir])->samples);
+       }
+
+       free(ts_lcl);
+}
+
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                             int ddir, struct buf_output *out)
 {
@@ -477,7 +683,7 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
        }
 
        log_buf(out, "  %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
-                       rs->unified_rw_rep ? "mixed" : io_ddir_name(ddir),
+                       (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir),
                        iops_p, bw_p, bw_p_alt, io_p,
                        (unsigned long long) ts->runtime[ddir],
                        post_st ? : "");
@@ -1083,6 +1289,9 @@ static void show_thread_status_normal(struct thread_stat *ts,
                        show_ddir_status(rs, ts, ddir, out);
        }
 
+       if (ts->unified_rw_rep == UNIFIED_BOTH)
+               show_mixed_ddir_status(rs, ts, out);
+
        show_latencies(ts, out);
 
        if (ts->sync_stat.samples)
@@ -1205,7 +1414,7 @@ static void show_ddir_status_terse(struct thread_stat *ts,
                                        &minv);
        else
                len = 0;
-
+       
        for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
                if (i >= len) {
                        log_buf(out, ";0%%=0");
@@ -1249,6 +1458,40 @@ static void show_ddir_status_terse(struct thread_stat *ts,
        }
 }
 
+static void show_mixed_ddir_status_terse(struct thread_stat *ts,
+                                  struct group_run_stats *rs,
+                                  int ver, struct buf_output *out)
+{
+       struct thread_stat *ts_lcl;
+       int i;
+
+       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+       ts_lcl = malloc(sizeof(struct thread_stat));
+       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+       }
+       ts_lcl->sync_stat.min_val = ULONG_MAX;
+       ts_lcl->lat_percentiles = ts->lat_percentiles;
+       ts_lcl->clat_percentiles = ts->clat_percentiles;
+       ts_lcl->slat_percentiles = ts->slat_percentiles;
+       ts_lcl->percentile_precision = ts->percentile_precision;                
+       memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
+       
+       sum_thread_stats(ts_lcl, ts, 1);
+
+       /* add the aggregated stats to json parent */
+       show_ddir_status_terse(ts_lcl, rs, DDIR_READ, ver, out);
+       free(ts_lcl);
+}
+
 static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles,
                struct io_stat *lat_stat, uint64_t *io_u_plat)
 {
@@ -1310,12 +1553,12 @@ static void add_ddir_status_json(struct thread_stat *ts,
 
        assert(ddir_rw(ddir) || ddir_sync(ddir));
 
-       if (ts->unified_rw_rep && ddir != DDIR_READ)
+       if ((ts->unified_rw_rep == UNIFIED_MIXED) && ddir != DDIR_READ)
                return;
 
        dir_object = json_create_object();
        json_object_add_value_object(parent,
-               ts->unified_rw_rep ? "mixed" : io_ddir_name(ddir), dir_object);
+               (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir), dir_object);
 
        if (ddir_rw(ddir)) {
                bw_bytes = 0;
@@ -1418,6 +1661,39 @@ static void add_ddir_status_json(struct thread_stat *ts,
        }
 }
 
+static void add_mixed_ddir_status_json(struct thread_stat *ts,
+               struct group_run_stats *rs, struct json_object *parent)
+{
+       struct thread_stat *ts_lcl;
+       int i;
+
+       /* Handle aggregation of Reads (ddir = 0), Writes (ddir = 1), and Trims (ddir = 2) */
+       ts_lcl = malloc(sizeof(struct thread_stat));
+       memset((void *)ts_lcl, 0, sizeof(struct thread_stat));
+       ts_lcl->unified_rw_rep = UNIFIED_MIXED;               /* calculate mixed stats  */
+       for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+               ts_lcl->clat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->slat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->lat_stat[i].min_val = ULONG_MAX;
+               ts_lcl->bw_stat[i].min_val = ULONG_MAX;
+               ts_lcl->iops_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
+               ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
+       }
+       ts_lcl->sync_stat.min_val = ULONG_MAX;
+       ts_lcl->lat_percentiles = ts->lat_percentiles;
+       ts_lcl->clat_percentiles = ts->clat_percentiles;
+       ts_lcl->slat_percentiles = ts->slat_percentiles;
+       ts_lcl->percentile_precision = ts->percentile_precision;                
+       memcpy(ts_lcl->percentile_list, ts->percentile_list, sizeof(ts->percentile_list));
+
+       sum_thread_stats(ts_lcl, ts, 1);
+
+       /* add the aggregated stats to json parent */
+       add_ddir_status_json(ts_lcl, rs, DDIR_READ, parent);
+       free(ts_lcl);
+}
+
 static void show_thread_status_terse_all(struct thread_stat *ts,
                                         struct group_run_stats *rs, int ver,
                                         struct buf_output *out)
@@ -1435,14 +1711,17 @@ static void show_thread_status_terse_all(struct thread_stat *ts,
                log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
                        ts->name, ts->groupid, ts->error);
 
-       /* Log Read Status */
+       /* Log Read Status, or mixed if unified_rw_rep == UNIFIED_MIXED */
        show_ddir_status_terse(ts, rs, DDIR_READ, ver, out);
-       /* Log Write Status */
-       show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
-       /* Log Trim Status */
-       if (ver == 2 || ver == 4 || ver == 5)
-               show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
-
+       if (ts->unified_rw_rep != UNIFIED_MIXED) {
+               /* Log Write Status */
+               show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
+               /* Log Trim Status */
+               if (ver == 2 || ver == 4 || ver == 5)
+                       show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
+       }
+       if (ts->unified_rw_rep == UNIFIED_BOTH)
+               show_mixed_ddir_status_terse(ts, rs, ver, out);
        /* CPU Usage */
        if (ts->total_run_time) {
                double runt = (double) ts->total_run_time;
@@ -1547,6 +1826,9 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
        add_ddir_status_json(ts, rs, DDIR_TRIM, root);
        add_ddir_status_json(ts, rs, DDIR_SYNC, root);
 
+       if (ts->unified_rw_rep == UNIFIED_BOTH)
+               add_mixed_ddir_status_json(ts, rs, root);
+
        /* CPU Usage */
        if (ts->total_run_time) {
                double runt = (double) ts->total_run_time;
@@ -1875,7 +2157,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
        int k, l, m;
 
        for (l = 0; l < DDIR_RWDIR_CNT; l++) {
-               if (!dst->unified_rw_rep) {
+               if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
                        sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
                        sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
                        sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
@@ -1931,7 +2213,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
                dst->io_u_lat_m[k] += src->io_u_lat_m[k];
 
        for (k = 0; k < DDIR_RWDIR_CNT; k++) {
-               if (!dst->unified_rw_rep) {
+               if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
                        dst->total_io_u[k] += src->total_io_u[k];
                        dst->short_io_u[k] += src->short_io_u[k];
                        dst->drop_io_u[k] += src->drop_io_u[k];
@@ -1947,7 +2229,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
        for (k = 0; k < FIO_LAT_CNT; k++)
                for (l = 0; l < DDIR_RWDIR_CNT; l++)
                        for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
-                               if (!dst->unified_rw_rep)
+                               if (!(dst->unified_rw_rep == UNIFIED_MIXED))
                                        dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m];
                                else
                                        dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m];
@@ -1957,7 +2239,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
 
        for (k = 0; k < DDIR_RWDIR_CNT; k++) {
                for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
-                       if (!dst->unified_rw_rep) {
+                       if (!(dst->unified_rw_rep == UNIFIED_MIXED)) {
                                dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
                                dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
                        } else {
@@ -2166,7 +2448,7 @@ void __show_run_stats(void)
                rs->kb_base = ts->kb_base;
                rs->unit_base = ts->unit_base;
                rs->sig_figs = ts->sig_figs;
-               rs->unified_rw_rep += ts->unified_rw_rep;
+               rs->unified_rw_rep |= ts->unified_rw_rep;
 
                for (j = 0; j < DDIR_RWDIR_CNT; j++) {
                        if (!ts->runtime[j])
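The three new mixed-stat helpers in this file share the same setup: allocate a
scratch thread_stat, prime every min_val with ULONG_MAX so sum_stat() can only
lower it, mark the scratch copy UNIFIED_MIXED so sum_thread_stats() folds writes
and trims into the read slot, then render slot DDIR_READ through the existing
per-direction path. A hedged sketch of how that shared setup could be factored
(helper name hypothetical; the types and sum_thread_stats() are fio's own):

    /* sketch: collapse all data directions of 'ts' into one scratch stat */
    static struct thread_stat *alloc_mixed_stat(struct thread_stat *ts)
    {
            struct thread_stat *ts_lcl;
            int i;

            ts_lcl = calloc(1, sizeof(*ts_lcl));
            if (!ts_lcl)
                    return NULL;

            ts_lcl->unified_rw_rep = UNIFIED_MIXED;
            for (i = 0; i < DDIR_RWDIR_CNT; i++) {
                    ts_lcl->clat_stat[i].min_val = ULONG_MAX;
                    ts_lcl->slat_stat[i].min_val = ULONG_MAX;
                    ts_lcl->lat_stat[i].min_val = ULONG_MAX;
                    ts_lcl->bw_stat[i].min_val = ULONG_MAX;
                    ts_lcl->iops_stat[i].min_val = ULONG_MAX;
                    ts_lcl->clat_high_prio_stat[i].min_val = ULONG_MAX;
                    ts_lcl->clat_low_prio_stat[i].min_val = ULONG_MAX;
            }
            ts_lcl->sync_stat.min_val = ULONG_MAX;

            /* fold reads, writes and trims into the DDIR_READ slot */
            sum_thread_stats(ts_lcl, ts, 1);
            return ts_lcl;
    }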
diff --git a/stat.h b/stat.h
index 6dd5ef743a0803b79d5652615dabba416c01dad3..d08d4dc09780720eea3f1fa5896d12a11bae5e40 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -146,6 +146,9 @@ enum block_info_state {
 #define FIO_JOBNAME_SIZE       128
 #define FIO_JOBDESC_SIZE       256
 #define FIO_VERROR_SIZE                128
+#define UNIFIED_SPLIT          0
+#define UNIFIED_MIXED          1
+#define UNIFIED_BOTH           2
 
 enum fio_lat {
        FIO_SLAT = 0,
index 044f9195679566f802460672a3b9c6d96f33be07..ff4c7a7c01807ed46bd73dca4da85de0e606158d 100644 (file)
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -233,8 +233,7 @@ static int prep_more_ios(struct submitter *s, int max_ios)
        next_tail = tail = *ring->tail;
        do {
                next_tail++;
-               read_barrier();
-               if (next_tail == *ring->head)
+               if (next_tail == atomic_load_acquire(ring->head))
                        break;
 
                index = tail & sq_ring_mask;
@@ -244,10 +243,8 @@ static int prep_more_ios(struct submitter *s, int max_ios)
                tail = next_tail;
        } while (prepped < max_ios);
 
-       if (*ring->tail != tail) {
-               *ring->tail = tail;
-               write_barrier();
-       }
+       if (prepped)
+               atomic_store_release(ring->tail, tail);
        return prepped;
 }
 
@@ -284,7 +281,7 @@ static int reap_events(struct submitter *s)
                struct file *f;
 
                read_barrier();
-               if (head == *ring->tail)
+               if (head == atomic_load_acquire(ring->tail))
                        break;
                cqe = &ring->cqes[head & cq_ring_mask];
                if (!do_nop) {
@@ -301,9 +298,10 @@ static int reap_events(struct submitter *s)
                head++;
        } while (1);
 
-       s->inflight -= reaped;
-       *ring->head = head;
-       write_barrier();
+       if (reaped) {
+               s->inflight -= reaped;
+               atomic_store_release(ring->head, head);
+       }
        return reaped;
 }
 
@@ -320,6 +318,7 @@ static void *submitter_fn(void *data)
        prepped = 0;
        do {
                int to_wait, to_submit, this_reap, to_prep;
+               unsigned ring_flags = 0;
 
                if (!prepped && s->inflight < depth) {
                        to_prep = min(depth - s->inflight, batch_submit);
@@ -338,15 +337,20 @@ submit:
                 * Only need to call io_uring_enter if we're not using SQ thread
                 * poll, or if IORING_SQ_NEED_WAKEUP is set.
                 */
-               if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
+               if (sq_thread_poll)
+                       ring_flags = atomic_load_acquire(ring->flags);
+               if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
                        unsigned flags = 0;
 
                        if (to_wait)
                                flags = IORING_ENTER_GETEVENTS;
-                       if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
+                       if (ring_flags & IORING_SQ_NEED_WAKEUP)
                                flags |= IORING_ENTER_SQ_WAKEUP;
                        ret = io_uring_enter(s, to_submit, to_wait, flags);
                        s->calls++;
+               } else {
+                       /* for SQPOLL, we submitted it all effectively */
+                       ret = to_submit;
                }
 
                /*
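The t/io_uring changes above replace paired read_barrier()/write_barrier() calls
with acquire/release accessors on the shared ring head and tail, and skip the
release store entirely when nothing was prepped or reaped. A minimal sketch of
what such accessors look like in portable C11 atomics (the tool's own macros may
be implemented differently):

    #include <stdatomic.h>

    /* consumer side: pairs with the producer's release store */
    static inline unsigned load_acquire(_Atomic unsigned *p)
    {
            return atomic_load_explicit(p, memory_order_acquire);
    }

    /* producer side: publishes entries written before this store */
    static inline void store_release(_Atomic unsigned *p, unsigned v)
    {
            atomic_store_explicit(p, v, memory_order_release);
    }

The acquire load guarantees that ring entries read after observing a new index are
at least as new as that index; the release store guarantees the entries were fully
written before the index became visible to the other side.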
index 1658dc25013be4c0714538bdd58fc2dcabeb03f8..26aff3731b8c5933a3e7358f1342cd9a5942f3c1 100755 (executable)
--- a/t/zbd/test-zbd-support
+++ b/t/zbd/test-zbd-support
@@ -1153,6 +1153,72 @@ test54() {
                >> "${logfile}.${test_number}" 2>&1 || return $?
 }
 
+# test 'z' suffix parsing only
+test55() {
+       local bs
+       bs=$((logical_block_size))
+
+       require_zbd || return $SKIP_TESTCASE
+       # offset=1z + offset_increment=10z + size=2z
+       require_seq_zones 13 || return $SKIP_TESTCASE
+
+       run_fio --name=j                \
+               --filename=${dev}       \
+               --direct=1              \
+               "$(ioengine "psync")"   \
+               --zonemode=zbd          \
+               --zonesize=${zone_size} \
+               --rw=write              \
+               --bs=${bs}              \
+               --numjobs=2             \
+               --offset_increment=10z  \
+               --offset=1z             \
+               --size=2z               \
+               --io_size=3z            \
+               ${job_var_opts[@]} --debug=zbd \
+               >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# test 'z' suffix parsing only
+test56() {
+       local bs
+       bs=$((logical_block_size))
+
+       require_regular_block_dev || return $SKIP_TESTCASE
+       require_seq_zones 10 || return $SKIP_TESTCASE
+
+       run_fio --name=j                \
+               --filename=${dev}       \
+               --direct=1              \
+               "$(ioengine "psync")"   \
+               --zonemode=strided      \
+               --zonesize=${zone_size} \
+               --rw=write              \
+               --bs=${bs}              \
+               --size=10z              \
+               --zoneskip=2z           \
+               ${job_var_opts[@]} --debug=zbd \
+               >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test that a repeated async write job does not trigger a zone reset while writes
+# are still in flight, when the block size is not a divisor of the zone size.
+test57() {
+       local bs off
+
+       require_zbd || return $SKIP_TESTCASE
+
+       bs=$((4096 * 7))
+       off=$((first_sequential_zone_sector * 512))
+
+       run_fio --name=job --filename="${dev}" --rw=randwrite --bs="${bs}" \
+               --offset="${off}" --size=$((4 * zone_size)) --iodepth=256 \
+               "$(ioengine "libaio")" --time_based=1 --runtime=30s \
+               --zonemode=zbd --direct=1 --zonesize="${zone_size}" \
+               ${job_var_opts[@]} \
+               >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
 SECONDS=0
 tests=()
 dynamic_analyzer=()
index f6b15403c4c21df1c183b47337e71a6018e1fe62..5ecc72d7b590c37615d3625bff10a0ea907a5f1d 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -83,13 +83,16 @@ struct thread_options {
        unsigned long long size;
        unsigned long long io_size;
        unsigned int size_percent;
+       unsigned int size_nz;
        unsigned int io_size_percent;
+       unsigned int io_size_nz;
        unsigned int fill_device;
        unsigned int file_append;
        unsigned long long file_size_low;
        unsigned long long file_size_high;
        unsigned long long start_offset;
        unsigned long long start_offset_align;
+       unsigned int start_offset_nz;
 
        unsigned long long bs[DDIR_RWDIR_CNT];
        unsigned long long ba[DDIR_RWDIR_CNT];
@@ -198,12 +201,13 @@ struct thread_options {
        unsigned long long zone_size;
        unsigned long long zone_capacity;
        unsigned long long zone_skip;
+       uint32_t zone_skip_nz;
        enum fio_zone_mode zone_mode;
        unsigned long long lockmem;
        enum fio_memtype mem_type;
        unsigned int mem_align;
 
-       unsigned long long max_latency;
+       unsigned long long max_latency[DDIR_RWDIR_CNT];
 
        unsigned int exit_what;
        unsigned int stonewall;
@@ -315,6 +319,7 @@ struct thread_options {
        unsigned int gid;
 
        unsigned int offset_increment_percent;
+       unsigned int offset_increment_nz;
        unsigned long long offset_increment;
        unsigned long long number_ios;
 
@@ -384,14 +389,19 @@ struct thread_options_pack {
        uint64_t size;
        uint64_t io_size;
        uint32_t size_percent;
+       uint32_t size_nz;
        uint32_t io_size_percent;
+       uint32_t io_size_nz;
        uint32_t fill_device;
        uint32_t file_append;
        uint32_t unique_filename;
+       uint32_t pad3;
        uint64_t file_size_low;
        uint64_t file_size_high;
        uint64_t start_offset;
        uint64_t start_offset_align;
+       uint32_t start_offset_nz;
+       uint32_t pad4;
 
        uint64_t bs[DDIR_RWDIR_CNT];
        uint64_t ba[DDIR_RWDIR_CNT];
@@ -464,8 +474,6 @@ struct thread_options_pack {
        struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
        uint32_t zone_split_nr[DDIR_RWDIR_CNT];
 
-       uint8_t pad1[4];
-
        fio_fp64_t zipf_theta;
        fio_fp64_t pareto_h;
        fio_fp64_t gauss_dev;
@@ -501,6 +509,7 @@ struct thread_options_pack {
        uint64_t zone_capacity;
        uint64_t zone_skip;
        uint64_t lockmem;
+       uint32_t zone_skip_nz;
        uint32_t mem_type;
        uint32_t mem_align;
 
@@ -509,8 +518,6 @@ struct thread_options_pack {
        uint32_t new_group;
        uint32_t numjobs;
 
-       uint8_t pad3[4];
-
        /*
         * We currently can't convert these, so don't enable them
         */
@@ -616,12 +623,14 @@ struct thread_options_pack {
        uint32_t gid;
 
        uint32_t offset_increment_percent;
+       uint32_t offset_increment_nz;
        uint64_t offset_increment;
        uint64_t number_ios;
 
        uint64_t latency_target;
        uint64_t latency_window;
-       uint64_t max_latency;
+       uint64_t max_latency[DDIR_RWDIR_CNT];
+       uint32_t pad5;
        fio_fp64_t latency_percentile;
        uint32_t latency_run;
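A note on the pad fields being shuffled above: thread_options_pack is the on-wire
layout exchanged with fio servers, so new uint32_t fields are added in pairs or
with explicit padding (pad3, pad4, pad5) to keep the following 64-bit members
naturally aligned and the layout identical across compilers, which is consistent
with the FIO_SERVER_VER bump in server.h. A small self-check of the idea, using
field names from the hunk above:

    #include <stddef.h>
    #include <stdint.h>

    struct pack_sketch {
            uint32_t unique_filename;
            uint32_t pad3;          /* keeps file_size_low 8-byte aligned */
            uint64_t file_size_low;
    };

    _Static_assert(offsetof(struct pack_sketch, file_size_low) % 8 == 0,
                   "64-bit wire field must stay naturally aligned");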
 
diff --git a/zbd.c b/zbd.c
index 6a26fe108a68acf55c7ad2b7e59e8fccb97f0dc8..eed796b3217d297eb94942b28b4cfbf0ab5113ab 100644 (file)
--- a/zbd.c
+++ b/zbd.c
@@ -285,9 +285,7 @@ static bool zbd_verify_sizes(void)
                                return false;
                        }
 
-                       if (td->o.zone_skip &&
-                           (td->o.zone_skip < td->o.zone_size ||
-                            td->o.zone_skip % td->o.zone_size)) {
+                       if (td->o.zone_skip % td->o.zone_size) {
                                log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
                                        f->file_name, (unsigned long long) td->o.zone_skip,
                                        (unsigned long long) td->o.zone_size);
@@ -335,20 +333,21 @@ static bool zbd_verify_bs(void)
 {
        struct thread_data *td;
        struct fio_file *f;
-       uint32_t zone_size;
        int i, j, k;
 
        for_each_td(td, i) {
                for_each_file(td, f, j) {
+                       uint64_t zone_size;
+
                        if (!f->zbd_info)
                                continue;
                        zone_size = f->zbd_info->zone_size;
                        for (k = 0; k < FIO_ARRAY_SIZE(td->o.bs); k++) {
                                if (td->o.verify != VERIFY_NONE &&
                                    zone_size % td->o.bs[k] != 0) {
-                                       log_info("%s: block size %llu is not a divisor of the zone size %d\n",
+                                       log_info("%s: block size %llu is not a divisor of the zone size %llu\n",
                                                 f->file_name, td->o.bs[k],
-                                                zone_size);
+                                                (unsigned long long)zone_size);
                                        return false;
                                }
                        }
@@ -648,7 +647,7 @@ static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
 static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
                          struct fio_zone_info *z);
 
-int zbd_setup_files(struct thread_data *td)
+int zbd_init_files(struct thread_data *td)
 {
        struct fio_file *f;
        int i;
@@ -657,6 +656,44 @@ int zbd_setup_files(struct thread_data *td)
                if (zbd_init_zone_info(td, f))
                        return 1;
        }
+       return 0;
+}
+
+void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
+{
+       struct fio_file *f;
+       int i;
+
+       for_each_file(td, f, i) {
+               struct zoned_block_device_info *zbd = f->zbd_info;
+               // zonemode=strided doesn't get per-file zone size.
+               uint64_t zone_size = zbd ? zbd->zone_size : td->o.zone_size;
+
+               if (zone_size == 0)
+                       continue;
+
+               if (td->o.size_nz > 0) {
+                       td->o.size = td->o.size_nz * zone_size;
+               }
+               if (td->o.io_size_nz > 0) {
+                       td->o.io_size = td->o.io_size_nz * zone_size;
+               }
+               if (td->o.start_offset_nz > 0) {
+                       td->o.start_offset = td->o.start_offset_nz * zone_size;
+               }
+               if (td->o.offset_increment_nz > 0) {
+                       td->o.offset_increment = td->o.offset_increment_nz * zone_size;
+               }
+               if (td->o.zone_skip_nz > 0) {
+                       td->o.zone_skip = td->o.zone_skip_nz * zone_size;
+               }
+       }
+}
+
+int zbd_setup_files(struct thread_data *td)
+{
+       struct fio_file *f;
+       int i;
 
        if (!zbd_using_direct_io()) {
                log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
@@ -805,16 +842,13 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
  * @f: fio file for which to reset zones
  * @zb: first zone to reset.
  * @ze: first zone not to reset.
- * @all_zones: whether to reset all zones or only those zones for which the
- *     write pointer is not a multiple of td->o.min_bs[DDIR_WRITE].
  */
 static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
                           struct fio_zone_info *const zb,
-                          struct fio_zone_info *const ze, bool all_zones)
+                          struct fio_zone_info *const ze)
 {
        struct fio_zone_info *z;
        const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
-       bool reset_wp;
        int res = 0;
 
        assert(min_bs);
@@ -827,16 +861,10 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
                if (!z->has_wp)
                        continue;
                zone_lock(td, f, z);
-               if (all_zones) {
-                       pthread_mutex_lock(&f->zbd_info->mutex);
-                       zbd_close_zone(td, f, nz);
-                       pthread_mutex_unlock(&f->zbd_info->mutex);
-
-                       reset_wp = z->wp != z->start;
-               } else {
-                       reset_wp = z->wp % min_bs != 0;
-               }
-               if (reset_wp) {
+               pthread_mutex_lock(&f->zbd_info->mutex);
+               zbd_close_zone(td, f, nz);
+               pthread_mutex_unlock(&f->zbd_info->mutex);
+               if (z->wp != z->start) {
                        dprint(FD_ZBD, "%s: resetting zone %u\n",
                               f->file_name, zbd_zone_nr(f, z));
                        if (zbd_reset_zone(td, f, z) < 0)
@@ -959,8 +987,8 @@ void zbd_file_reset(struct thread_data *td, struct fio_file *f)
         * writing any data to avoid that a zone reset has to be issued while
         * writing data, which causes data loss.
         */
-       zbd_reset_zones(td, f, zb, ze, td->o.verify != VERIFY_NONE &&
-                       td->runstate != TD_VERIFYING);
+       if (td->o.verify != VERIFY_NONE && td->runstate != TD_VERIFYING)
+               zbd_reset_zones(td, f, zb, ze);
        zbd_reset_write_cnt(td, f);
 }
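A worked example of the new two-phase setup: zbd_init_files() reads the per-file
zone information first, because the 'z' units cannot be resolved until the zone
size is known; zbd_recalc_options_with_zone_granularity() then multiplies each
*_nz count by that size. With a 256 MiB zone size, a job that parsed offset=1z and
size=2z carries start_offset_nz == 1 and size_nz == 2, so the recalculation sets
start_offset to 256 MiB and size to 512 MiB before zbd_setup_files() runs its
remaining checks on the final byte values.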
 
diff --git a/zbd.h b/zbd.h
index cc3ab6241e9b35782c0580d8b397edf1f2bba9b4..6453439313f8de4d5c049c371237762f05d8b7d6 100644 (file)
--- a/zbd.h
+++ b/zbd.h
@@ -87,6 +87,8 @@ struct zoned_block_device_info {
        struct fio_zone_info    zone_info[0];
 };
 
+int zbd_init_files(struct thread_data *td);
+void zbd_recalc_options_with_zone_granularity(struct thread_data *td);
 int zbd_setup_files(struct thread_data *td);
 void zbd_free_zone_info(struct fio_file *f);
 void zbd_file_reset(struct thread_data *td, struct fio_file *f);