Merge branch 'master' of https://github.com/dyniusz/fio

author Jens Axboe <axboe@kernel.dk>

Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)

committer Jens Axboe <axboe@kernel.dk>

Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)
author Jens Axboe <axboe@kernel.dk>
Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)
committer Jens Axboe <axboe@kernel.dk>
Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)
diff --git a/.travis.yml b/.travis.yml

index ca50e22644c7e6d323bde2a1996ef66686b3cc16..94f69fb59763d5ceb9ac4824f9771e4d6e36cb14 100644 (file)
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,24 +5,47 @@ compiler:
    - clang
    - gcc
  env:
+  matrix:
+    - BUILD_ARCH="x86"
+    - BUILD_ARCH="x86_64"
    global:
      - MAKEFLAGS="-j 2"
  matrix:
    include:
      - os: osx
        compiler: clang # Workaround travis setting CC=["clang", "gcc"]
+      env: BUILD_ARCH="x86_64"
      # Build using the 10.12 SDK but target and run on OSX 10.11
  #   - os: osx
  #     compiler: clang
  #     osx_image: xcode8
  #     env: SDKROOT=/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk MACOSX_DEPLOYMENT_TARGET=10.11
-    # Build on the latest OSX version (will eventually become obsolete)
+    # Build on the latest OSX version (will eventually become obsolete)
      - os: osx
        compiler: clang
-      osx_image: xcode8.2
+      osx_image: xcode8.3
+      env: BUILD_ARCH="x86_64"
    exclude:
      - os: osx
        compiler: gcc
+  exclude:
+    - os: linux
+      compiler: clang
+      env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter
  before_install:
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get -qq update; fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo apt-get install -qq -y libaio-dev libnuma-dev libz-dev; fi
+  - EXTRA_CFLAGS="-Werror"
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+        pkgs=(libaio-dev libnuma-dev libz-dev librbd-dev libibverbs-dev librdmacm-dev);
+        if [[ "$BUILD_ARCH" == "x86" ]]; then
+            pkgs=("${pkgs[@]/%/:i386}");
+            pkgs+=(gcc-multilib);
+            EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32";
+        else
+            pkgs+=(glusterfs-common);
+        fi;
+        sudo apt-get -qq update;
+        sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}";
+    fi
+script:
+  - ./configure --extra-cflags="${EXTRA_CFLAGS}" && make
+  - make test
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN

index 4cc903fd6326e0df4bd002ec38a79437cf2a4cc3..8c075cbd634e9d6205a61548179066743f9a59da 100755 (executable)
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
  #!/bin/sh
  
  GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.19
+DEF_VER=fio-3.1
  
  LF='
  '
diff --git a/HOWTO b/HOWTO

index d9e881abdcc3aa2495cc18957d3b4c681d943f8d..8fad2ce6f4d889e8de2013391ee1dad00787681d 100644 (file)
--- a/HOWTO
+++ b/HOWTO
@@ -54,56 +54,63 @@ Command line options
  
  .. option:: --debug=type
  
-    Enable verbose tracing of various fio actions.  May be ``all`` for all types
-    or individual types separated by a comma (e.g. ``--debug=file,mem`` will
-    enable file and memory debugging).  Currently, additional logging is
-    available for:
+       Enable verbose tracing `type` of various fio actions.  May be ``all`` for all types
+       or individual types separated by a comma (e.g. ``--debug=file,mem`` will
+       enable file and memory debugging).  Currently, additional logging is
+       available for:
  
-    *process*
+       *process*
                         Dump info related to processes.
-    *file*
+       *file*
                         Dump info related to file actions.
-    *io*
+       *io*
                         Dump info related to I/O queuing.
-    *mem*
+       *mem*
                         Dump info related to memory allocations.
-    *blktrace*
+       *blktrace*
                         Dump info related to blktrace setup.
-    *verify*
+       *verify*
                         Dump info related to I/O verification.
-    *all*
+       *all*
                         Enable all debug options.
-    *random*
+       *random*
                         Dump info related to random offset generation.
-    *parse*
+       *parse*
                         Dump info related to option matching and parsing.
-    *diskutil*
+       *diskutil*
                         Dump info related to disk utilization updates.
-    *job:x*
+       *job:x*
                         Dump info only related to job number x.
-    *mutex*
+       *mutex*
                         Dump info only related to mutex up/down ops.
-    *profile*
+       *profile*
                         Dump info related to profile extensions.
-    *time*
+       *time*
                         Dump info related to internal time keeping.
-    *net*
+       *net*
                         Dump info related to networking connections.
-    *rate*
+       *rate*
                         Dump info related to I/O rate switching.
-    *compress*
+       *compress*
                         Dump info related to log compress/decompress.
-    *?* or *help*
+       *?* or *help*
                         Show available debug options.
  
  .. option:: --parse-only
  
-    Parse options only, don\'t start any I/O.
+       Parse options only, don't start any I/O.
  
  .. option:: --output=filename
  
         Write output to file `filename`.
  
+.. option:: --output-format=format
+
+       Set the reporting `format` to `normal`, `terse`, `json`, or `json+`.  Multiple
+       formats can be selected, separated by a comma.  `terse` is a CSV based
+       format.  `json+` is like `json`, except it adds a full dump of the latency
+       buckets.
+
  .. option:: --bandwidth-log
  
         Generate aggregate bandwidth logs.
@@ -114,38 +121,31 @@ Command line options
  
  .. option:: --append-terse
  
-    Print statistics in selected mode AND terse, semicolon-delimited format.
-    **deprecated**, use :option:`--output-format` instead to select multiple
-    formats.
+       Print statistics in selected mode AND terse, semicolon-delimited format.
+       **Deprecated**, use :option:`--output-format` instead to select multiple
+       formats.
  
-.. option:: --output-format=type
-
-       Set the reporting format to `normal`, `terse`, `json`, or `json+`.  Multiple
-       formats can be selected, separate by a comma.  `terse` is a CSV based
-       format.  `json+` is like `json`, except it adds a full dump of the latency
-       buckets.
+.. option:: --terse-version=version
  
-.. option:: --terse-version=type
-
-       Set terse version output format (default 3, or 2 or 4).
+       Set terse `version` output format (default 3, or 2 or 4 or 5).
  
  .. option:: --version
  
-       Print version info and exit.
+       Print version information and exit.
  
  .. option:: --help
  
-       Print this page.
+       Print a summary of the command line options and exit.
  
  .. option:: --cpuclock-test
  
         Perform test and validation of internal CPU clock.
  
-.. option:: --crctest=test
+.. option:: --crctest=[test]
  
-    Test the speed of the builtin checksumming functions. If no argument is
-    given, all of them are tested. Or a comma separated list can be passed, in
-    which case the given ones are tested.
+       Test the speed of the built-in checksumming functions. If no argument is
+       given, all of them are tested. Alternatively, a comma separated list can
+       be passed, in which case the given ones are tested.
  
  .. option:: --cmdhelp=command
  
@@ -153,114 +153,125 @@ Command line options
  
  .. option:: --enghelp=[ioengine[,command]]
  
-    List all commands defined by :option:`ioengine`, or print help for `command`
-    defined by :option:`ioengine`.  If no :option:`ioengine` is given, list all
-    available ioengines.
+       List all commands defined by `ioengine`, or print help for `command`
+       defined by `ioengine`.  If no `ioengine` is given, list all
+       available ioengines.
  
  .. option:: --showcmd=jobfile
  
-       Turn a job file into command line options.
+       Convert `jobfile` to a set of command-line options.
  
  .. option:: --readonly
  
-    Turn on safety read-only checks, preventing writes.  The ``--readonly``
-    option is an extra safety guard to prevent users from accidentally starting
-    a write workload when that is not desired.  Fio will only write if
-    `rw=write/randwrite/rw/randrw` is given.  This extra safety net can be used
-    as an extra precaution as ``--readonly`` will also enable a write check in
-    the I/O engine core to prevent writes due to unknown user space bug(s).
+       Turn on safety read-only checks, preventing writes.  The ``--readonly``
+       option is an extra safety guard to prevent users from accidentally starting
+       a write workload when that is not desired.  Fio will only write if
+       `rw=write/randwrite/rw/randrw` is given.  This extra safety net can be used
+       as an extra precaution as ``--readonly`` will also enable a write check in
+       the I/O engine core to prevent writes due to unknown user space bug(s).
  
  .. option:: --eta=when
  
-       When real-time ETA estimate should be printed.  May be `always`, `never` or
-       `auto`.
+       Specifies when real-time ETA estimate should be printed.  `when` may be
+       `always`, `never` or `auto`.
  
  .. option:: --eta-newline=time
  
-       Force a new line for every `time` period passed.
+       Force a new line for every `time` period passed.  When the unit is omitted,
+       the value is interpreted in seconds.
  
  .. option:: --status-interval=time
  
-       Force full status dump every `time` period passed.
+       Force a full status dump of cumulative (from job start) values at `time`
+       intervals. This option does *not* provide per-period measurements. So
+       values such as bandwidth are running averages. When the time unit is omitted,
+       `time` is interpreted in seconds.
  
  .. option:: --section=name
  
-    Only run specified section in job file.  Multiple sections can be specified.
-    The ``--section`` option allows one to combine related jobs into one file.
-    E.g. one job file could define light, moderate, and heavy sections. Tell
-    fio to run only the "heavy" section by giving ``--section=heavy``
-    command line option.  One can also specify the "write" operations in one
-    section and "verify" operation in another section.  The ``--section`` option
-    only applies to job sections.  The reserved *global* section is always
-    parsed and used.
+       Only run specified section `name` in job file.  Multiple sections can be specified.
+       The ``--section`` option allows one to combine related jobs into one file.
+       E.g. one job file could define light, moderate, and heavy sections. Tell
+       fio to run only the "heavy" section by giving ``--section=heavy``
+       command line option.  One can also specify the "write" operations in one
+       section and "verify" operation in another section.  The ``--section`` option
+       only applies to job sections.  The reserved *global* section is always
+       parsed and used.
  
  .. option:: --alloc-size=kb
  
-    Set the internal smalloc pool to this size in kb (def 1024).  The
-    ``--alloc-size`` switch allows one to use a larger pool size for smalloc.
-    If running large jobs with randommap enabled, fio can run out of memory.
-    Smalloc is an internal allocator for shared structures from a fixed size
-    memory pool. The pool size defaults to 16M and can grow to 8 pools.
+       Set the internal smalloc pool size to `kb` in KiB.  The
+       ``--alloc-size`` switch allows one to use a larger pool size for smalloc.
+       If running large jobs with randommap enabled, fio can run out of memory.
+       Smalloc is an internal allocator for shared structures from a fixed size
+       memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
  
-    NOTE: While running :file:`.fio_smalloc.*` backing store files are visible
-    in :file:`/tmp`.
+       NOTE: While running :file:`.fio_smalloc.*` backing store files are visible
+       in :file:`/tmp`.
  
  .. option:: --warnings-fatal
  
-    All fio parser warnings are fatal, causing fio to exit with an
-    error.
+       All fio parser warnings are fatal, causing fio to exit with an
+       error.
  
  .. option:: --max-jobs=nr
  
-       Maximum number of threads/processes to support.
+       Set the maximum number of threads/processes to support to `nr`.
  
  .. option:: --server=args
  
-    Start a backend server, with `args` specifying what to listen to.
-    See `Client/Server`_ section.
+       Start a backend server, with `args` specifying what to listen to.
+       See `Client/Server`_ section.
  
  .. option:: --daemonize=pidfile
  
-    Background a fio server, writing the pid to the given `pidfile` file.
+       Background a fio server, writing the pid to the given `pidfile` file.
  
  .. option:: --client=hostname
  
-    Instead of running the jobs locally, send and run them on the given host or
-    set of hosts.  See `Client/Server`_ section.
+       Instead of running the jobs locally, send and run them on the given `hostname`
+       or set of `hostname`\s.  See `Client/Server`_ section.
  
  .. option:: --remote-config=file
  
-       Tell fio server to load this local file.
+       Tell fio server to load this local `file`.
  
  .. option:: --idle-prof=option
  
-       Report cpu idleness on a system or percpu basis
-       ``--idle-prof=system,percpu`` or
-       run unit work calibration only ``--idle-prof=calibrate``.
+       Report CPU idleness. `option` is one of the following:
+
+               **calibrate**
+                       Run unit work calibration only and exit.
+
+               **system**
+                       Show aggregate system idleness and unit work.
+
+               **percpu**
+                       As **system** but also show per CPU idleness.
  
  .. option:: --inflate-log=log
  
-       Inflate and output compressed log.
+       Inflate and output compressed `log`.
  
  .. option:: --trigger-file=file
  
-       Execute trigger cmd when file exists.
+       Execute trigger command when `file` exists.
  
-.. option:: --trigger-timeout=t
+.. option:: --trigger-timeout=time
  
-       Execute trigger at this time.
+       Execute trigger at this `time`.
  
-.. option:: --trigger=cmd
+.. option:: --trigger=command
  
-       Set this command as local trigger.
+       Set this `command` as local trigger.
  
-.. option:: --trigger-remote=cmd
+.. option:: --trigger-remote=command
  
-       Set this command as remote trigger.
+       Set this `command` as remote trigger.
  
  .. option:: --aux-path=path
  
-       Use this path for fio state generated files.
+       Use this `path` for fio state generated files.
  
  Any parameters following the options will be assumed to be job files, unless
  they match a job file parameter. Multiple job files can be listed and each job
@@ -284,8 +295,8 @@ override a *global* section parameter, and a job file may even have several
  *global* sections if so desired. A job is only affected by a *global* section
  residing above it.
  
-The :option:`--cmdhelp` option also lists all options. If used with an `option`
-argument, :option:`--cmdhelp` will detail the given `option`.
+The :option:`--cmdhelp` option also lists all options. If used with a `command`
+argument, :option:`--cmdhelp` will detail the given `command`.
  
  See the `examples/` directory for inspiration on how to write job files.  Note
  the copyright and license requirements currently apply to `examples/` files.
@@ -436,7 +447,7 @@ automatically substituted with the current system values when the job is
  run. Simple math is also supported on these keywords, so you can perform actions
  like::
  
-        size=8*$mb_memory
+       size=8*$mb_memory
  
  and get that properly expanded to 8 times the size of memory in the machine.
  
@@ -465,13 +476,13 @@ Parameter types
  ~~~~~~~~~~~~~~~
  
  **str**
-    String. This is a sequence of alpha characters.
+       String: A sequence of alphanumeric characters.
  
  **time**
-       Integer with possible time suffix. In seconds unless otherwise
-       specified, use e.g. 10m for 10 minutes. Accepts s/m/h for seconds, minutes,
-       and hours, and accepts 'ms' (or 'msec') for milliseconds, and 'us' (or
-       'usec') for microseconds.
+       Integer with possible time suffix.  Without a unit, the value is interpreted as
+       seconds unless otherwise specified.  Accepts a suffix of 'd' for days, 'h' for
+       hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and
+       'us' (or 'usec') for microseconds.  For example, use 10m for 10 minutes.
  
  .. _int:
  
@@ -479,44 +490,45 @@ Parameter types
         Integer. A whole number value, which may contain an integer prefix
         and an integer suffix:
  
-        [*integer prefix*] **number** [*integer suffix*]
+       [*integer prefix*] **number** [*integer suffix*]
  
         The optional *integer prefix* specifies the number's base. The default
         is decimal. *0x* specifies hexadecimal.
  
         The optional *integer suffix* specifies the number's units, and includes an
         optional unit prefix and an optional unit.  For quantities of data, the
-       default unit is bytes. For quantities of time, the default unit is seconds.
+       default unit is bytes. For quantities of time, the default unit is seconds
+       unless otherwise specified.
  
-       With :option:`kb_base` =1000, fio follows international standards for unit
+       With :option:`kb_base`\=1000, fio follows international standards for unit
         prefixes.  To specify power-of-10 decimal values defined in the
         International System of Units (SI):
  
-               * *Ki* -- means kilo (K) or 1000
-               * *Mi* -- means mega (M) or 1000**2
-               * *Gi* -- means giga (G) or 1000**3
-               * *Ti* -- means tera (T) or 1000**4
-               * *Pi* -- means peta (P) or 1000**5
+               * *K* -- means kilo (K) or 1000
+               * *M* -- means mega (M) or 1000**2
+               * *G* -- means giga (G) or 1000**3
+               * *T* -- means tera (T) or 1000**4
+               * *P* -- means peta (P) or 1000**5
  
         To specify power-of-2 binary values defined in IEC 80000-13:
  
-               * *k* -- means kibi (Ki) or 1024
-               * *M* -- means mebi (Mi) or 1024**2
-               * *G* -- means gibi (Gi) or 1024**3
-               * *T* -- means tebi (Ti) or 1024**4
-               * *P* -- means pebi (Pi) or 1024**5
+               * *Ki* -- means kibi (Ki) or 1024
+               * *Mi* -- means mebi (Mi) or 1024**2
+               * *Gi* -- means gibi (Gi) or 1024**3
+               * *Ti* -- means tebi (Ti) or 1024**4
+               * *Pi* -- means pebi (Pi) or 1024**5
  
-       With :option:`kb_base` =1024 (the default), the unit prefixes are opposite
+       With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
         from those specified in the SI and IEC 80000-13 standards to provide
         compatibility with old scripts.  For example, 4k means 4096.
  
         For quantities of data, an optional unit of 'B' may be included
-       (e.g.,  'kB' is the same as 'k').
+       (e.g., 'kB' is the same as 'k').
  
         The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
         not milli). 'b' and 'B' both mean byte, not bit.
  
-       Examples with :option:`kb_base` =1000:
+       Examples with :option:`kb_base`\=1000:
  
                 * *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
                 * *1 MiB*: 1048576, 1mi, 1024ki
@@ -524,7 +536,7 @@ Parameter types
                 * *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
                 * *1 TB*: 1000000000000, 1t, 1000g, 1000000m
  
-       Examples with :option:`kb_base` =1024 (default):
+       Examples with :option:`kb_base`\=1024 (default):
  
                 * *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
                 * *1 MiB*: 1048576, 1m, 1024k
@@ -536,15 +548,15 @@ Parameter types
  
                 * *D* -- means days
                 * *H* -- means hours
-               * *M* -- mean minutes
+               * *M* -- means minutes
                 * *s* -- or sec means seconds (default)
                 * *ms* -- or *msec* means milliseconds
                 * *us* -- or *usec* means microseconds
  
         If the option accepts an upper and lower range, use a colon ':' or
         minus '-' to separate such values. See :ref:`irange <irange>`.
-       If the lower value specified happens to be larger than the upper value,
-       two values are swapped.
+       If the lower value specified happens to be larger than the upper value,
+       the two values are swapped.
  
  .. _bool:
  
@@ -563,6 +575,8 @@ Parameter types
  **float_list**
         A list of floating point numbers, separated by a ':' character.
  
+With the above in mind, here follows the complete list of fio job parameters.
+
  
  Units
  ~~~~~
@@ -609,9 +623,6 @@ Units
                 Bit based.
  
  
-With the above in mind, here follows the complete list of fio job parameters.
-
-
  Job description
  ~~~~~~~~~~~~~~~
  
@@ -638,7 +649,7 @@ Job description
         larger number of threads/processes doing the same thing. Each thread is
         reported separately; to see statistics for all clones as a whole, use
         :option:`group_reporting` in conjunction with :option:`new_group`.
-       See :option:`--max-jobs`.
+       See :option:`--max-jobs`.  Default: 1.
  
  
  Time related parameters
@@ -649,7 +660,7 @@ Time related parameters
         Tell fio to terminate processing after the specified period of time.  It
         can be quite hard to determine for how long a specified job will run, so
         this parameter is handy to cap the total runtime to a given time.  When
-       the unit is omitted, the value is given in seconds.
+       the unit is omitted, the value is interpreted in seconds.
  
  .. option:: time_based
  
@@ -659,10 +670,9 @@ Time related parameters
  
  .. option:: startdelay=irange(time)
  
-       Delay start of job for the specified number of seconds. Supports all time
-       suffixes to allow specification of hours, minutes, seconds and milliseconds
-       -- seconds are the default if a unit is omitted.  Can be given as a range
-       which causes each thread to choose randomly out of the range.
+       Delay the start of job for the specified amount of time.  Can be a single
+       value or a range.  When given as a range, each thread will choose a value
+       randomly from within the range.  Value is in seconds if a unit is omitted.
  
  .. option:: ramp_time=time
  
@@ -723,36 +733,41 @@ Target file/device
         Prefix filenames with this directory. Used to place files in a different
         location than :file:`./`.  You can specify a number of directories by
         separating the names with a ':' character. These directories will be
-       assigned equally distributed to job clones creates with :option:`numjobs` as
+       assigned equally distributed to job clones created by :option:`numjobs` as
         long as they are using generated filenames. If specific `filename(s)` are
         set fio will use the first listed directory, and thereby matching the
         `filename` semantic which generates a file for each clone if not specified, but
         lets all clones use the same file if set.
  
-       See the :option:`filename` option for escaping certain characters.
+       See the :option:`filename` option for information on how to escape "``:``" and
+       "``\``" characters within the directory path itself.
  
  .. option:: filename=str
  
         Fio normally makes up a `filename` based on the job name, thread number, and
-       file number. If you want to share files between threads in a job or several
+       file number (see :option:`filename_format`). If you want to share files
+       between threads in a job or several
         jobs with fixed file paths, specify a `filename` for each of them to override
         the default. If the ioengine is file based, you can specify a number of files
         by separating the names with a ':' colon. So if you wanted a job to open
         :file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
         ``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
         specified, :option:`nrfiles` is ignored. The size of regular files specified
-       by this option will be :option:`size` divided by number of files unless
+       by this option will be :option:`size` divided by number of files unless an
         explicit size is specified by :option:`filesize`.
  
+       Each colon and backslash in the wanted path must be escaped with a ``\``
+       character.  For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you
+       would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is
+       :file:`F:\\filename` then you would use ``filename=F\:\\filename``.
+
         On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
         the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
         Note: Windows and FreeBSD prevent write access to areas
-       of the disk containing in-use data (e.g. filesystems).  If the wanted
-       `filename` does need to include a colon, then escape that with a ``\``
-       character. For instance, if the `filename` is :file:`/dev/dsk/foo@3,0:c`,
-       then you would use ``filename="/dev/dsk/foo@3,0\:c"``.  The
-       :file:`-` is a reserved name, meaning stdin or stdout.  Which of the two
-       depends on the read/write direction set.
+       of the disk containing in-use data (e.g. filesystems).
+
+       The filename "`-`" is a reserved name, meaning *stdin* or *stdout*.  Which
+       of the two depends on the read/write direction set.
  
  .. option:: filename_format=str
  
@@ -831,7 +846,7 @@ Target file/device
  
                 **sequential**
                         Finish one file before moving on to the next. Multiple files can
-                       still be open depending on 'openfiles'.
+                       still be open depending on :option:`openfiles`.
  
                 **zipf**
                         Use a *Zipf* distribution to decide what file to access.
@@ -839,10 +854,13 @@ Target file/device
                 **pareto**
                         Use a *Pareto* distribution to decide what file to access.
  
-               **gauss**
+               **normal**
                         Use a *Gaussian* (normal) distribution to decide what file to
                         access.
  
+               **gauss**
+                       Alias for normal.
+
         For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
         tell fio how many I/Os to issue before switching to a new file. For example,
         specifying ``file_service_type=random:8`` would cause fio to issue
@@ -860,27 +878,28 @@ Target file/device
  
         If true, serialize the file creation for the jobs.  This may be handy to
         avoid interleaving of data files, which may greatly depend on the filesystem
-       used and even the number of processors in the system.
+       used and even the number of processors in the system.  Default: true.
  
  .. option:: create_fsync=bool
  
-       fsync the data file after creation. This is the default.
+       :manpage:`fsync(2)` the data file after creation. This is the default.
  
  .. option:: create_on_open=bool
  
-       Don't pre-setup the files for I/O, just create open() when it's time to do
-       I/O to that file.
+       If true, don't pre-create files but allow the job's open() to create a file
+       when it's time to do I/O.  Default: false -- pre-create all necessary files
+       when the job starts.
  
  .. option:: create_only=bool
  
         If true, fio will only run the setup phase of the job.  If files need to be
-       laid out or updated on disk, only that will be done. The actual job contents
-       are not executed.
+       laid out or updated on disk, only that will be done -- the actual job contents
+       are not executed.  Default: false.
  
  .. option:: allow_file_create=bool
  
-       If true, fio is permitted to create files as part of its workload. This is
-       the default behavior. If this option is false, then fio will error out if
+       If true, fio is permitted to create files as part of its workload.  If this
+       option is false, then fio will error out if
         the files it needs to use don't already exist. Default: true.
  
  .. option:: allow_mounted_write=bool
@@ -897,16 +916,18 @@ Target file/device
         given I/O operation. This will also clear the :option:`invalidate` flag,
         since it is pointless to pre-read and then drop the cache. This will only
         work for I/O engines that are seek-able, since they allow you to read the
-       same data multiple times. Thus it will not work on e.g. network or splice I/O.
+       same data multiple times. Thus it will not work on non-seekable I/O engines
+       (e.g. network, splice). Default: false.
  
  .. option:: unlink=bool
  
         Unlink the job files when done. Not the default, as repeated runs of that
-       job would then waste time recreating the file set again and again.
+       job would then waste time recreating the file set again and again. Default:
+       false.
  
  .. option:: unlink_each_loop=bool
  
-       Unlink job files after each iteration or loop.
+       Unlink job files after each iteration or loop.  Default: false.
  
  .. option:: zonesize=int
  
@@ -928,7 +949,7 @@ I/O type
  .. option:: direct=bool
  
         If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
-       ZFS on Solaris doesn't support direct I/O.  On Windows the synchronous
+       OpenBSD and ZFS on Solaris don't support direct I/O.  On Windows the synchronous
         ioengines don't support direct I/O.  Default: false.
  
  .. option:: atomic=bool
@@ -952,10 +973,10 @@ I/O type
                                 Sequential writes.
                 **trim**
                                 Sequential trims (Linux block devices only).
-               **randwrite**
-                               Random writes.
                 **randread**
                                 Random reads.
+               **randwrite**
+                               Random writes.
                 **randtrim**
                                 Random trims (Linux block devices only).
                 **rw,readwrite**
@@ -968,15 +989,16 @@ I/O type
  
         Fio defaults to read if the option is not specified.  For the mixed I/O
         types, the default is to split them 50/50.  For certain types of I/O the
-       result may still be skewed a bit, since the speed may be different. It is
-       possible to specify a number of I/O's to do before getting a new offset,
-       this is done by appending a ``:<nr>`` to the end of the string given.  For a
+       result may still be skewed a bit, since the speed may be different.
+
+       It is possible to specify the number of I/Os to do before getting a new
+       offset by appending ``:<nr>`` to the end of the string given.  For a
         random read, it would look like ``rw=randread:8`` for passing in an offset
         modifier with a value of 8. If the suffix is used with a sequential I/O
-       pattern, then the value specified will be added to the generated offset for
-       each I/O.  For instance, using ``rw=write:4k`` will skip 4k for every
-       write. It turns sequential I/O into sequential I/O with holes.  See the
-       :option:`rw_sequencer` option.
+       pattern, then the *<nr>* value specified will be **added** to the generated
+       offset for each I/O turning sequential I/O into sequential I/O with holes.
+       For instance, using ``rw=write:4k`` will skip 4k for every write.  Also see
+       the :option:`rw_sequencer` option.
  
  .. option:: rw_sequencer=str
  
@@ -991,8 +1013,8 @@ I/O type
  
         ``sequential`` is only useful for random I/O, where fio would normally
         generate a new random offset for every I/O. If you append e.g. 8 to randread,
-       you would get a new random offset for every 8 I/O's. The result would be a
-       seek for only every 8 I/O's, instead of for every I/O. Use ``rw=randread:8``
+       you would get a new random offset for every 8 I/Os. The result would be a
+       seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8``
         to specify that. As sequential I/O is already sequential, setting
         ``sequential`` for that would not result in any differences.  ``identical``
         behaves in a similar fashion, except it sends the same offset 8 number of
@@ -1028,6 +1050,10 @@ I/O type
                 **none**
                         Do not pre-allocate space.
  
+               **native**
+                       Use a platform's native pre-allocation call but fall back to
+                       **none** behavior if it fails/is not implemented.
+
                 **posix**
                         Pre-allocate via :manpage:`posix_fallocate(3)`.
  
@@ -1042,8 +1068,9 @@ I/O type
                         Backward-compatible alias for **posix**.
  
         May not be available on all supported platforms. **keep** is only available
-       on Linux. If using ZFS on Solaris this must be set to **none** because ZFS
-       doesn't support it. Default: **posix**.
+       on Linux. If using ZFS on Solaris this cannot be set to **posix**
+       because ZFS doesn't support pre-allocation. Default: **native** if any
+       pre-allocation methods are available, **none** if not.
  
  .. option:: fadvise_hint=str
  
@@ -1064,18 +1091,39 @@ I/O type
                 **random**
                         Advise using **FADV_RANDOM**.
  
-.. option:: fadvise_stream=int
+.. option:: write_hint=str
+
+       Use :manpage:`fcntl(2)` to advise the kernel what life time to expect
+       from a write. Only supported on Linux, as of version 4.13. Accepted
+       values are:
+
+               **none**
+                       No particular life time associated with this file.
+
+               **short**
+                       Data written to this file has a short life time.
  
-       Use :manpage:`posix_fadvise(2)` to advise the kernel what stream ID the
-       writes issued belong to. Only supported on Linux. Note, this option may
-       change going forward.
+               **medium**
+                       Data written to this file has a medium life time.
+
+               **long**
+                       Data written to this file has a long life time.
+
+               **extreme**
+                       Data written to this file has a very long life time.
+
+       The values are all relative to each other, and no absolute meaning
+       should be associated with them.
  
  .. option:: offset=int
  
-       Start I/O at the given offset in the file. The data before the given offset
-       will not be touched. This effectively caps the file size at `real_size -
-       offset`. Can be combined with :option:`size` to constrain the start and
-       end range that I/O will be done within.
+       Start I/O at the provided offset in the file, given as either a fixed size in
+       bytes or a percentage. If a percentage is given, the next ``blockalign``-ed
+       offset will be used. Data before the given offset will not be touched. This
+       effectively caps the file size at `real_size - offset`. Can be combined with
+       :option:`size` to constrain the start and end range of the I/O workload.
+       A percentage can be specified by a number between 1 and 100 followed by '%',
+       for example, ``offset=20%`` to specify 20%.
  
  .. option:: offset_increment=int
  
@@ -1098,27 +1146,29 @@ I/O type
  
  .. option:: fsync=int
  
-       If writing to a file, issue a sync of the dirty data for every number of
-       blocks given. For example, if you give 32 as a parameter, fio will sync the
-       file for every 32 writes issued. If fio is using non-buffered I/O, we may
-       not sync the file. The exception is the sg I/O engine, which synchronizes
-       the disk cache anyway. Defaults to 0, which means no sync every certain
-       number of writes.
+       If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of
+       the dirty data for every number of blocks given. For example, if you give 32
+       as a parameter, fio will sync the file after every 32 writes issued. If fio is
+       using non-buffered I/O, we may not sync the file. The exception is the sg
+       I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
+       means fio does not periodically issue and wait for a sync to complete. Also
+       see :option:`end_fsync` and :option:`fsync_on_close`.
  
  .. option:: fdatasync=int
  
         Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
         not metadata blocks.  In Windows, FreeBSD, and DragonFlyBSD there is no
-       :manpage:`fdatasync(2)`, this falls back to using :manpage:`fsync(2)`.
-       Defaults to 0, which means no sync data every certain number of writes.
+       :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
+       Defaults to 0, which means fio does not periodically issue and wait for a
+       data-only sync to complete.
  
  .. option:: write_barrier=int
  
-   Make every `N-th` write a barrier write.
+       Make every `N-th` write a barrier write.
  
-.. option:: sync_file_range=str:val
+.. option:: sync_file_range=str:int
  
-       Use :manpage:`sync_file_range(2)` for every `val` number of write
+       Use :manpage:`sync_file_range(2)` for every `int` number of write
         operations. Fio will track range of writes that have happened since the last
         :manpage:`sync_file_range(2)` call. `str` can currently be one or more of:
  
@@ -1139,17 +1189,18 @@ I/O type
         If true, writes to a file will always overwrite existing data. If the file
         doesn't already exist, it will be created before the write phase begins. If
         the file exists and is large enough for the specified write phase, nothing
-       will be done.
+       will be done. Default: false.
  
  .. option:: end_fsync=bool
  
-       If true, fsync file contents when a write stage has completed.
+       If true, :manpage:`fsync(2)` file contents when a write stage has completed.
+       Default: false.
  
  .. option:: fsync_on_close=bool
  
         If true, fio will :manpage:`fsync(2)` a dirty file on close.  This differs
-       from end_fsync in that it will happen on every file close, not just at the
-       end of the job.
+       from :option:`end_fsync` in that it will happen on every file close, not
+       just at the end of the job.  Default: false.
  
  .. option:: rwmixread=int
  
@@ -1180,30 +1231,30 @@ I/O type
                 **pareto**
                                 Pareto distribution
  
-               **gauss**
+               **normal**
                                 Normal (Gaussian) distribution
  
                 **zoned**
                                 Zoned random distribution
  
         When using a **zipf** or **pareto** distribution, an input value is also
-       needed to define the access pattern. For **zipf**, this is the `zipf
+       needed to define the access pattern. For **zipf**, this is the `Zipf
         theta`. For **pareto**, it's the `Pareto power`. Fio includes a test
-       program, :command:`genzipf`, that can be used visualize what the given input
+       program, :command:`fio-genzipf`, that can be used to visualize what the given input
         values will yield in terms of hit rates.  If you wanted to use **zipf** with
         a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the
         option. If a non-uniform model is used, fio will disable use of the random
-       map. For the **gauss** distribution, a normal deviation is supplied as a
-       value between 0 and 100.
+       map. For the **normal** distribution, a normal (Gaussian) deviation is
+       supplied as a value between 0 and 100.
  
         For a **zoned** distribution, fio supports specifying percentages of I/O
         access that should fall within what range of the file or device. For
         example, given criteria of:
  
-       * 60% of accesses should be to the first 10%
-       * 30% of accesses should be to the next 20%
-       * 8% of accesses should be to to the next 30%
-       * 2% of accesses should be to the next 40%
+               * 60% of accesses should be to the first 10%
+               * 30% of accesses should be to the next 20%
+               * 8% of accesses should be to the next 30%
+               * 2% of accesses should be to the next 40%
  
         we can define that through zoning of the random accesses. For the above
         example, the user would do::
@@ -1243,21 +1294,20 @@ I/O type
  
  .. option:: random_generator=str
  
-       Fio supports the following engines for generating
-       I/O offsets for random I/O:
+       Fio supports the following engines for generating I/O offsets for random I/O:
  
                 **tausworthe**
-                       Strong 2^88 cycle random number generator
+                       Strong 2^88 cycle random number generator.
                 **lfsr**
-                       Linear feedback shift register generator
+                       Linear feedback shift register generator.
                 **tausworthe64**
-                       Strong 64-bit 2^258 cycle random number generator
+                       Strong 64-bit 2^258 cycle random number generator.
  
         **tausworthe** is a strong random number generator, but it requires tracking
         on the side if we want to ensure that blocks are only read or written
-       once. **LFSR** guarantees that we never generate the same offset twice, and
+       once. **lfsr** guarantees that we never generate the same offset twice, and
         it's also less computationally expensive. It's not a true random generator,
-       however, though for I/O purposes it's typically good enough. **LFSR** only
+       however, though for I/O purposes it's typically good enough. **lfsr** only
         works with single block sizes, not with workloads that use multiple block
         sizes. If used with such a workload, fio may read or write some blocks
         multiple times. The default value is **tausworthe**, unless the required
@@ -1290,7 +1340,7 @@ Block size
                         means default for reads, 8k for writes and trims.
  
                 **bs=,8k,**
-                       means default for reads, 8k for writes, and default for writes.
+                       means default for reads, 8k for writes, and default for trims.
  
  .. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]
  
@@ -1340,7 +1390,7 @@ Block size
         typically won't work with direct I/O, as that normally requires sector
         alignment.
  
-.. option:: bs_is_seq_rand
+.. option:: bs_is_seq_rand=bool
  
         If this option is set, fio will use the normal read,write blocksize settings
         as sequential,random blocksize settings instead. Any random read or write
@@ -1382,8 +1432,8 @@ Buffers and memory
  .. option:: buffer_compress_percentage=int
  
         If this is set, then fio will attempt to provide I/O buffer content (on
-       WRITEs) that compress to the specified level. Fio does this by providing a
-       mix of random data and a fixed pattern. The fixed pattern is either zeroes,
+       WRITEs) that compresses to the specified level. Fio does this by providing a
+       mix of random data and a fixed pattern. The fixed pattern is either zeros,
         or the pattern specified by :option:`buffer_pattern`. If the pattern option
         is used, it might skew the compression ratio slightly. Note that this is per
         block size unit, for file/disk wide compression level that matches this
@@ -1400,11 +1450,18 @@ Buffers and memory
  
  .. option:: buffer_pattern=str
  
-       If set, fio will fill the I/O buffers with this pattern. If not set, the
-       contents of I/O buffers is defined by the other options related to buffer
-       contents. The setting can be any pattern of bytes, and can be prefixed with
-       0x for hex values. It may also be a string, where the string must then be
-       wrapped with ``""``, e.g.::
+       If set, fio will fill the I/O buffers with this pattern or with the contents
+       of a file. If not set, the contents of I/O buffers are defined by the other
+       options related to buffer contents. The setting can be any pattern of bytes,
+       and can be prefixed with 0x for hex values. It may also be a string, where
+       the string must then be wrapped with ``""``. Or it may also be a filename,
+       where the filename must be wrapped with ``''`` in which case the file is
+       opened and read. Note that not all the file contents will be read if that
+       would cause the buffers to overflow. So, for example::
+
+               buffer_pattern='filename'
+
+       or::
  
                 buffer_pattern="abcd"
  
@@ -1418,7 +1475,7 @@ Buffers and memory
  
         Also you can combine everything together in any order::
  
-               buffer_pattern=0xdeadface"abcd"-12
+               buffer_pattern=0xdeadface"abcd"-12'filename'
  
  .. option:: dedupe_percentage=int
  
@@ -1430,8 +1487,8 @@ Buffers and memory
  
  .. option:: invalidate=bool
  
-       Invalidate the buffer/page cache parts for this file prior to starting
-       I/O if the platform and file type support it. Defaults to true.
+       Invalidate the buffer/page cache parts of the files to be used prior to
+       starting I/O if the platform and file type support it.  Defaults to true.
         This will be ignored if :option:`pre_read` is also specified for the
         same job.
  
@@ -1457,7 +1514,7 @@ Buffers and memory
                         Same as shm, but use huge pages as backing.
  
                 **mmap**
-                       Use mmap to allocate buffers. May either be anonymous memory, or can
+                       Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can
                         be file backed if a filename is given after the option. The format
                         is `mem=mmap:/path/to/file`.
  
@@ -1470,6 +1527,7 @@ Buffers and memory
  
                 **cudamalloc**
                         Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+                       The :option:`ioengine` must be `rdma`.
  
         The area allocated is a function of the maximum allowed bs size for the job,
         multiplied by the I/O depth given. Note that for **shmhuge** and
@@ -1488,7 +1546,7 @@ Buffers and memory
         should point there. So if it's mounted in :file:`/huge`, you would use
         `mem=mmaphuge:/huge/somefile`.
  
-.. option:: iomem_align=int
+.. option:: iomem_align=int, mem_align=int
  
         This indicates the memory alignment of the I/O memory buffers.  Note that
         the given alignment is applied to the first I/O unit buffer, if using
@@ -1543,7 +1601,7 @@ I/O size
         and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
         the 0..20GiB region.
  
-.. option:: filesize=int
+.. option:: filesize=irange(int)
  
         Individual file sizes. May be a range, in which case fio will select sizes
         for files at random within the given range and limited to :option:`size` in
@@ -1596,7 +1654,7 @@ I/O engine
  
                 **libaio**
                         Linux native asynchronous I/O. Note that Linux may only support
-                       queued behaviour with non-buffered I/O (set ``direct=1`` or
+                       queued behavior with non-buffered I/O (set ``direct=1`` or
                         ``buffered=0``).
                         This engine defines engine specific options.
  
@@ -1623,8 +1681,8 @@ I/O engine
                         SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
                         ioctl, or if the target is an sg character device we use
                         :manpage:`read(2)` and :manpage:`write(2)` for asynchronous
-                       I/O. Requires filename option to specify either block or character
-                       devices.
+                       I/O. Requires :option:`filename` option to specify either block or
+                       character devices.
  
                 **null**
                         Doesn't transfer any data, just pretends to.  This is mainly used to
@@ -1646,9 +1704,9 @@ I/O engine
                 **cpuio**
                         Doesn't transfer any data, but burns CPU cycles according to the
                         :option:`cpuload` and :option:`cpuchunks` options. Setting
-                       :option:`cpuload` =85 will cause that job to do nothing but burn 85%
-                       of the CPU. In case of SMP machines, use :option:`numjobs`
-                       =<no_of_cpu> to get desired CPU usage, as the cpuload only loads a
+                       :option:`cpuload`\=85 will cause that job to do nothing but burn 85%
+                       of the CPU. In case of SMP machines, use :option:`numjobs`\=<nr_of_cpu>
+                       to get desired CPU usage, as the cpuload only loads a
                         single CPU at the desired rate. A job never finishes unless there is
                         at least one non-cpuio job.
  
@@ -1681,7 +1739,7 @@ I/O engine
                 **ftruncate**
                         I/O engine that sends :manpage:`ftruncate(2)` operations in response
                         to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
-                       size to the current block offset. Block size is ignored.
+                       size to the current block offset. :option:`blocksize` is ignored.
  
                 **e4defrag**
                         I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
@@ -1693,26 +1751,26 @@ I/O engine
                         ioengine defines engine specific options.
  
                 **gfapi**
-                       Using Glusterfs libgfapi sync interface to direct access to
-                       Glusterfs volumes without having to go through FUSE.  This ioengine
+                       Uses the GlusterFS libgfapi sync interface to directly access
+                       GlusterFS volumes without having to go through FUSE.  This ioengine
                         defines engine specific options.
  
                 **gfapi_async**
-                       Using Glusterfs libgfapi async interface to direct access to
-                       Glusterfs volumes without having to go through FUSE. This ioengine
+                       Uses the GlusterFS libgfapi async interface to directly access
+                       GlusterFS volumes without having to go through FUSE. This ioengine
                         defines engine specific options.
  
                 **libhdfs**
-                       Read and write through Hadoop (HDFS).  The :file:`filename` option
+                       Read and write through Hadoop (HDFS).  The :option:`filename` option
                         is used to specify host,port of the hdfs name-node to connect.  This
                         engine interprets offsets a little differently.  In HDFS, files once
-                       created cannot be modified.  So random writes are not possible. To
-                       imitate this, libhdfs engine expects bunch of small files to be
-                       created over HDFS, and engine will randomly pick a file out of those
-                       files based on the offset generated by fio backend. (see the example
+                       created cannot be modified so random writes are not possible. To
+                       imitate this the libhdfs engine expects a bunch of small files to be
+                       created over HDFS and will randomly pick a file from them
+                       based on the offset generated by fio backend (see the example
                         job file to create such files, use ``rw=write`` option). Please
-                       note, you might want to set necessary environment variables to work
-                       with hdfs/libhdfs properly.  Each job uses its own connection to
+                       note, it may be necessary to set environment variables to work
+                       with HDFS/libhdfs properly.  Each job uses its own connection to
                         HDFS.
  
                 **mtd**
@@ -1720,7 +1778,7 @@ I/O engine
                         :file:`/dev/mtd0`). Discards are treated as erases. Depending on the
                         underlying device type, the I/O may have to go in a certain pattern,
                         e.g., on NAND, writing sequentially to erase blocks and discarding
-                       before overwriting. The writetrim mode works well for this
+                       before overwriting. The `trimwrite` mode works well for this
                         constraint.
  
                 **pmemblk**
@@ -1735,15 +1793,17 @@ I/O engine
                 **external**
                         Prefix to specify loading an external I/O engine object file. Append
                         the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
-                       ioengine :file:`foo.o` in :file:`/tmp`.
+                       ioengine :file:`foo.o` in :file:`/tmp`. The path can be either
+                       absolute or relative. See :file:`engines/skeleton_external.c` for
+                       details of writing an external I/O engine.
  
  
  I/O engine specific parameters
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
  In addition, there are some parameters which are only valid when a specific
-ioengine is in use. These are used identically to normal parameters, with the
-caveat that when used on the command line, they must come after the
+:option:`ioengine` is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
  :option:`ioengine` that defines them is selected.
  
  .. option:: userspace_reap : [libaio]
@@ -1759,6 +1819,11 @@ caveat that when used on the command line, they must come after the
         Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
         than normal.
  
+.. option:: hipri_percentage : [pvsync2]
+
+       When hipri is set this determines the probability of a pvsync2 I/O being high
+       priority. The default is 100%.
+
  .. option:: cpuload=int : [cpuio]
  
         Attempt to use the specified percentage of CPU cycles. This is a mandatory
@@ -1772,18 +1837,16 @@ caveat that when used on the command line, they must come after the
  
         Detect when I/O threads are done, then exit.
  
-.. option:: hostname=str : [netsplice] [net]
-
-       The host name or IP address to use for TCP or UDP based I/O.  If the job is
-       a TCP listener or UDP reader, the host name is not used and must be omitted
-       unless it is a valid UDP multicast address.
-
  .. option:: namenode=str : [libhdfs]
  
-       The host name or IP address of a HDFS cluster namenode to contact.
+       The hostname or IP address of an HDFS cluster namenode to contact.
  
  .. option:: port=int
  
+   [libhdfs]
+
+               The listening port of the HDFS cluster namenode.
+
     [netsplice], [net]
  
                 The TCP or UDP port to bind to or connect to. If this is used with
@@ -1791,9 +1854,11 @@ caveat that when used on the command line, they must come after the
                 this will be the starting port number since fio will use a range of
                 ports.
  
-   [libhdfs]
+.. option:: hostname=str : [netsplice] [net]
  
-               the listening port of the HFDS cluster namenode.
+       The hostname or IP address to use for TCP or UDP based I/O.  If the job is
+       a TCP listener or UDP reader, the hostname is not used and must be omitted
+       unless it is a valid UDP multicast address.
  
  .. option:: interface=str : [netsplice] [net]
  
@@ -1808,9 +1873,7 @@ caveat that when used on the command line, they must come after the
  
         Set TCP_NODELAY on TCP connections.
  
-.. option:: protocol=str : [netsplice] [net]
-
-.. option:: proto=str : [netsplice] [net]
+.. option:: protocol=str, proto=str : [netsplice] [net]
  
         The network protocol to use. Accepted values are:
  
@@ -1827,15 +1890,15 @@ caveat that when used on the command line, they must come after the
  
         When the protocol is TCP or UDP, the port must also be given, as well as the
         hostname if the job is a TCP listener or UDP reader. For unix sockets, the
-       normal filename option should be used and the port is invalid.
+       normal :option:`filename` option should be used and the port is invalid.
  
-.. option:: listen : [net]
+.. option:: listen : [netsplice] [net]
  
         For TCP network connections, tell fio to listen for incoming connections
         rather than initiating an outgoing connection. The :option:`hostname` must
         be omitted if this option is used.
  
-.. option:: pingpong : [net]
+.. option:: pingpong : [netsplice] [net]
  
         Normally a network writer will just continue writing data, and a network
         reader will just consume packets. If ``pingpong=1`` is set, a writer will
@@ -1847,17 +1910,17 @@ caveat that when used on the command line, they must come after the
         ``pingpong=1`` should only be set for a single reader when multiple readers
         are listening to the same address.
  
-.. option:: window_size : [net]
+.. option:: window_size : [netsplice] [net]
  
         Set the desired socket buffer size for the connection.
  
-.. option:: mss : [net]
+.. option:: mss : [netsplice] [net]
  
         Set the TCP maximum segment size (TCP_MAXSEG).
  
  .. option:: donorname=str : [e4defrag]
  
-       File will be used as a block donor(swap extents between files).
+       File will be used as a block donor (swap extents between files).
  
  .. option:: inplace=int : [e4defrag]
  
@@ -1866,7 +1929,7 @@ caveat that when used on the command line, they must come after the
         **0**
                 Default. Preallocate donor's file on init.
         **1**
-               Allocate space immediately inside defragment event,     and free right
+               Allocate space immediately inside defragment event, and free right
                 after event.
  
  .. option:: clustername=str : [rbd]
@@ -1898,7 +1961,7 @@ caveat that when used on the command line, they must come after the
  
  .. option:: chunk_size : [libhdfs]
  
-       the size of the chunk to use for each file.
+       The size of the chunk to use for each file.
  
  
  I/O depth
@@ -1911,7 +1974,7 @@ I/O depth
         for small degrees when :option:`verify_async` is in use).  Even async
         engines may impose OS restrictions causing the desired depth not to be
         achieved.  This may happen on Linux when using libaio and not setting
-       :option:`direct` =1, since buffered I/O is not async on that OS.  Keep an
+       :option:`direct`\=1, since buffered I/O is not async on that OS.  Keep an
         eye on the I/O depth distribution in the fio output to verify that the
         achieved depth is as expected. Default: 1.
  
@@ -1934,9 +1997,9 @@ I/O depth
  .. option:: iodepth_batch_complete_max=int
  
         This defines maximum pieces of I/O to retrieve at once. This variable should
-       be used along with :option:`iodepth_batch_complete_min` =int variable,
+       be used along with :option:`iodepth_batch_complete_min`\=int variable,
         specifying the range of min and max amount of I/O which should be
-       retrieved. By default it is equal to :option:`iodepth_batch_complete_min`
+       retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
         value.
  
         Example #1::
@@ -1965,6 +2028,21 @@ I/O depth
         16 requests, it will let the depth drain down to 4 before starting to fill
         it again.
  
+.. option:: serialize_overlap=bool
+
+       Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+       When two or more I/Os are submitted simultaneously, there is no guarantee that
+       the I/Os will be processed or completed in the submitted order. Further, if
+       two or more of those I/Os are writes, any overlapping region between them can
+       become indeterminate/undefined on certain storage. These issues can cause
+       verification to fail erratically when at least one of the racing I/Os is
+       changing data and the overlapping region has a non-zero size. Setting
+       ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
+       serializing in-flight I/Os that have a non-zero overlap. Note that setting
+       this option can reduce both performance and the :option:`iodepth` achieved.
+       Additionally this option does not work when :option:`io_submit_mode` is set to
+       offload. Default: false.
+
  .. option:: io_submit_mode=str
  
         This option controls how fio submits the I/O to the I/O engine. The default
@@ -1974,7 +2052,7 @@ I/O depth
         has a bit of extra overhead, especially for lower queue depth I/O where it
         can increase latencies. The benefit is that fio can manage submission rates
         independently of the device completion rates. This avoids skewed latency
-       reporting if I/O gets back up on the device side (the coordinated omission
+       reporting if I/O gets backed up on the device side (the coordinated omission
         problem).
  
  
@@ -1985,7 +2063,7 @@ I/O rate
  
         Stall the job for the specified period of time after an I/O has completed before issuing the
         next. May be used to simulate processing being done by an application.
-       When the unit is omitted, the value is given in microseconds.  See
+       When the unit is omitted, the value is interpreted in microseconds.  See
         :option:`thinktime_blocks` and :option:`thinktime_spin`.
  
  .. option:: thinktime_spin=time
@@ -1993,15 +2071,15 @@ I/O rate
         Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
         something with the data received, before falling back to sleeping for the
         rest of the period specified by :option:`thinktime`.  When the unit is
-       omitted, the value is given in microseconds.
+       omitted, the value is interpreted in microseconds.
  
  .. option:: thinktime_blocks=int
  
         Only valid if :option:`thinktime` is set - control how many blocks to issue,
-       before waiting `thinktime` usecs. If not set, defaults to 1 which will make
-       fio wait `thinktime` usecs after every block. This effectively makes any
+       before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make
+       fio wait :option:`thinktime` usecs after every block. This effectively makes any
         queue depth setting redundant, since no more than 1 I/O will be queued
-       before we have to complete it and do our thinktime. In other words, this
+       before we have to complete it and do our :option:`thinktime`. In other words, this
         setting effectively caps the queue depth if the latter is larger.
  
  .. option:: rate=int[,int][,int]
@@ -2010,6 +2088,11 @@ I/O rate
         suffix rules apply.  Comma-separated values may be specified for reads,
         writes, and trims as described in :option:`blocksize`.
  
+       For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to
+       500KiB/sec.  Capping only reads or writes can be done with `rate=,500k` or
+       `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the
+       latter will only limit reads.
+
  .. option:: rate_min=int[,int][,int]
  
         Tell fio to do whatever it can to maintain at least this bandwidth. Failing
@@ -2049,14 +2132,14 @@ I/O latency
  
         If set, fio will attempt to find the max performance point that the given
         workload will run at while maintaining a latency below this target.  When
-       the unit is omitted, the value is given in microseconds.  See
+       the unit is omitted, the value is interpreted in microseconds.  See
         :option:`latency_window` and :option:`latency_percentile`.
  
  .. option:: latency_window=time
  
         Used with :option:`latency_target` to specify the sample window that the job
         is run at varying queue depths to test the performance.  When the unit is
-       omitted, the value is given in microseconds.
+       omitted, the value is interpreted in microseconds.
  
  .. option:: latency_percentile=float
  
@@ -2068,13 +2151,13 @@ I/O latency
  .. option:: max_latency=time
  
         If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
-       maximum latency. When the unit is omitted, the value is given in
+       maximum latency. When the unit is omitted, the value is interpreted in
         microseconds.
  
  .. option:: rate_cycle=int
  
         Average bandwidth for :option:`rate` and :option:`rate_min` over this number
-       of milliseconds.
+       of milliseconds. Defaults to 1000.
  
  
  I/O replay
@@ -2088,7 +2171,7 @@ I/O replay
  
  .. option:: read_iolog=str
  
-       Open an iolog with the specified file name and replay the I/O patterns it
+       Open an iolog with the specified filename and replay the I/O patterns it
         contains. This can be used to store a workload and replay it sometime
         later. The iolog given may also be a blktrace binary file, which allows fio
         to replay a workload captured by :command:`blktrace`. See
@@ -2096,10 +2179,10 @@ I/O replay
         replay, the file needs to be turned into a blkparse binary data file first
         (``blkparse <device> -o /dev/null -d file_for_fio.bin``).
  
-.. option:: replay_no_stall=int
+.. option:: replay_no_stall=bool
  
         When replaying I/O with :option:`read_iolog` the default behavior is to
-       attempt to respect the time stamps within the log and replay them with the
+       attempt to respect the timestamps within the log and replay them with the
         appropriate delay between I/Os. By setting this variable fio will not
         respect the timestamps and attempt to replay them as fast as possible while
         still respecting ordering. The result is the same I/O pattern to a given
@@ -2112,9 +2195,9 @@ I/O replay
         from.  This is sometimes undesirable because on a different machine those
         major/minor numbers can map to a different device.  Changing hardware on the
         same system can also result in a different major/minor mapping.
-       ``replay_redirect`` causes all IOPS to be replayed onto the single specified
+       ``replay_redirect`` causes all I/Os to be replayed onto the single specified
         device regardless of the device it was recorded
-       from. i.e. :option:`replay_redirect` = :file:`/dev/sdc` would cause all I/O
+       from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O
         in the blktrace or iolog to be replayed onto :file:`/dev/sdc`.  This means
         multiple devices will be replayed onto a single device, if the trace
         contains multiple devices. If you want multiple devices to be replayed
@@ -2138,15 +2221,14 @@ Threads, processes and job synchronization
  
  .. option:: thread
  
-       Fio defaults to forking jobs, however if this option is given, fio will use
-       POSIX Threads function :manpage:`pthread_create(3)` to create threads instead
-       of forking processes.
+       Fio defaults to creating jobs by using fork, however if this option is
+       given, fio will create jobs by using POSIX Threads' function
+       :manpage:`pthread_create(3)` to create threads instead.
  
  .. option:: wait_for=str
  
-       Specifies the name of the already defined job to wait for. Single waitee
-       name only may be specified. If set, the job won't be started until all
-       workers of the waitee job are done.
+       If set, the current job won't be started until all workers of the specified
+       waitee job are done.
  
         ``wait_for`` operates on the job name basis, so there are a few
         limitations. First, the waitee must be defined prior to the waiter job
@@ -2174,8 +2256,8 @@ Threads, processes and job synchronization
  
  .. option:: cpumask=int
  
-       Set the CPU affinity of this job. The parameter given is a bitmask of
-       allowed CPU's the job may run on. So if you want the allowed CPUs to be 1
+       Set the CPU affinity of this job. The parameter given is a bit mask of
+       allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
         and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
         :manpage:`sched_setaffinity(2)`. This may not work on all supported
         operating systems or kernel versions. This option doesn't work well for a
@@ -2185,23 +2267,23 @@ Threads, processes and job synchronization
  
  .. option:: cpus_allowed=str
  
-       Controls the same options as :option:`cpumask`, but it allows a text setting
-       of the permitted CPUs instead. So to use CPUs 1 and 5, you would specify
-       ``cpus_allowed=1,5``. This options also allows a range of CPUs. Say you
-       wanted a binding to CPUs 1, 5, and 8-15, you would set
-       ``cpus_allowed=1,5,8-15``.
+       Controls the same options as :option:`cpumask`, but accepts a textual
+       specification of the permitted CPUs instead. So to use CPUs 1 and 5 you
+       would specify ``cpus_allowed=1,5``. This option also allows a range of CPUs
+       to be specified -- say you wanted a binding to CPUs 1, 5, and 8 to 15, you
+       would set ``cpus_allowed=1,5,8-15``.
  
  .. option:: cpus_allowed_policy=str
  
         Set the policy of how fio distributes the CPUs specified by
-       :option:`cpus_allowed` or cpumask. Two policies are supported:
+       :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported:
  
                 **shared**
                         All jobs will share the CPU set specified.
                 **split**
                         Each job will get a unique CPU from the CPU set.
  
-       **shared** is the default behaviour, if the option isn't specified. If
+       **shared** is the default behavior, if the option isn't specified. If
         **split** is specified, then fio will assign one cpu per job. If not
         enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
         in the set.
@@ -2210,7 +2292,7 @@ Threads, processes and job synchronization
  
         Set this job running on specified NUMA nodes' CPUs. The arguments allow
         a comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable
-       numa options support, fio must be built on a system with libnuma-dev(el)
+       NUMA options support, fio must be built on a system with libnuma-dev(el)
         installed.
  
  .. option:: numa_mem_policy=str
@@ -2220,11 +2302,11 @@ Threads, processes and job synchronization
  
                 <mode>[:<nodelist>]
  
-       ``mode`` is one of the following memory policy: ``default``, ``prefer``,
-       ``bind``, ``interleave``, ``local`` For ``default`` and ``local`` memory
-       policy, no node is needed to be specified.  For ``prefer``, only one node is
-       allowed.  For ``bind`` and ``interleave``, it allow comma delimited list of
-       numbers, A-B ranges, or `all`.
+       ``mode`` is one of the following memory policies: ``default``, ``prefer``,
+       ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory
+       policies, no node needs to be specified.  For ``prefer``, only one node is
+       allowed.  For ``bind`` and ``interleave`` the ``nodelist`` may be as
+       follows: a comma delimited list of numbers, A-B ranges, or `all`.
  
  .. option:: cgroup=str
  
@@ -2280,8 +2362,9 @@ Threads, processes and job synchronization
  
  .. option:: exitall
  
-       When one job finishes, terminate the rest. The default is to wait for each
-       job to finish, sometimes that is not the desired action.
+       By default, fio will continue running all other jobs when one job finishes
+       but sometimes this is not the desired action.  Setting ``exitall`` will
+       instead make fio terminate all other jobs when one job finishes.
  
  .. option:: exec_prerun=str
  
@@ -2339,13 +2422,14 @@ Verification
                         header of each block.
  
                 **crc32c**
-                       Use a crc32c sum of the data area and store it in the header of each
-                       block.
+                       Use a crc32c sum of the data area and store it in the header of
+                       each block. This will automatically use hardware acceleration
+                       (e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+                       fall back to software crc32c if none is found. Generally the
+                       fastest checksum fio supports when hardware accelerated.
  
                 **crc32c-intel**
-                       Use hardware assisted crc32c calculation provided on SSE4.2 enabled
-                       processors. Falls back to regular software crc32c, if not supported
-                       by the system.
+                       Synonym for crc32c.
  
                 **crc32**
                         Use a crc32 sum of the data area and store it in the header of each
@@ -2398,7 +2482,7 @@ Verification
  
                 **null**
                         Only pretend to verify. Useful for testing internals with
-                       :option:`ioengine` `=null`, not for much else.
+                       :option:`ioengine`\=null, not for much else.
  
         This option can be used for repeated burn-in tests of a system to make sure
         that the written data is also correctly read back. If the data direction
@@ -2416,7 +2500,7 @@ Verification
  
  .. option:: verifysort_nr=int
  
-   Pre-load and sort verify blocks for a read workload.
+       Pre-load and sort verify blocks for a read workload.
  
  .. option:: verify_offset=int
  
@@ -2434,7 +2518,7 @@ Verification
         If set, fio will fill the I/O buffers with this pattern. Fio defaults to
         filling with totally random bytes, but sometimes it's interesting to fill
         with a known pattern for I/O verification purposes. Depending on the width
-       of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time(it can
+       of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
         be either a decimal or a hex number).  The ``verify_pattern`` if larger than
         a 32-bit quantity has to be a hex number that starts with either "0x" or
         "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o
@@ -2467,6 +2551,7 @@ Verification
         contents to one or more separate threads. If using this offload option, even
         sync I/O engines can benefit from using an :option:`iodepth` setting higher
         than 1, as it allows them to have I/O in flight while verifies are running.
+       Defaults to 0 async threads, i.e. verification is not asynchronous.
  
  .. option:: verify_async_cpus=str
  
@@ -2499,18 +2584,19 @@ Verification
         state is loaded for the verify read phase. The format of the filename is,
         roughly::
  
-       <type>-<jobname>-<jobindex>-verify.state.
+               <type>-<jobname>-<jobindex>-verify.state.
  
         <type> is "local" for a local run, "sock" for a client/server socket
         connection, and "ip" (192.168.0.1, for instance) for a networked
-       client/server connection.
+       client/server connection. Defaults to true.
  
  .. option:: verify_state_load=bool
  
         If a verify termination trigger was used, fio stores the current write state
         of each thread. This can be used at verification time so that fio knows how
         far it should verify.  Without this information, fio will run a full
-       verification pass, according to the settings in the job file used.
+       verification pass, according to the settings in the job file used.  Default
+       false.
  
  .. option:: trim_percentage=int
  
@@ -2518,11 +2604,11 @@ Verification
  
  .. option:: trim_verify_zero=bool
  
-       Verify that trim/discarded blocks are returned as zeroes.
+       Verify that trim/discarded blocks are returned as zeros.
  
  .. option:: trim_backlog=int
  
-       Verify that trim/discarded blocks are returned as zeroes.
+       Trim after this number of blocks are written.
  
  .. option:: trim_backlog_batch=int
  
@@ -2532,7 +2618,6 @@ Verification
  
         Enable experimental verification.
  
-
  Steady state
  ~~~~~~~~~~~~
  
@@ -2573,13 +2658,13 @@ Steady state
         A rolling window of this duration will be used to judge whether steady state
         has been reached. Data will be collected once per second. The default is 0
         which disables steady state detection.  When the unit is omitted, the
-       value is given in seconds.
+       value is interpreted in seconds.
  
  .. option:: steadystate_ramp_time=time, ss_ramp=time
  
         Allow the job to run for the specified duration before beginning data
         collection for checking the steady state job termination criterion. The
-       default is 0.  When the unit is omitted, the value is given in seconds.
+       default is 0.  When the unit is omitted, the value is interpreted in seconds.
  
  
  Measurements and reporting
@@ -2607,7 +2692,7 @@ Measurements and reporting
         all jobs in a file will be part of the same reporting group, unless
         separated by a :option:`stonewall`.
  
-.. option:: stats
+.. option:: stats=bool
  
         By default, fio collects and shows final output results for all jobs
         that run. If this option is set to 0, then fio will ignore it in
@@ -2618,7 +2703,7 @@ Measurements and reporting
         If given, write a bandwidth log for this job. Can be used to store data of
         the bandwidth of the jobs in their lifetime. The included
         :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
-       text files into nice graphs. See :option:`write_lat_log` for behaviour of
+       text files into nice graphs. See :option:`write_lat_log` for behavior of
         given filename. For this option, the postfix is :file:`_bw.x.log`, where `x`
         is the index of the job (`1..N`, where `N` is the number of jobs). If
         :option:`per_job_logs` is false, then the filename will not include the job
@@ -2635,8 +2720,8 @@ Measurements and reporting
                 write_lat_log=foo
  
         The actual log names will be :file:`foo_slat.x.log`, :file:`foo_clat.x.log`,
-       and :file:`foo_lat.x.log`, where `x` is the index of the job (1..N, where N
-       is the number of jobs). This helps :command:`fio_generate_plot` find the
+       and :file:`foo_lat.x.log`, where `x` is the index of the job (`1..N`, where `N`
+       is the number of jobs). This helps :command:`fio_generate_plots` find the
         logs automatically. If :option:`per_job_logs` is false, then the filename
         will not include the job index.  See `Log File Formats`_.
  
@@ -2645,7 +2730,7 @@ Measurements and reporting
         Same as :option:`write_lat_log`, but writes I/O completion latency
         histograms. If no filename is given with this option, the default filename
         of :file:`jobname_clat_hist.x.log` is used, where `x` is the index of the
-       job (1..N, where `N` is the number of jobs). Even if the filename is given,
+       job (`1..N`, where `N` is the number of jobs). Even if the filename is given,
         fio will still append the type of log.  If :option:`per_job_logs` is false,
         then the filename will not include the job index. See `Log File Formats`_.
  
@@ -2653,7 +2738,7 @@ Measurements and reporting
  
         Same as :option:`write_bw_log`, but writes IOPS. If no filename is given
         with this option, the default filename of :file:`jobname_type.x.log` is
-       used,where `x` is the index of the job (1..N, where `N` is the number of
+       used, where `x` is the index of the job (`1..N`, where `N` is the number of
         jobs). Even if the filename is given, fio will still append the type of
         log. If :option:`per_job_logs` is false, then the filename will not include
         the job index. See `Log File Formats`_.
@@ -2665,6 +2750,7 @@ Measurements and reporting
         very large size. Setting this option makes fio average each log entry
         over the specified period of time, reducing the resolution of the log.  See
         :option:`log_max_value` as well. Defaults to 0, logging all entries.
+       Also see `Log File Formats`_.
  
  .. option:: log_hist_msec=int
  
@@ -2689,10 +2775,11 @@ Measurements and reporting
         you instead want to log the maximum value, set this option to 1. Defaults to
         0, meaning that averaged values are logged.
  
-.. option:: log_offset=int
+.. option:: log_offset=bool
  
         If this is set, the iolog options will include the byte offset for the I/O
-       entry as well as the other data values.
+       entry as well as the other data values. Defaults to 0 meaning that
+       offsets are not present in logs. Also see `Log File Formats`_.
  
  .. option:: log_compression=int
  
@@ -2766,7 +2853,7 @@ Measurements and reporting
  .. option:: disable_slat=bool
  
         Disable measurements of submission latency numbers. See
-       :option:`disable_slat`.
+       :option:`disable_lat`.
  
  .. option:: disable_bw_measurement=bool, disable_bw=bool
  
@@ -2775,7 +2862,15 @@ Measurements and reporting
  
  .. option:: clat_percentiles=bool
  
-       Enable the reporting of percentiles of completion latencies.
+       Enable the reporting of percentiles of completion latencies.  This
+       option is mutually exclusive with :option:`lat_percentiles`.
+
+.. option:: lat_percentiles=bool
+
+       Enable the reporting of percentiles of I/O latencies. This is similar
+       to :option:`clat_percentiles`, except that this includes the
+       submission latency. This option is mutually exclusive with
+       :option:`clat_percentiles`.
  
  .. option:: percentile_list=float_list
  
@@ -2834,7 +2929,8 @@ Error handling
  .. option:: ignore_error=str
  
         Sometimes you want to ignore some errors during a test; in that case you can
-       specify error list for each error type.
+       specify error list for each error type, instead of only being able to
+       ignore the default 'non-fatal error' using :option:`continue_on_error`.
         ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST`` errors for
         a given error type are separated with ':'. An error may be a symbol ('ENOSPC',
         'ENOMEM') or integer.  Example::
@@ -2842,7 +2938,8 @@ Error handling
                 ignore_error=EAGAIN,ENOSPC:122
  
         This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
-       WRITE.
+       WRITE. This option works by overriding :option:`continue_on_error` with
+       the list of errors for each error type if any.
  
  .. option:: error_dump=bool
  
@@ -2868,7 +2965,7 @@ other tools.
  To view a profile's additional options use :option:`--cmdhelp` after specifying
  the profile.  For example::
  
-$ fio --profile=act --cmdhelp
+       $ fio --profile=act --cmdhelp
  
  Act profile options
  ~~~~~~~~~~~~~~~~~~~
@@ -2886,12 +2983,13 @@ Act profile options
  .. option:: test-duration=time
         :noindex:
  
-       How long the entire test takes to run.  Default: 24h.
+       How long the entire test takes to run.  When the unit is omitted, the value
+       is interpreted in seconds.  Default: 24h.
  
  .. option:: threads-per-queue=int
         :noindex:
  
-       Number of read IO threads per device.  Default: 8.
+       Number of read I/O threads per device.  Default: 8.
  
  .. option:: read-req-num-512-blocks=int
         :noindex:
@@ -2914,7 +3012,7 @@ Tiobench profile options
  .. option:: size=str
         :noindex:
  
-       Size in MiB
+       Size in MiB.
  
  .. option:: block=int
         :noindex:
@@ -2939,13 +3037,20 @@ Tiobench profile options
  Interpreting the output
  -----------------------
  
+..
+       Example output was based on the following:
+       TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \
+               --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \
+               --runtime=2m --rw=rw
+
  Fio spits out a lot of output. While running, fio will display the status of the
  jobs created. An example of that would be::
  
      Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
  
-The characters inside the square brackets denote the current status of each
-thread. The possible values (in typical life cycle order) are:
+The characters inside the first set of square brackets denote the current status of
+each thread.  The first character is the first job defined in the job file, and so
+forth.  The possible values (in typical life cycle order) are:
  
  +------+-----+-----------------------------------------------------------+
  | Idle | Run |                                                           |
@@ -2958,6 +3063,8 @@ thread. The possible values (in typical life cycle order) are:
  +------+-----+-----------------------------------------------------------+
  |      |  p  | Thread running pre-reading file(s).                       |
  +------+-----+-----------------------------------------------------------+
+|      |  /  | Thread is in ramp period.                                 |
++------+-----+-----------------------------------------------------------+
  |      |  R  | Running, doing sequential reads.                          |
  +------+-----+-----------------------------------------------------------+
  |      |  r  | Running, doing random reads.                              |
@@ -2970,77 +3077,103 @@ thread. The possible values (in typical life cycle order) are:
  +------+-----+-----------------------------------------------------------+
  |      |  m  | Running, doing mixed random reads/writes.                 |
  +------+-----+-----------------------------------------------------------+
-|      |  F  | Running, currently waiting for :manpage:`fsync(2)`        |
+|      |  D  | Running, doing sequential trims.                          |
++------+-----+-----------------------------------------------------------+
+|      |  d  | Running, doing random trims.                              |
++------+-----+-----------------------------------------------------------+
+|      |  F  | Running, currently waiting for :manpage:`fsync(2)`.       |
  +------+-----+-----------------------------------------------------------+
  |      |  V  | Running, doing verification of written data.              |
  +------+-----+-----------------------------------------------------------+
+| f    |     | Thread finishing.                                         |
++------+-----+-----------------------------------------------------------+
  | E    |     | Thread exited, not reaped by main thread yet.             |
  +------+-----+-----------------------------------------------------------+
-| _    |     | Thread reaped, or                                         |
+| _    |     | Thread reaped.                                            |
  +------+-----+-----------------------------------------------------------+
  | X    |     | Thread reaped, exited with an error.                      |
  +------+-----+-----------------------------------------------------------+
  | K    |     | Thread reaped, exited due to signal.                      |
  +------+-----+-----------------------------------------------------------+
  
+..
+       Example output was based on the following:
+       TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \
+               --time_based --rate=2512k --bs=256K --numjobs=10 \
+               --name=readers --rw=read --name=writers --rw=write
+
  Fio will condense the thread string so as not to take up more space on the command
-line as is needed. For instance, if you have 10 readers and 10 writers running,
+line than needed. For instance, if you have 10 readers and 10 writers running,
  the output would look like this::
  
      Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
  
-Fio will still maintain the ordering, though. So the above means that jobs 1..10
-are readers, and 11..20 are writers.
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what.  In the example above this means that jobs 1--10
+are readers and 11--20 are writers.
  
  The other values are fairly self explanatory -- number of threads currently
-running and doing I/O, the number of currently open files (f=), the rate of I/O
-since last check (read speed listed first, then write speed and optionally trim
-speed), and the estimated completion percentage and time for the current
-running group. It's impossible to estimate runtime of the following groups (if
-any). Note that the string is displayed in order, so it's possible to tell which
-of the jobs are currently doing what. The first character is the first job
-defined in the job file, and so forth.
-
-When fio is done (or interrupted by :kbd:`ctrl-c`), it will show the data for
-each thread, group of threads, and disks in that order. For each data direction,
-the output looks like::
-
-    Client1 (g=0): err= 0:
-      write: io=    32MiB, bw=   666KiB/s, iops=89 , runt= 50320msec
-        slat (msec): min=    0, max=  136, avg= 0.03, stdev= 1.92
-        clat (msec): min=    0, max=  631, avg=48.50, stdev=86.82
-        bw (KiB/s) : min=    0, max= 1196, per=51.00%, avg=664.02, stdev=681.68
-      cpu        : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17
-      IO depths    : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0%
-         submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-         complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-         issued r/w: total=0/32768, short=0/0
-         lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%,
-         lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0%
-
-The client number is printed, along with the group id and error of that
-thread. Below is the I/O statistics, here for writes. In the order listed, they
-denote:
-
-**io**
-               Number of megabytes I/O performed.
-
-**bw**
-               Average bandwidth rate.
-
-**iops**
-               Average I/Os performed per second.
-
-**runt**
-               The runtime of that thread.
+running and doing I/O, the number of currently open files (f=), the estimated
+completion percentage, the rate of I/O since last check (read speed listed first,
+then write speed and optionally trim speed) in terms of bandwidth and IOPS,
+and time to completion for the current running group. It's impossible to estimate
+runtime of the following groups (if any).
+
+..
+       Example output was based on the following:
+       TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \
+               --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \
+               --bs=7K --name=Client1 --rw=write
+
+When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for
+each thread, group of threads, and disks in that order. For each overall thread (or
+group) the output looks like::
+
+       Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
+         write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
+           slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
+           clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
+            lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
+           clat percentiles (usec):
+            |  1.00th=[  302],  5.00th=[  326], 10.00th=[  343], 20.00th=[  363],
+            | 30.00th=[  392], 40.00th=[  404], 50.00th=[  416], 60.00th=[  445],
+            | 70.00th=[  816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
+            | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
+            | 99.99th=[78119]
+          bw (  KiB/s): min=  532, max=  686, per=0.10%, avg=622.87, stdev=24.82, samples=  100
+          iops        : min=   76, max=   98, avg=88.98, stdev= 3.54, samples=  100
+         lat (usec)   : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
+         lat (msec)   : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
+         lat (msec)   : 100=0.65%
+         cpu          : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
+         IO depths    : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
+            submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+            complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+            issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
+            latency   : target=0, window=0, percentile=100.00%, depth=8
+
+The job name (or first job's name when using :option:`group_reporting`) is printed,
+along with the group id, count of jobs being aggregated, last error id seen (which
+is 0 when there are no errors), pid/tid of that thread and the time the job/group
+completed.  Below are the I/O statistics for each data direction performed (showing
+writes in the example above).  In the order listed, they denote:
+
+**read/write/trim**
+               The string before the colon shows the I/O direction the statistics
+               are for.  **IOPS** is the average I/Os performed per second.  **BW**
+               is the average bandwidth rate shown as: value in power of 2 format
+               (value in power of 10 format).  The last two values show: (**total
+               I/O performed** in power of 2 format / **runtime** of that thread).
  
  **slat**
-               Submission latency (avg being the average, stdev being the standard
-               deviation). This is the time it took to submit the I/O. For sync I/O,
-               the slat is really the completion latency, since queue/complete is one
-               operation there. This value can be in milliseconds or microseconds, fio
-               will choose the most appropriate base and print that. In the example
-               above, milliseconds is the best scale. Note: in :option:`--minimal` mode
+               Submission latency (**min** being the minimum, **max** being the
+               maximum, **avg** being the average, **stdev** being the standard
+               deviation).  This is the time it took to submit the I/O.  For
+               sync I/O this row is not displayed as the slat is really the
+               completion latency (since queue/complete is one operation there).
+               This value can be in nanoseconds, microseconds or milliseconds ---
+               fio will choose the most appropriate base and print that (in the
+               example above nanoseconds was the best scale).  Note: in :option:`--minimal` mode
                 latencies are always expressed in microseconds.
  
  **clat**
@@ -3050,12 +3183,28 @@ denote:
                 complete is basically just CPU time (I/O has already been done, see slat
                 explanation).
  
+**lat**
+               Total latency. Same names as slat and clat, this denotes the time from
+               when fio created the I/O unit to completion of the I/O operation.
+
  **bw**
-               Bandwidth. Same names as the xlat stats, but also includes an
-               approximate percentage of total aggregate bandwidth this thread received
-               in this group. This last value is only really useful if the threads in
-               this group are on the same disk, since they are then competing for disk
-               access.
+               Bandwidth statistics based on samples. Same names as the xlat stats,
+               but also includes the number of samples taken (**samples**) and an
+               approximate percentage of total aggregate bandwidth this thread
+               received in its group (**per**). This last value is only really
+               useful if the threads in this group are on the same disk, since they
+               are then competing for disk access.
+
+**iops**
+               IOPS statistics based on samples. Same names as bw.
+
+**lat (nsec/usec/msec)**
+               The distribution of I/O completion latencies. This is the time from when
+               I/O leaves fio until it gets completed. Unlike the separate
+               read/write/trim sections above, the data here and in the remaining
+               sections apply to all I/Os for the reporting group. 250=0.04% means that
+               0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+               of the I/Os required 250 to 499us for completion.
  
  **cpu**
                 CPU usage. User and system time, along with the number of context
@@ -3065,54 +3214,60 @@ denote:
                 context and fault counters are summed.
  
  **IO depths**
-               The distribution of I/O depths over the job life time. The numbers are
-               divided into powers of 2, so for example the 16= entries includes depths
-               up to that value but higher than the previous entry. In other words, it
-               covers the range from 16 to 31.
+               The distribution of I/O depths over the job lifetime.  The numbers are
+               divided into powers of 2 and each entry covers depths from that value
+               up to those that are lower than the next entry -- e.g., 16= covers
+               depths from 16 to 31.  Note that the range covered by a depth
+               distribution entry can be different to the range covered by the
+               equivalent submit/complete distribution entry.
  
  **IO submit**
                 How many pieces of I/O were submitted in a single submit call. Each
                 entry denotes that amount and below, until the previous entry -- e.g.,
-               8=100% mean that we submitted anywhere in between 5-8 I/Os per submit
-               call.
+               16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
+               call.  Note that the range covered by a submit distribution entry can
+               be different to the range covered by the equivalent depth distribution
+               entry.
  
  **IO complete**
                 Like the above submit number, but for completions instead.
  
-**IO issued**
-               The number of read/write requests issued, and how many of them were
-               short.
+**IO issued rwt**
+               The number of read/write/trim requests issued, and how many of them were
+               short or dropped.
  
-**IO latencies**
-               The distribution of I/O completion latencies. This is the time from when
-               I/O leaves fio and when it gets completed.  The numbers follow the same
-               pattern as the I/O depths, meaning that 2=1.6% means that 1.6% of the
-               I/O completed within 2 msecs, 20=12.8% means that 12.8% of the I/O took
-               more than 10 msecs, but less than (or equal to) 20 msecs.
+**IO latency**
+               These values are for `--latency-target` and related options. When
+               these options are engaged, this section describes the I/O depth required
+               to meet the specified latency target.
+
+..
+       Example output was based on the following:
+       TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \
+               --rate_process=poisson --io_limit=32M --name=read --bs=128k \
+               --rate=11M --name=write --rw=write --bs=2k --rate=700k
  
  After each client has been listed, the group statistics are printed. They
  will look like this::
  
      Run status group 0 (all jobs):
-       READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec
-      WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec
+       READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec
+      WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec
  
-For each data direction, it prints:
+For each data direction it prints:
  
+**bw**
+               Aggregate bandwidth of threads in this group followed by the
+               minimum and maximum bandwidth of all the threads in this group.
+               Values outside of brackets are power-of-2 format and those
+               within are the equivalent value in a power-of-10 format.
  **io**
-               Number of megabytes I/O performed.
-**aggrb**
-               Aggregate bandwidth of threads in this group.
-**minb**
-               The minimum average bandwidth a thread saw.
-**maxb**
-               The maximum average bandwidth a thread saw.
-**mint**
-               The smallest runtime of the threads in that group.
-**maxt**
-               The longest runtime of the threads in that group.
-
-And finally, the disk statistics are printed. They will look like this::
+               Aggregate I/O performed by all threads in this group. The
+               format is the same as bw.
+**run**
+               The smallest and longest runtimes of the threads in this group.
+
+And finally, the disk statistics are printed. This is Linux specific. They will look like this::
  
    Disk stats (read/write):
      sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
@@ -3123,10 +3278,10 @@ numbers denote:
  **ios**
                 Number of I/Os performed by all groups.
  **merge**
-               Number of merges I/O the I/O scheduler.
+               Number of merges performed by the I/O scheduler.
  **ticks**
                 Number of ticks we kept the disk busy.
-**io_queue**
+**in_queue**
                 Total time spent in the disk queue.
  **util**
                 The disk utilization. A value of 100% means we kept the disk
@@ -3152,16 +3307,18 @@ is one long line of values, such as::
  
  The job description (if provided) follows on a second line.
  
-To enable terse output, use the :option:`--minimal` command line option. The
+To enable terse output, use the :option:`--minimal` or
+:option:`--output-format`\=terse command line options. The
  first value is the version of the terse output format. If the output has to be
  changed for some reason, this number will be incremented by 1 to signify that
  change.
  
-Split up, the format is as follows:
+Split up, the format is as follows (comments in brackets denote when a
+field was introduced or whether it's specific to some terse version):
  
      ::
  
-        terse version, fio version, jobname, groupid, error
+        terse version, fio version [v3], jobname, groupid, error
  
      READ status::
  
@@ -3170,7 +3327,8 @@ Split up, the format is as follows:
          Completion latency: min, max, mean, stdev (usec)
          Completion latency percentiles: 20 fields (see below)
          Total latency: min, max, mean, stdev (usec)
-        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+        IOPS [v5]: min, max, mean, stdev, number of samples
  
      WRITE status:
  
@@ -3178,10 +3336,15 @@ Split up, the format is as follows:
  
          Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
          Submission latency: min, max, mean, stdev (usec)
-        Completion latency: min, max, mean, stdev(usec)
+        Completion latency: min, max, mean, stdev (usec)
          Completion latency percentiles: 20 fields (see below)
          Total latency: min, max, mean, stdev (usec)
-        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+        IOPS [v5]: min, max, mean, stdev, number of samples
+
+    TRIM status [all but version 3]:
+
+        Fields are similar to READ/WRITE status.
  
      CPU usage::
  
@@ -3199,12 +3362,10 @@ Split up, the format is as follows:
  
          <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
  
-    Disk utilization::
+    Disk utilization [v3]::
  
-        Disk name, Read ios, write ios,
-        Read merges, write merges,
-        Read ticks, write ticks,
-        Time spent in queue, disk utilization percentage
+        disk name, read ios, write ios, read merges, write merges, read ticks, write ticks,
+        time spent in queue, disk utilization percentage
  
      Additional Info (dependent on continue_on_error, default off)::
  
@@ -3217,13 +3378,48 @@ Split up, the format is as follows:
  Completion latency percentiles can be a grouping of up to 20 sets, so for the
  terse output fio writes all of them. Each field will look like this::
  
-       1.00%=6112
+        1.00%=6112
  
  which is the Xth percentile, and the `usec` latency associated with it.
  
-For disk utilization, all disks used by fio are shown. So for each disk there
+For `Disk utilization`, all disks used by fio are shown. So for each disk there
  will be a disk utilization section.
  
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons::
+
+        terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+
+
+JSON output
+------------
+
+The `json` output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+`normal` output. The `runtime` value is reported in msec and the `bw` value is
+reported in 1024 bytes per second units.
+
+
+JSON+ output
+------------
+
+The `json+` output format is identical to the `json` output format except that it
+adds a full dump of the completion latency bins. Each `bins` object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider:
+
+       "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
+
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+
+Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes
+json+ output and generates CSV-formatted latency data suitable for plotting.
+
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to :file:`stat.h`.
+
  
  Trace file format
  -----------------
@@ -3242,9 +3438,9 @@ Each line represents a single I/O action in the following format::
  
         rw, offset, length
  
-where `rw=0/1` for read/write, and the offset and length entries being in bytes.
+where `rw=0/1` for read/write, and the `offset` and `length` entries are in bytes.
  
-This format is not supported in fio versions => 1.20-rc3.
+This format is not supported in fio versions >= 1.20-rc3.
  
  
  Trace file format v2
@@ -3264,15 +3460,15 @@ The file management format::
  
      filename action
  
-The filename is given as an absolute path. The action can be one of these:
+The `filename` is given as an absolute path. The `action` can be one of these:
  
  **add**
-               Add the given filename to the trace.
+               Add the given `filename` to the trace.
  **open**
-               Open the file with the given filename. The filename has to have
+               Open the file with the given `filename`. The `filename` has to have
                 been added with the **add** action before.
  **close**
-               Close the file with the given filename. The file has to have been
+               Close the file with the given `filename`. The file has to have been
                 opened before.
  
  
@@ -3341,7 +3537,7 @@ completions, etc.
  
  A trigger is invoked either through creation ('touch') of a specified file in
  the system, or through a timeout setting. If fio is run with
-:option:`--trigger-file` = :file:`/tmp/trigger-file`, then it will continually
+:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually
  check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
  will fire off the trigger (thus saving state, and executing the trigger
  command).
@@ -3355,8 +3551,8 @@ will then execute the trigger.
  Verification trigger example
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
-Lets say we want to run a powercut test on the remote machine 'server'.  Our
-write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
  some point during the run, and we'll run this test from the safety of our local
  machine, 'localbox'. On the server, we'll start the fio backend normally::
  
@@ -3374,7 +3570,7 @@ on the server once it has received the trigger and sent us the write state. This
  will work, but it's not **really** cutting power to the server, it's merely
  abruptly rebooting it. If we have a remote way of cutting power to the server
  through IPMI or similar, we could do that through a local trigger command
-instead. Lets assume we have a script that does IPMI reboot of a given hostname,
+instead. Let's assume we have a script that does IPMI reboot of a given hostname,
  ipmi-reboot. On localbox, we could then have run fio with a local trigger
  instead::
  
@@ -3386,7 +3582,7 @@ execute ``ipmi-reboot server`` when that happened.
  Loading verify state
  ~~~~~~~~~~~~~~~~~~~~
  
-To load store write state, read verification job file must contain the
+To load stored write state, a read verification job file must contain the
  :option:`verify_state_load` option. If that is set, fio will load the previously
  stored state. For a local fio run this is done by loading the files directly,
  and on a client/server run, the server backend will ask the client to send the
@@ -3399,13 +3595,14 @@ Log File Formats
  Fio supports a variety of log file formats, for logging latencies, bandwidth,
  and IOPS. The logs share a common format, which looks like this:
  
-    *time* (`msec`), *value*, *data direction*, *offset*
+    *time* (`msec`), *value*, *data direction*, *block size* (`bytes`),
+    *offset* (`bytes`)
  
-Time for the log entry is always in milliseconds. The *value* logged depends
+*Time* for the log entry is always in milliseconds. The *value* logged depends
  on the type of log, it will be one of the following:
  
      **Latency log**
-               Value is latency in usecs
+               Value is latency in nsecs
      **Bandwidth log**
                 Value is in KiB/sec
      **IOPS log**
@@ -3420,31 +3617,31 @@ on the type of log, it will be one of the following:
         **2**
                 I/O is a TRIM
  
-The *offset* is the offset, in bytes, from the start of the file, for that
-particular I/O. The logging of the offset can be toggled with
-:option:`log_offset`.
-
-If windowed logging is enabled through :option:`log_avg_msec` then fio doesn't
-log individual I/Os. Instead of logs the average values over the specified period
-of time. Since 'data direction' and 'offset' are per-I/O values, they aren't
-applicable if windowed logging is enabled. If windowed logging is enabled and
-:option:`log_max_value` is set, then fio logs maximum values in that window
-instead of averages.
+The entry's *block size* is always in bytes. The *offset* is the offset, in bytes,
+from the start of the file, for that particular I/O. The logging of the offset can be
+toggled with :option:`log_offset`.
  
+Fio defaults to logging every individual I/O.  When IOPS are logged for individual
+I/Os the *value* entry will always be 1. If windowed logging is enabled through
+:option:`log_avg_msec`, fio logs the average values over the specified period of time.
+If windowed logging is enabled and :option:`log_max_value` is set, then fio logs
+maximum values in that window instead of averages. Since *data direction*, *block
+size* and *offset* are per-I/O values, if windowed logging is enabled they
+aren't applicable and will be 0.
  
-Client/server
+Client/Server
  -------------
  
  Normally fio is invoked as a stand-alone application on the machine where the
-I/O workload should be generated. However, the frontend and backend of fio can
-be run separately. Ie the fio server can generate an I/O workload on the "Device
-Under Test" while being controlled from another machine.
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
  
  Start the server on the machine which has access to the storage DUT::
  
-       fio --server=args
+       $ fio --server=args
  
-where args defines what fio listens to. The arguments are of the form
+where `args` defines what fio listens to. The arguments are of the form
  ``type,hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
  v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
  *hostname* is either a hostname or IP address, and *port* is the port to listen
@@ -3472,7 +3669,7 @@ to (only valid for TCP/IP, not a local socket). Some examples:
  
  6) ``fio --server=sock:/tmp/fio.sock``
  
-   Start a fio server, listening on the local socket /tmp/fio.sock.
+   Start a fio server, listening on the local socket :file:`/tmp/fio.sock`.
  
  Once a server is running, a "client" can connect to the fio server with::
  
@@ -3512,7 +3709,7 @@ servers receive the same job file.
  
  In order to let ``fio --client`` runs use a shared filesystem from multiple
  hosts, ``fio --client`` now prepends the IP address of the server to the
-filename.  For example, if fio is using directory :file:`/mnt/nfs/fio` and is
+filename.  For example, if fio is using the directory :file:`/mnt/nfs/fio` and is
  writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile`
  containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
  192.168.10.121, then fio will create two files::
diff --git a/Makefile b/Makefile

index 1f0f5d04c8b8aedaee17ec278414f86d2e61158f..3764da55085102d8d67a20d533e67cafb91ac295 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,7 @@ OPTFLAGS= -g -ffast-math
  CFLAGS = -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
  LIBS   += -lm $(EXTLIBS)
  PROGS  = fio
-SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py)
+SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/fio_jsonplus_clat2csv)
  
  ifndef CONFIG_FIO_NO_OPT
    CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
@@ -36,8 +36,8 @@ ifdef CONFIG_GFIO
    PROGS += gfio
  endif
  
-SOURCE :=      $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
-               $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c)) \
+SOURCE :=      $(sort $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
+               $(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c))) \
                 gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \
                 eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \
                 smalloc.c filehash.c profile.c debug.c engines/cpu.c \
@@ -107,6 +107,9 @@ endif
  ifndef CONFIG_STRLCAT
    SOURCE += oslib/strlcat.c
  endif
+ifndef CONFIG_HAVE_STRNDUP
+  SOURCE += oslib/strndup.c
+endif
  ifndef CONFIG_GETOPT_LONG_ONLY
    SOURCE += oslib/getopt_long.c
  endif
@@ -140,7 +143,7 @@ ifeq ($(CONFIG_TARGET_OS), Linux)
    LDFLAGS += -rdynamic
  endif
  ifeq ($(CONFIG_TARGET_OS), Android)
-  SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c \
+  SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
                 oslib/linux-dev-lookup.c
    LIBS += -ldl
    LDFLAGS += -rdynamic
@@ -209,7 +212,8 @@ T_IEEE_PROGS = t/ieee754
  
  T_ZIPF_OBS = t/genzipf.o
  T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \
-               lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o
+               lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o \
+               oslib/strndup.o
  T_ZIPF_PROGS = t/fio-genzipf
  
  T_AXMAP_OBJS = t/axmap.o
@@ -222,7 +226,7 @@ T_LFSR_TEST_PROGS = t/lfsr-test
  
  T_GEN_RAND_OBJS = t/gen-rand.o
  T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \
-                       oslib/strcasestr.o
+                       oslib/strcasestr.o oslib/strndup.o
  T_GEN_RAND_PROGS = t/gen-rand
  
  ifeq ($(CONFIG_TARGET_OS), Linux)
@@ -246,6 +250,9 @@ T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
  T_MEMLOCK_OBJS = t/memlock.o
  T_MEMLOCK_PROGS = t/memlock
  
+T_TT_OBJS = t/time-test.o
+T_TT_PROGS = t/time-test
+
  T_OBJS = $(T_SMALLOC_OBJS)
  T_OBJS += $(T_IEEE_OBJS)
  T_OBJS += $(T_ZIPF_OBJS)
@@ -257,6 +264,7 @@ T_OBJS += $(T_DEDUPE_OBJS)
  T_OBJS += $(T_VS_OBJS)
  T_OBJS += $(T_PIPE_ASYNC_OBJS)
  T_OBJS += $(T_MEMLOCK_OBJS)
+T_OBJS += $(T_TT_OBJS)
  
  ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
      T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o
@@ -319,8 +327,13 @@ override CFLAGS += -DFIO_VERSION='"$(FIO_VERSION)"'
         @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
         @mv -f $*.d $*.d.tmp
         @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
+ifeq ($(CONFIG_TARGET_OS), NetBSD)
+       @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \
+               sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d
+else
         @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
+endif
         @rm -f $*.d.tmp
  
  ifdef CONFIG_ARITHMETIC
@@ -358,8 +371,13 @@ init.o: init.c FIO-VERSION-FILE
         @$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
         @mv -f $*.d $*.d.tmp
         @sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
+ifeq ($(CONFIG_TARGET_OS), NetBSD)
+       @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | tr -cs "[:graph:]" "\n" | \
+               sed -e 's/^ *//' -e '/^$$/ d' -e 's/$$/:/' >> $*.d
+else
         @sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
+endif
         @rm -f $*.d.tmp
  
  gcompat.o: gcompat.c gcompat.h
@@ -430,6 +448,9 @@ t/fio-dedupe: $(T_DEDUPE_OBJS)
  t/fio-verify-state: $(T_VS_OBJS)
         $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
  
+t/time-test: $(T_TT_OBJS)
+       $(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_TT_OBJS) $(LIBS)
+
  clean: FORCE
         @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d oslib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
         @rm -rf  doc/output
@@ -450,7 +471,7 @@ doc: tools/plot/fio2gnuplot.1
         @man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf
  
  test: fio
-       ./fio --minimal --thread --ioengine=null --runtime=1s --name=nulltest --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifynulltest --rw=write --verify=crc32c --verify_state_save=0 --size=100M
+       ./fio --minimal --thread --exitall_on_error --runtime=1s --name=nulltest --ioengine=null --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifyfstest --filename=fiotestfile.tmp --unlink=1 --rw=write --verify=crc32c --verify_state_save=0 --size=16K
  
  install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE
         $(INSTALL) -m 755 -d $(DESTDIR)$(bindir)
diff --git a/README b/README

index 951550b83f141a1f0063cb563650f4906644f090..72ff465ed7ebe68c11960619733bf600c9394016 100644 (file)
--- a/README
+++ b/README
@@ -59,7 +59,8 @@ Mailing list
  ------------
  
  The fio project mailing list is meant for anything related to fio including
-general discussion, bug reporting, questions, and development.
+general discussion, bug reporting, questions, and development. For bug reporting,
+see REPORTING-BUGS.
  
  An automated mail detailing recent commits is automatically sent to the list at
  most daily. The list address is fio@vger.kernel.org, subscribe by sending an
@@ -102,12 +103,16 @@ Ubuntu:
  Red Hat, Fedora, CentOS & Co:
         Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
         packages are part of the Fedora/EPEL repositories.
-       https://admin.fedoraproject.org/pkgdb/package/rpms/fio/ .
+       https://apps.fedoraproject.org/packages/fio .
  
  Mandriva:
         Mandriva has integrated fio into their package repository, so installing
         on that distro should be as easy as typing ``urpmi fio``.
  
+Arch Linux:
+        An Arch Linux package is provided under the Community sub-repository:
+        https://www.archlinux.org/packages/?sort=&q=fio
+
  Solaris:
         Packages for Solaris are available from OpenCSW. Install their pkgutil
         tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
@@ -176,7 +181,9 @@ To build fio on 32-bit Windows, run ``./configure --build-32bit-win`` before
  It's recommended that once built or installed, fio be run in a Command Prompt or
  other 'native' console such as console2, since there are known to be display and
  signal issues when running it under a Cygwin shell (see
-http://code.google.com/p/mintty/issues/detail?id=56 for details).
+https://github.com/mintty/mintty/issues/56 and
+https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
+for details).
  
  
  Documentation
diff --git a/appveyor.yml b/appveyor.yml

index 754339355e0fe4a6ee98f056ee3388432aa8a2c2..39f50a80cf169b9fbc5da5c4e0601cdeea35becb 100644 (file)
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -13,7 +13,7 @@ environment:
  
  build_script:
    - SET PATH=%CYG_ROOT%\bin;%PATH%
-  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure ${CONFIGURE_OPTIONS} && make.exe'
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure --extra-cflags=\"-Werror\" ${CONFIGURE_OPTIONS} && make.exe'
  
  after_build:
    - cd os\windows && dobuild.cmd %BUILD_ARCH%
diff --git a/arch/arch-arm.h b/arch/arch-arm.h

index 31671fdbe7283ba9007c8c25901dde1cc776aad5..dd286d04464f20bbb3aabbb3997f8b410af325f1 100644 (file)
--- a/arch/arch-arm.h
+++ b/arch/arch-arm.h
@@ -14,6 +14,8 @@
  #define        nop             __asm__ __volatile__ ("nop")
  #define read_barrier() __sync_synchronize()
  #define write_barrier()        __sync_synchronize()
+#else
+#error "unsupported ARM architecture"
  #endif
  
  #endif
diff --git a/arch/arch-ia64.h b/arch/arch-ia64.h

index 53c049fdf1866f11f4ddd517a8b9deaa2326dcc4..ece3f7e2291c7dbfc20f88eeb035fdbe8ddf332e 100644 (file)
--- a/arch/arch-ia64.h
+++ b/arch/arch-ia64.h
@@ -28,10 +28,10 @@ static inline unsigned long long get_cpu_clock(void)
  }
  
  #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
  static inline int arch_init(char *envp[])
  {
-       tsc_reliable = 1;
+       tsc_reliable = true;
         return 0;
  }
  
diff --git a/arch/arch-ppc.h b/arch/arch-ppc.h

index 4a8aa97c4d9dbc01bd427e6c49e10fefbe5c41ac..804d596aecc632cb27f547c90a5b2b997ef79b42 100644 (file)
--- a/arch/arch-ppc.h
+++ b/arch/arch-ppc.h
@@ -62,7 +62,8 @@ static inline unsigned long long get_cpu_clock(void)
                 "       cmpwi %0,0;\n"
                 "       beq-  90b;\n"
         : "=r" (rval)
-       : "i" (SPRN_TBRL));
+       : "i" (SPRN_TBRL)
+       : "cr0");
  
         return rval;
  }
@@ -117,12 +118,12 @@ static void atb_clocktest(void)
  #endif
  
  #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
  
  static inline int arch_init(char *envp[])
  {
  #if 0
-       tsc_reliable = 1;
+       tsc_reliable = true;
         atb_clocktest();
  #endif
         return 0;
diff --git a/arch/arch-s390.h b/arch/arch-s390.h

index 2e84bf8ad45744ec6d74c7ed09918d83565f9c32..6bf033b00a84d4208ec5f83a0b084d5cf961ed07 100644 (file)
--- a/arch/arch-s390.h
+++ b/arch/arch-s390.h
@@ -28,10 +28,10 @@ static inline unsigned long long get_cpu_clock(void)
  #undef ARCH_CPU_CLOCK_WRAPS
  
  #define ARCH_HAVE_INIT
-extern int tsc_reliable;
+extern bool tsc_reliable;
  static inline int arch_init(char *envp[])
  {
-       tsc_reliable = 1;
+       tsc_reliable = true;
         return 0;
  }
  
diff --git a/arch/arch-x86-common.h b/arch/arch-x86-common.h

index cbf66b8d04c18c63566dadff2d06e40a90fd8f69..c51c04c2d1b5f3a9958092ba05e468307e3cc42d 100644 (file)
--- a/arch/arch-x86-common.h
+++ b/arch/arch-x86-common.h
@@ -14,7 +14,7 @@ static inline void cpuid(unsigned int op,
  
  #define ARCH_HAVE_INIT
  
-extern int tsc_reliable;
+extern bool tsc_reliable;
  extern int arch_random;
  
  static inline void arch_init_intel(unsigned int level)
diff --git a/arch/arch.h b/arch/arch.h

index 00d247c4ca44216a096f9a5f292b488093ab6378..4fb9b518085aeeae06a390874ee35346ad5ef9b7 100644 (file)
--- a/arch/arch.h
+++ b/arch/arch.h
@@ -1,6 +1,8 @@
  #ifndef ARCH_H
  #define ARCH_H
  
+#include "../lib/types.h"
+
  enum {
         arch_x86_64 = 1,
         arch_x86,
diff --git a/backend.c b/backend.c

index 9a684edb6a01185d9461b7224352211ff2f4b6e1..b1995ef4ff813de82e8e6b2045b78b709ce8e29c 100644 (file)
--- a/backend.c
+++ b/backend.c
@@ -136,7 +136,7 @@ static void set_sig_handlers(void)
  /*
   * Check if we are above the minimum rate given.
   */
-static bool __check_min_rate(struct thread_data *td, struct timeval *now,
+static bool __check_min_rate(struct thread_data *td, struct timespec *now,
                              enum fio_ddir ddir)
  {
         unsigned long long bytes = 0;
@@ -223,7 +223,7 @@ static bool __check_min_rate(struct thread_data *td, struct timeval *now,
         return false;
  }
  
-static bool check_min_rate(struct thread_data *td, struct timeval *now)
+static bool check_min_rate(struct thread_data *td, struct timespec *now)
  {
         bool ret = false;
  
@@ -335,18 +335,18 @@ static int fio_file_fsync(struct thread_data *td, struct fio_file *f)
         return ret;
  }
  
-static inline void __update_tv_cache(struct thread_data *td)
+static inline void __update_ts_cache(struct thread_data *td)
  {
-       fio_gettime(&td->tv_cache, NULL);
+       fio_gettime(&td->ts_cache, NULL);
  }
  
-static inline void update_tv_cache(struct thread_data *td)
+static inline void update_ts_cache(struct thread_data *td)
  {
-       if ((++td->tv_cache_nr & td->tv_cache_mask) == td->tv_cache_mask)
-               __update_tv_cache(td);
+       if ((++td->ts_cache_nr & td->ts_cache_mask) == td->ts_cache_mask)
+               __update_ts_cache(td);
  }
  
-static inline bool runtime_exceeded(struct thread_data *td, struct timeval *t)
+static inline bool runtime_exceeded(struct thread_data *td, struct timespec *t)
  {
         if (in_ramp_time(td))
                 return false;
@@ -430,7 +430,7 @@ static void check_update_rusage(struct thread_data *td)
         }
  }
  
-static int wait_for_completions(struct thread_data *td, struct timeval *time)
+static int wait_for_completions(struct thread_data *td, struct timespec *time)
  {
         const int full = queue_full(td);
         int min_evts = 0;
@@ -462,7 +462,7 @@ static int wait_for_completions(struct thread_data *td, struct timeval *time)
  
  int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
                    enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
-                  struct timeval *comp_time)
+                  struct timespec *comp_time)
  {
         int ret2;
  
@@ -499,7 +499,6 @@ int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
                         if (ddir_rw(io_u->ddir))
                                 td->ts.short_io_u[io_u->ddir]++;
  
-                       f = io_u->file;
                         if (io_u->offset == f->real_file_size)
                                 goto sync_done;
  
@@ -586,6 +585,50 @@ static int unlink_all_files(struct thread_data *td)
         return ret;
  }
  
+/*
+ * Check if io_u will overlap an in-flight IO in the queue
+ */
+static bool in_flight_overlap(struct io_u_queue *q, struct io_u *io_u)
+{
+       bool overlap;
+       struct io_u *check_io_u;
+       unsigned long long x1, x2, y1, y2;
+       int i;
+
+       x1 = io_u->offset;
+       x2 = io_u->offset + io_u->buflen;
+       overlap = false;
+       io_u_qiter(q, check_io_u, i) {
+               if (check_io_u->flags & IO_U_F_FLIGHT) {
+                       y1 = check_io_u->offset;
+                       y2 = check_io_u->offset + check_io_u->buflen;
+
+                       if (x1 < y2 && y1 < x2) {
+                               overlap = true;
+                               dprint(FD_IO, "in-flight overlap: %llu/%lu, %llu/%lu\n",
+                                               x1, io_u->buflen,
+                                               y1, check_io_u->buflen);
+                               break;
+                       }
+               }
+       }
+
+       return overlap;
+}
+
+static int io_u_submit(struct thread_data *td, struct io_u *io_u)
+{
+       /*
+        * Check for overlap if the user asked us to, and we have
+        * at least one IO in flight besides this one.
+        */
+       if (td->o.serialize_overlap && td->cur_depth > 1 &&
+           in_flight_overlap(&td->io_u_all, io_u))
+               return FIO_Q_BUSY;
+
+       return td_io_queue(td, io_u);
+}
+
  /*
   * The main verify engine. Runs over the writes we previously submitted,
   * reads the blocks back in, and checks the crc/md5 of the data.
@@ -633,12 +676,12 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes)
                 enum fio_ddir ddir;
                 int full;
  
-               update_tv_cache(td);
+               update_ts_cache(td);
                 check_update_rusage(td);
  
-               if (runtime_exceeded(td, &td->tv_cache)) {
-                       __update_tv_cache(td);
-                       if (runtime_exceeded(td, &td->tv_cache)) {
+               if (runtime_exceeded(td, &td->ts_cache)) {
+                       __update_ts_cache(td);
+                       if (runtime_exceeded(td, &td->ts_cache)) {
                                 fio_mark_td_terminate(td);
                                 break;
                         }
@@ -716,7 +759,7 @@ static void do_verify(struct thread_data *td, uint64_t verify_bytes)
                 if (!td->o.disable_slat)
                         fio_gettime(&io_u->start_time, NULL);
  
-               ret = td_io_queue(td, io_u);
+               ret = io_u_submit(td, io_u);
  
                 if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL))
                         break;
@@ -874,7 +917,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
         while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
                 (!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) ||
                 td->o.time_based) {
-               struct timeval comp_time;
+               struct timespec comp_time;
                 struct io_u *io_u;
                 int full;
                 enum fio_ddir ddir;
@@ -884,11 +927,11 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
                 if (td->terminate || td->done)
                         break;
  
-               update_tv_cache(td);
+               update_ts_cache(td);
  
-               if (runtime_exceeded(td, &td->tv_cache)) {
-                       __update_tv_cache(td);
-                       if (runtime_exceeded(td, &td->tv_cache)) {
+               if (runtime_exceeded(td, &td->ts_cache)) {
+                       __update_ts_cache(td);
+                       if (runtime_exceeded(td, &td->ts_cache)) {
                                 fio_mark_td_terminate(td);
                                 break;
                         }
@@ -983,7 +1026,7 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
                                 td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
  
                 } else {
-                       ret = td_io_queue(td, io_u);
+                       ret = io_u_submit(td, io_u);
  
                         if (should_check_rate(td))
                                 td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
@@ -1347,6 +1390,8 @@ static bool keep_running(struct thread_data *td)
  
         if (td->done)
                 return false;
+       if (td->terminate)
+               return false;
         if (td->o.time_based)
                 return true;
         if (td->o.loops) {
@@ -1459,7 +1504,7 @@ static void *thread_main(void *data)
         struct sk_out *sk_out = fd->sk_out;
         uint64_t bytes_done[DDIR_RWDIR_CNT];
         int deadlock_loop_cnt;
-       int clear_state;
+       bool clear_state, did_some_io;
         int ret;
  
         sk_out_assign(sk_out);
@@ -1680,13 +1725,14 @@ static void *thread_main(void *data)
         }
  
         memset(bytes_done, 0, sizeof(bytes_done));
-       clear_state = 0;
+       clear_state = false;
+       did_some_io = false;
  
         while (keep_running(td)) {
                 uint64_t verify_bytes;
  
                 fio_gettime(&td->start, NULL);
-               memcpy(&td->tv_cache, &td->start, sizeof(td->start));
+               memcpy(&td->ts_cache, &td->start, sizeof(td->start));
  
                 if (clear_state) {
                         clear_io_state(td, 0);
@@ -1719,7 +1765,7 @@ static void *thread_main(void *data)
                 if (td->runstate >= TD_EXITED)
                         break;
  
-               clear_state = 1;
+               clear_state = true;
  
                 /*
                  * Make sure we've successfully updated the rusage stats
@@ -1758,6 +1804,9 @@ static void *thread_main(void *data)
                     td_ioengine_flagged(td, FIO_UNIDIR))
                         continue;
  
+               if (ddir_rw_sum(bytes_done))
+                       did_some_io = true;
+
                 clear_io_state(td, 0);
  
                 fio_gettime(&td->start, NULL);
@@ -1784,6 +1833,7 @@ static void *thread_main(void *data)
          * (Are we not missing other flags that can be ignored ?)
          */
         if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) &&
+           !did_some_io &&
             !(td_ioengine_flagged(td, FIO_NOIO) ||
               td_ioengine_flagged(td, FIO_DISKLESSIO)))
                 log_err("%s: No I/O performed by %s, "
@@ -1998,7 +2048,10 @@ static bool __check_trigger_file(void)
  static bool trigger_timedout(void)
  {
         if (trigger_timeout)
-               return time_since_genesis() >= trigger_timeout;
+               if (time_since_genesis() >= trigger_timeout) {
+                       trigger_timeout = 0;
+                       return true;
+               }
  
         return false;
  }
@@ -2007,7 +2060,7 @@ void exec_trigger(const char *cmd)
  {
         int ret;
  
-       if (!cmd)
+       if (!cmd || cmd[0] == '\0')
                 return;
  
         ret = system(cmd);
@@ -2202,7 +2255,7 @@ reap:
  
         while (todo) {
                 struct thread_data *map[REAL_MAX_JOBS];
-               struct timeval this_start;
+               struct timespec this_start;
                 int this_jobs = 0, left;
                 struct fork_data *fd;
  
@@ -2293,6 +2346,7 @@ reap:
                                 fio_terminate_threads(TERMINATE_ALL);
                                 fio_abort = 1;
                                 nr_started--;
+                               free(fd);
                                 break;
                         }
                         dprint(FD_MUTEX, "done waiting on startup_mutex\n");
diff --git a/blktrace.c b/blktrace.c

index a3474cb57ee9825077684f3b3f7de96b272a2e4c..65b600f5cfed020c2008e3c77511829fcd4cf02b 100644 (file)
--- a/blktrace.c
+++ b/blktrace.c
@@ -10,6 +10,7 @@
  
  #include "flist.h"
  #include "fio.h"
+#include "blktrace.h"
  #include "blktrace_api.h"
  #include "oslib/linux-dev-lookup.h"
  
diff --git a/blktrace.h b/blktrace.h

new file mode 100644 (file)

index 0000000..8656a95
--- /dev/null
+++ b/blktrace.h
@@ -0,0 +1,23 @@
+#ifndef FIO_BLKTRACE_H
+#define FIO_BLKTRACE_H
+
+#ifdef FIO_HAVE_BLKTRACE
+
+int is_blktrace(const char *, int *);
+int load_blktrace(struct thread_data *, const char *, int);
+
+#else
+
+static inline int is_blktrace(const char *fname, int *need_swap)
+{
+       return 0;
+}
+
+static inline int load_blktrace(struct thread_data *td, const char *fname,
+                               int need_swap)
+{
+       return 1;
+}
+
+#endif
+#endif
diff --git a/blktrace_api.h b/blktrace_api.h

index 3df3347d386a196c21a3fac5e25ee1d2a0875fb4..e2d8cb38d3583c3b21b9b9152f3cbe39b038c326 100644 (file)
--- a/blktrace_api.h
+++ b/blktrace_api.h
@@ -127,9 +127,4 @@ struct blk_user_trace_setup {
         __u32 pid;
  };
  
-#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
-#define BLKTRACESTART _IO(0x12,116)
-#define BLKTRACESTOP _IO(0x12,117)
-#define BLKTRACETEARDOWN _IO(0x12,118)
-
  #endif
diff --git a/cconv.c b/cconv.c

index 3295824b5a26f18459fa119fbeeb00596342fb5f..f809fd5197521ba58432100d694cb902538a1382 100644 (file)
--- a/cconv.c
+++ b/cconv.c
@@ -96,6 +96,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
         o->iodepth_batch = le32_to_cpu(top->iodepth_batch);
         o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
         o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
+       o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
         o->size = le64_to_cpu(top->size);
         o->io_size = le64_to_cpu(top->io_size);
         o->size_percent = le32_to_cpu(top->size_percent);
@@ -104,6 +105,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
         o->file_size_low = le64_to_cpu(top->file_size_low);
         o->file_size_high = le64_to_cpu(top->file_size_high);
         o->start_offset = le64_to_cpu(top->start_offset);
+       o->start_offset_percent = le32_to_cpu(top->start_offset_percent);
  
         for (i = 0; i < DDIR_RWDIR_CNT; i++) {
                 o->bs[i] = le32_to_cpu(top->bs[i]);
@@ -155,6 +157,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
         o->end_fsync = le32_to_cpu(top->end_fsync);
         o->pre_read = le32_to_cpu(top->pre_read);
         o->sync_io = le32_to_cpu(top->sync_io);
+       o->write_hint = le32_to_cpu(top->write_hint);
         o->verify = le32_to_cpu(top->verify);
         o->do_verify = le32_to_cpu(top->do_verify);
         o->verifysort = le32_to_cpu(top->verifysort);
@@ -264,6 +267,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
         o->trim_batch = le32_to_cpu(top->trim_batch);
         o->trim_zero = le32_to_cpu(top->trim_zero);
         o->clat_percentiles = le32_to_cpu(top->clat_percentiles);
+       o->lat_percentiles = le32_to_cpu(top->lat_percentiles);
         o->percentile_precision = le32_to_cpu(top->percentile_precision);
         o->continue_on_error = le32_to_cpu(top->continue_on_error);
         o->cgroup_weight = le32_to_cpu(top->cgroup_weight);
@@ -281,7 +285,6 @@ void convert_thread_options_to_cpu(struct thread_options *o,
         o->compress_percentage = le32_to_cpu(top->compress_percentage);
         o->compress_chunk = le32_to_cpu(top->compress_chunk);
         o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
-       o->skip_bad = le32_to_cpu(top->skip_bad);
         o->block_error_hist = le32_to_cpu(top->block_error_hist);
         o->replay_align = le32_to_cpu(top->replay_align);
         o->replay_scale = le32_to_cpu(top->replay_scale);
@@ -345,6 +348,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
         top->iodepth_batch = cpu_to_le32(o->iodepth_batch);
         top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
         top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
+       top->serialize_overlap = cpu_to_le32(o->serialize_overlap);
         top->size_percent = cpu_to_le32(o->size_percent);
         top->fill_device = cpu_to_le32(o->fill_device);
         top->file_append = cpu_to_le32(o->file_append);
@@ -364,6 +368,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
         top->end_fsync = cpu_to_le32(o->end_fsync);
         top->pre_read = cpu_to_le32(o->pre_read);
         top->sync_io = cpu_to_le32(o->sync_io);
+       top->write_hint = cpu_to_le32(o->write_hint);
         top->verify = cpu_to_le32(o->verify);
         top->do_verify = cpu_to_le32(o->do_verify);
         top->verifysort = cpu_to_le32(o->verifysort);
@@ -450,6 +455,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
         top->trim_batch = cpu_to_le32(o->trim_batch);
         top->trim_zero = cpu_to_le32(o->trim_zero);
         top->clat_percentiles = cpu_to_le32(o->clat_percentiles);
+       top->lat_percentiles = cpu_to_le32(o->lat_percentiles);
         top->percentile_precision = cpu_to_le32(o->percentile_precision);
         top->continue_on_error = cpu_to_le32(o->continue_on_error);
         top->cgroup_weight = cpu_to_le32(o->cgroup_weight);
@@ -468,7 +474,6 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
         top->compress_chunk = cpu_to_le32(o->compress_chunk);
         top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
         top->block_error_hist = cpu_to_le32(o->block_error_hist);
-       top->skip_bad = cpu_to_le32(o->skip_bad);
         top->replay_align = cpu_to_le32(o->replay_align);
         top->replay_scale = cpu_to_le32(o->replay_scale);
         top->per_job_logs = cpu_to_le32(o->per_job_logs);
@@ -543,6 +548,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
         top->file_size_low = __cpu_to_le64(o->file_size_low);
         top->file_size_high = __cpu_to_le64(o->file_size_high);
         top->start_offset = __cpu_to_le64(o->start_offset);
+       top->start_offset_percent = __cpu_to_le32(o->start_offset_percent);
         top->trim_backlog = __cpu_to_le64(o->trim_backlog);
         top->offset_increment = __cpu_to_le64(o->offset_increment);
         top->number_ios = __cpu_to_le64(o->number_ios);
diff --git a/client.c b/client.c

index 80096bf8a81a7bc652fab31fc146dca24718d6e7..779fb9d7f8d04f53bd56388d13f869d0760980e5 100644 (file)
--- a/client.c
+++ b/client.c
@@ -48,7 +48,7 @@ struct client_ops fio_client_ops = {
         .client_type    = FIO_CLIENT_TYPE_CLI,
  };
  
-static struct timeval eta_tv;
+static struct timespec eta_ts;
  
  static FLIST_HEAD(client_list);
  static FLIST_HEAD(eta_list);
@@ -318,7 +318,7 @@ struct fio_client *fio_client_add_explicit(struct client_ops *ops,
         client->hostname = strdup(hostname);
  
         if (type == Fio_client_socket)
-               client->is_sock = 1;
+               client->is_sock = true;
         else {
                 int ipv6;
  
@@ -728,7 +728,7 @@ static int __fio_client_send_remote_ini(struct fio_client *client,
         strcpy((char *) pdu->file, filename);
         pdu->client_type = cpu_to_le16((uint16_t) client->type);
  
-       client->sent_job = 1;
+       client->sent_job = true;
         ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size,NULL, NULL);
         free(pdu);
         return ret;
@@ -781,7 +781,7 @@ static int __fio_client_send_local_ini(struct fio_client *client,
         pdu->buf_len = __cpu_to_le32(sb.st_size);
         pdu->client_type = cpu_to_le32(client->type);
  
-       client->sent_job = 1;
+       client->sent_job = true;
         ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_JOB, pdu, p_size, NULL, NULL);
         free(pdu);
         close(fd);
@@ -799,7 +799,7 @@ int fio_client_send_ini(struct fio_client *client, const char *filename,
                 ret = __fio_client_send_remote_ini(client, filename);
  
         if (!ret)
-               client->sent_job = 1;
+               client->sent_job = true;
  
         return ret;
  }
@@ -885,6 +885,7 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
                 convert_io_stat(&dst->slat_stat[i], &src->slat_stat[i]);
                 convert_io_stat(&dst->lat_stat[i], &src->lat_stat[i]);
                 convert_io_stat(&dst->bw_stat[i], &src->bw_stat[i]);
+               convert_io_stat(&dst->iops_stat[i], &src->iops_stat[i]);
         }
  
         dst->usr_time           = le64_to_cpu(src->usr_time);
@@ -892,7 +893,8 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
         dst->ctx                = le64_to_cpu(src->ctx);
         dst->minf               = le64_to_cpu(src->minf);
         dst->majf               = le64_to_cpu(src->majf);
-       dst->clat_percentiles   = le64_to_cpu(src->clat_percentiles);
+       dst->clat_percentiles   = le32_to_cpu(src->clat_percentiles);
+       dst->lat_percentiles    = le32_to_cpu(src->lat_percentiles);
         dst->percentile_precision = le64_to_cpu(src->percentile_precision);
  
         for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
@@ -908,6 +910,8 @@ static void convert_ts(struct thread_stat *dst, struct thread_stat *src)
                 dst->io_u_complete[i]   = le32_to_cpu(src->io_u_complete[i]);
         }
  
+       for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+               dst->io_u_lat_n[i]      = le32_to_cpu(src->io_u_lat_n[i]);
         for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
                 dst->io_u_lat_u[i]      = le32_to_cpu(src->io_u_lat_u[i]);
         for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
@@ -1001,7 +1005,7 @@ static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd)
                 opt_list = &client->opt_lists[p->ts.thread_number - 1];
  
         tsobj = show_thread_status(&p->ts, &p->rs, opt_list, NULL);
-       client->did_stat = 1;
+       client->did_stat = true;
         if (tsobj) {
                 json_object_add_client_info(tsobj, client);
                 json_array_add_value_object(clients_array, tsobj);
@@ -1123,7 +1127,7 @@ static void handle_du(struct fio_client *client, struct fio_net_cmd *cmd)
         struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload;
  
         if (!client->disk_stats_shown) {
-               client->disk_stats_shown = 1;
+               client->disk_stats_shown = true;
                 log_info("\nDisk stats (read/write):\n");
         }
  
@@ -1308,14 +1312,16 @@ static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *sample
  static int fio_client_handle_iolog(struct fio_client *client,
                                    struct fio_net_cmd *cmd)
  {
-       struct cmd_iolog_pdu *pdu;
+       struct cmd_iolog_pdu *pdu = NULL;
         bool store_direct;
-       char *log_pathname;
+       char *log_pathname = NULL;
+       int ret = 0;
  
         pdu = convert_iolog(cmd, &store_direct);
         if (!pdu) {
                 log_err("fio: failed converting IO log\n");
-               return 1;
+               ret = 1;
+               goto out;
         }
  
          /* allocate buffer big enough for next sprintf() call */
@@ -1323,7 +1329,8 @@ static int fio_client_handle_iolog(struct fio_client *client,
                         strlen(client->hostname));
         if (!log_pathname) {
                 log_err("fio: memory allocation of unique pathname failed\n");
-               return -1;
+               ret = -1;
+               goto out;
         }
         /* generate a unique pathname for the log file using hostname */
         sprintf(log_pathname, "%s.%s", pdu->name, client->hostname);
@@ -1338,7 +1345,8 @@ static int fio_client_handle_iolog(struct fio_client *client,
                 if (fd < 0) {
                         log_err("fio: open log %s: %s\n",
                                 log_pathname, strerror(errno));
-                       return 1;
+                       ret = 1;
+                       goto out;
                 }
  
                 sz = cmd->pdu_len - sizeof(*pdu);
@@ -1347,17 +1355,19 @@ static int fio_client_handle_iolog(struct fio_client *client,
  
                 if (ret != sz) {
                         log_err("fio: short write on compressed log\n");
-                       return 1;
+                       ret = 1;
+                       goto out;
                 }
  
-               return 0;
+               ret = 0;
         } else {
                 FILE *f;
                 f = fopen((const char *) log_pathname, "w");
                 if (!f) {
                         log_err("fio: fopen log %s : %s\n",
                                 log_pathname, strerror(errno));
-                       return 1;
+                       ret = 1;
+                       goto out;
                 }
  
                 if (pdu->log_type == IO_LOG_TYPE_HIST) {
@@ -1368,8 +1378,17 @@ static int fio_client_handle_iolog(struct fio_client *client,
                                         pdu->nr_samples * sizeof(struct io_sample));
                 }
                 fclose(f);
-               return 0;
+               ret = 0;
         }
+
+out:
+       if (pdu && pdu != (void *) cmd->payload)
+               free(pdu);
+
+       if (log_pathname)
+               free(log_pathname);
+
+       return ret;
  }
  
  static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd)
@@ -1450,7 +1469,7 @@ static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd,
         z_stream stream;
         uint32_t nr_samples;
         size_t total;
-       void *p;
+       char *p;
  
         stream.zalloc = Z_NULL;
         stream.zfree = Z_NULL;
@@ -1476,10 +1495,10 @@ static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd,
  
         memcpy(ret, pdu, sizeof(*pdu));
  
-       p = (void *) ret + sizeof(*pdu);
+       p = (char *) ret + sizeof(*pdu);
  
         stream.avail_in = cmd->pdu_len - sizeof(*pdu);
-       stream.next_in = (void *) pdu + sizeof(*pdu);
+       stream.next_in = (void *)((char *) pdu + sizeof(*pdu));
         while (stream.avail_in) {
                 unsigned int this_chunk = 65536;
                 unsigned int this_len;
@@ -1489,7 +1508,7 @@ static struct cmd_iolog_pdu *convert_iolog_gz(struct fio_net_cmd *cmd,
                         this_chunk = total;
  
                 stream.avail_out = this_chunk;
-               stream.next_out = p;
+               stream.next_out = (void *)p;
                 err = inflate(&stream, Z_NO_FLUSH);
                 /* may be Z_OK, or Z_STREAM_END */
                 if (err < 0) {
@@ -1564,7 +1583,7 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
  
                 s = __get_sample(samples, ret->log_offset, i);
                 if (ret->log_type == IO_LOG_TYPE_HIST)
-                       s = (struct io_sample *)((void *)s + sizeof(struct io_u_plat_entry) * i);
+                       s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i);
  
                 s->time         = le64_to_cpu(s->time);
                 s->data.val     = le64_to_cpu(s->data.val);
@@ -1578,7 +1597,7 @@ static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
                 }
  
                 if (ret->log_type == IO_LOG_TYPE_HIST) {
-                       s->data.plat_entry = (struct io_u_plat_entry *)(((void *)s) + sizeof(*s));
+                       s->data.plat_entry = (struct io_u_plat_entry *)(((char *)s) + sizeof(*s));
                         s->data.plat_entry->list.next = NULL;
                         s->data.plat_entry->list.prev = NULL;
                 }
@@ -1845,10 +1864,12 @@ static void request_client_etas(struct client_ops *ops)
  static int handle_cmd_timeout(struct fio_client *client,
                               struct fio_net_cmd_reply *reply)
  {
+       uint16_t reply_opcode = reply->opcode;
+
         flist_del(&reply->list);
         free(reply);
  
-       if (reply->opcode != FIO_NET_CMD_SEND_ETA)
+       if (reply_opcode != FIO_NET_CMD_SEND_ETA)
                 return 1;
  
         log_info("client <%s>: timeout on SEND_ETA\n", client->hostname);
@@ -1869,7 +1890,7 @@ static int handle_cmd_timeout(struct fio_client *client,
  }
  
  static int client_check_cmd_timeout(struct fio_client *client,
-                                   struct timeval *now)
+                                   struct timespec *now)
  {
         struct fio_net_cmd_reply *reply;
         struct flist_head *entry, *tmp;
@@ -1878,7 +1899,7 @@ static int client_check_cmd_timeout(struct fio_client *client,
         flist_for_each_safe(entry, tmp, &client->cmd_list) {
                 reply = flist_entry(entry, struct fio_net_cmd_reply, list);
  
-               if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT)
+               if (mtime_since(&reply->ts, now) < FIO_NET_CLIENT_TIMEOUT)
                         continue;
  
                 if (!handle_cmd_timeout(client, reply))
@@ -1896,10 +1917,10 @@ static int fio_check_clients_timed_out(void)
  {
         struct fio_client *client;
         struct flist_head *entry, *tmp;
-       struct timeval tv;
+       struct timespec ts;
         int ret = 0;
  
-       fio_gettime(&tv, NULL);
+       fio_gettime(&ts, NULL);
  
         flist_for_each_safe(entry, tmp, &client_list) {
                 client = flist_entry(entry, struct fio_client, list);
@@ -1907,7 +1928,7 @@ static int fio_check_clients_timed_out(void)
                 if (flist_empty(&client->cmd_list))
                         continue;
  
-               if (!client_check_cmd_timeout(client, &tv))
+               if (!client_check_cmd_timeout(client, &ts))
                         continue;
  
                 if (client->ops->timed_out)
@@ -1928,7 +1949,7 @@ int fio_handle_clients(struct client_ops *ops)
         struct pollfd *pfds;
         int i, ret = 0, retval = 0;
  
-       fio_gettime(&eta_tv, NULL);
+       fio_gettime(&eta_ts, NULL);
  
         pfds = malloc(nr_clients * sizeof(struct pollfd));
  
@@ -1960,13 +1981,13 @@ int fio_handle_clients(struct client_ops *ops)
                 assert(i == nr_clients);
  
                 do {
-                       struct timeval tv;
+                       struct timespec ts;
                         int timeout;
  
-                       fio_gettime(&tv, NULL);
-                       if (mtime_since(&eta_tv, &tv) >= 900) {
+                       fio_gettime(&ts, NULL);
+                       if (mtime_since(&eta_ts, &ts) >= 900) {
                                 request_client_etas(ops);
-                               memcpy(&eta_tv, &tv, sizeof(tv));
+                               memcpy(&eta_ts, &ts, sizeof(ts));
  
                                 if (fio_check_clients_timed_out())
                                         break;
diff --git a/client.h b/client.h

index fc9c19693a9ff0c60bfcee6429755344b72cdfdf..394b685df70f608a9a7bb3535c2a4a92239684f5 100644 (file)
--- a/client.h
+++ b/client.h
@@ -6,6 +6,7 @@
  #include <netinet/in.h>
  #include <arpa/inet.h>
  
+#include "lib/types.h"
  #include "stat.h"
  
  struct fio_net_cmd;
@@ -45,16 +46,16 @@ struct fio_client {
  
         int state;
  
-       int skip_newline;
-       int is_sock;
-       int disk_stats_shown;
+       bool skip_newline;
+       bool is_sock;
+       bool disk_stats_shown;
         unsigned int jobs;
         unsigned int nr_stat;
         int error;
         int signal;
         int ipv6;
-       int sent_job;
-       int did_stat;
+       bool sent_job;
+       bool did_stat;
         uint32_t type;
  
         uint32_t thread_number;
diff --git a/configure b/configure

index 21bcaf46604980a27ccfbe0f32acceead6067c4e..cefd61032284ddc36013a7d7ab1d8aaa9bc71b71 100755 (executable)
--- a/configure
+++ b/configure
@@ -37,6 +37,11 @@ fatal() {
    exit 1
  }
  
+# Print result for each configuration test
+print_config() {
+  printf "%-30s%s\n" "$1" "$2"
+}
+
  # Default CFLAGS
  CFLAGS="-D_GNU_SOURCE -include config-host.h"
  BUILD_CFLAGS=""
@@ -475,11 +480,11 @@ EOF
  fi
  
  
-echo "Operating system              $targetos"
-echo "CPU                           $cpu"
-echo "Big endian                    $bigendian"
-echo "Compiler                      $cc"
-echo "Cross compile                 $cross_compile"
+print_config "Operating system" "$targetos"
+print_config "CPU" "$cpu"
+print_config "Big endian" "$bigendian"
+print_config "Compiler" "$cc"
+print_config "Cross compile" "$cross_compile"
  echo
  
  ##########################################
@@ -490,7 +495,7 @@ if test "$build_static" = "yes" ; then
  else
    build_static="no"
  fi
-echo "Static build                  $build_static"
+print_config "Static build" "$build_static"
  
  ##########################################
  # check for wordsize
@@ -511,7 +516,7 @@ elif compile_prog "-DWORDSIZE=64" "" "wordsize"; then
  else
    fatal "Unknown wordsize"
  fi
-echo "Wordsize                      $wordsize"
+print_config "Wordsize" "$wordsize"
  
  ##########################################
  # zlib probe
@@ -532,7 +537,7 @@ if compile_prog "" "-lz" "zlib" ; then
    zlib=yes
    LIBS="-lz $LIBS"
  fi
-echo "zlib                          $zlib"
+print_config "zlib" "$zlib"
  
  ##########################################
  # linux-aio probe
@@ -559,7 +564,7 @@ EOF
      libaio=no
    fi
  fi
-echo "Linux AIO support             $libaio"
+print_config "Linux AIO support" "$libaio"
  
  ##########################################
  # posix aio probe
@@ -585,8 +590,8 @@ elif compile_prog "" "-lrt" "posixaio"; then
    posix_aio_lrt="yes"
    LIBS="-lrt $LIBS"
  fi
-echo "POSIX AIO support             $posix_aio"
-echo "POSIX AIO support needs -lrt  $posix_aio_lrt"
+print_config "POSIX AIO support" "$posix_aio"
+print_config "POSIX AIO support needs -lrt" "$posix_aio_lrt"
  
  ##########################################
  # posix aio fsync probe
@@ -608,7 +613,7 @@ EOF
      posix_aio_fsync=yes
    fi
  fi
-echo "POSIX AIO fsync               $posix_aio_fsync"
+print_config "POSIX AIO fsync" "$posix_aio_fsync"
  
  ##########################################
  # POSIX pshared attribute probe
@@ -638,7 +643,7 @@ EOF
  if compile_prog "" "$LIBS" "posix_pshared" ; then
    posix_pshared=yes
  fi
-echo "POSIX pshared support         $posix_pshared"
+print_config "POSIX pshared support" "$posix_pshared"
  
  ##########################################
  # solaris aio probe
@@ -660,7 +665,7 @@ if compile_prog "" "-laio" "solarisaio" ; then
    solaris_aio=yes
    LIBS="-laio $LIBS"
  fi
-echo "Solaris AIO support           $solaris_aio"
+print_config "Solaris AIO support" "$solaris_aio"
  
  ##########################################
  # __sync_fetch_and_add test
@@ -684,7 +689,7 @@ EOF
  if compile_prog "" "" "__sync_fetch_and_add()" ; then
      sfaa="yes"
  fi
-echo "__sync_fetch_and_add          $sfaa"
+print_config "__sync_fetch_and_add" "$sfaa"
  
  ##########################################
  # libverbs probe
@@ -692,8 +697,7 @@ if test "$libverbs" != "yes" ; then
    libverbs="no"
  fi
  cat > $TMPC << EOF
-#include <stdio.h>
-#include <infiniband/arch.h>
+#include <infiniband/verbs.h>
  int main(int argc, char **argv)
  {
    struct ibv_pd *pd = ibv_alloc_pd(NULL);
@@ -704,7 +708,7 @@ if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; the
      libverbs="yes"
      LIBS="-libverbs $LIBS"
  fi
-echo "libverbs                      $libverbs"
+print_config "libverbs" "$libverbs"
  
  ##########################################
  # rdmacm probe
@@ -724,7 +728,7 @@ if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then
      rdmacm="yes"
      LIBS="-lrdmacm $LIBS"
  fi
-echo "rdmacm                        $rdmacm"
+print_config "rdmacm" "$rdmacm"
  
  ##########################################
  # Linux fallocate probe
@@ -744,7 +748,7 @@ EOF
  if compile_prog "" "" "linux_fallocate"; then
      linux_fallocate="yes"
  fi
-echo "Linux fallocate               $linux_fallocate"
+print_config "Linux fallocate" "$linux_fallocate"
  
  ##########################################
  # POSIX fadvise probe
@@ -763,7 +767,7 @@ EOF
  if compile_prog "" "" "posix_fadvise"; then
      posix_fadvise="yes"
  fi
-echo "POSIX fadvise                 $posix_fadvise"
+print_config "POSIX fadvise" "$posix_fadvise"
  
  ##########################################
  # POSIX fallocate probe
@@ -782,7 +786,7 @@ EOF
  if compile_prog "" "" "posix_fallocate"; then
      posix_fallocate="yes"
  fi
-echo "POSIX fallocate               $posix_fallocate"
+print_config "POSIX fallocate" "$posix_fallocate"
  
  ##########################################
  # sched_set/getaffinity 2 or 3 argument test
@@ -815,8 +819,8 @@ EOF
      linux_2arg_affinity="yes"
    fi
  fi
-echo "sched_setaffinity(3 arg)      $linux_3arg_affinity"
-echo "sched_setaffinity(2 arg)      $linux_2arg_affinity"
+print_config "sched_setaffinity(3 arg)" "$linux_3arg_affinity"
+print_config "sched_setaffinity(2 arg)" "$linux_2arg_affinity"
  
  ##########################################
  # clock_gettime probe
@@ -837,7 +841,7 @@ elif compile_prog "" "-lrt" "clock_gettime"; then
      clock_gettime="yes"
      LIBS="-lrt $LIBS"
  fi
-echo "clock_gettime                 $clock_gettime"
+print_config "clock_gettime" "$clock_gettime"
  
  ##########################################
  # CLOCK_MONOTONIC probe
@@ -857,7 +861,7 @@ EOF
        clock_monotonic="yes"
    fi
  fi
-echo "CLOCK_MONOTONIC               $clock_monotonic"
+print_config "CLOCK_MONOTONIC" "$clock_monotonic"
  
  ##########################################
  # CLOCK_MONOTONIC_RAW probe
@@ -877,7 +881,7 @@ EOF
        clock_monotonic_raw="yes"
    fi
  fi
-echo "CLOCK_MONOTONIC_RAW           $clock_monotonic_raw"
+print_config "CLOCK_MONOTONIC_RAW" "$clock_monotonic_raw"
  
  ##########################################
  # CLOCK_MONOTONIC_PRECISE probe
@@ -897,7 +901,7 @@ EOF
        clock_monotonic_precise="yes"
    fi
  fi
-echo "CLOCK_MONOTONIC_PRECISE       $clock_monotonic_precise"
+print_config "CLOCK_MONOTONIC_PRECISE" "$clock_monotonic_precise"
  
  ##########################################
  # clockid_t probe
@@ -917,7 +921,7 @@ EOF
  if compile_prog "" "$LIBS" "clockid_t"; then
    clockid_t="yes"
  fi
-echo "clockid_t                     $clockid_t"
+print_config "clockid_t" "$clockid_t"
  
  ##########################################
  # gettimeofday() probe
@@ -936,7 +940,7 @@ EOF
  if compile_prog "" "" "gettimeofday"; then
      gettimeofday="yes"
  fi
-echo "gettimeofday                  $gettimeofday"
+print_config "gettimeofday" "$gettimeofday"
  
  ##########################################
  # fdatasync() probe
@@ -954,7 +958,7 @@ EOF
  if compile_prog "" "" "fdatasync"; then
    fdatasync="yes"
  fi
-echo "fdatasync                     $fdatasync"
+print_config "fdatasync" "$fdatasync"
  
  ##########################################
  # sync_file_range() probe
@@ -976,7 +980,7 @@ EOF
  if compile_prog "" "" "sync_file_range"; then
    sync_file_range="yes"
  fi
-echo "sync_file_range               $sync_file_range"
+print_config "sync_file_range" "$sync_file_range"
  
  ##########################################
  # ext4 move extent probe
@@ -1000,7 +1004,7 @@ elif test $targetos = "Linux" ; then
    # work. Takes a while to bubble back.
    ext4_me="yes"
  fi
-echo "EXT4 move extent              $ext4_me"
+print_config "EXT4 move extent" "$ext4_me"
  
  ##########################################
  # splice probe
@@ -1018,7 +1022,7 @@ EOF
  if compile_prog "" "" "linux splice"; then
    linux_splice="yes"
  fi
-echo "Linux splice(2)               $linux_splice"
+print_config "Linux splice(2)" "$linux_splice"
  
  ##########################################
  # GUASI probe
@@ -1037,7 +1041,7 @@ EOF
  if compile_prog "" "" "guasi"; then
    guasi="yes"
  fi
-echo "GUASI                         $guasi"
+print_config "GUASI" "$guasi"
  
  ##########################################
  # fusion-aw probe
@@ -1059,7 +1063,7 @@ if compile_prog "" "-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthrea
    LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthread $LIBS"
    fusion_aw="yes"
  fi
-echo "Fusion-io atomic engine       $fusion_aw"
+print_config "Fusion-io atomic engine" "$fusion_aw"
  
  ##########################################
  # libnuma probe
@@ -1077,7 +1081,7 @@ if test "$disable_numa" != "yes"  && compile_prog "" "-lnuma" "libnuma"; then
    libnuma="yes"
    LIBS="-lnuma $LIBS"
  fi
-echo "libnuma                       $libnuma"
+print_config "libnuma" "$libnuma"
  
  ##########################################
  # libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes"
@@ -1094,7 +1098,7 @@ EOF
  if compile_prog "" "" "libnuma api"; then
    libnuma_v2="yes"
  fi
-echo "libnuma v2                    $libnuma_v2"
+print_config "libnuma v2" "$libnuma_v2"
  fi
  
  ##########################################
@@ -1114,7 +1118,7 @@ EOF
  if compile_prog "" "" "strsep"; then
    strsep="yes"
  fi
-echo "strsep                        $strsep"
+print_config "strsep" "$strsep"
  
  ##########################################
  # strcasestr() probe
@@ -1131,7 +1135,7 @@ EOF
  if compile_prog "" "" "strcasestr"; then
    strcasestr="yes"
  fi
-echo "strcasestr                    $strcasestr"
+print_config "strcasestr" "$strcasestr"
  
  ##########################################
  # strlcat() probe
@@ -1152,7 +1156,7 @@ EOF
  if compile_prog "" "" "strlcat"; then
    strlcat="yes"
  fi
-echo "strlcat                       $strlcat"
+print_config "strlcat" "$strlcat"
  
  ##########################################
  # getopt_long_only() probe
@@ -1172,7 +1176,7 @@ EOF
  if compile_prog "" "" "getopt_long_only"; then
    getopt_long_only="yes"
  fi
-echo "getopt_long_only()            $getopt_long_only"
+print_config "getopt_long_only()" "$getopt_long_only"
  
  ##########################################
  # inet_aton() probe
@@ -1192,7 +1196,7 @@ EOF
  if compile_prog "" "" "inet_aton"; then
    inet_aton="yes"
  fi
-echo "inet_aton                     $inet_aton"
+print_config "inet_aton" "$inet_aton"
  
  ##########################################
  # socklen_t probe
@@ -1210,7 +1214,7 @@ EOF
  if compile_prog "" "" "socklen_t"; then
    socklen_t="yes"
  fi
-echo "socklen_t                     $socklen_t"
+print_config "socklen_t" "$socklen_t"
  
  ##########################################
  # Whether or not __thread is supported for TLS
@@ -1228,7 +1232,7 @@ EOF
  if compile_prog "" "" "__thread"; then
    tls_thread="yes"
  fi
-echo "__thread                      $tls_thread"
+print_config "__thread" "$tls_thread"
  
  ##########################################
  # Check if we have required gtk/glib support for gfio
@@ -1278,7 +1282,7 @@ LDFLAGS=$ORG_LDFLAGS
  fi
  
  if test "$gfio_check" = "yes" ; then
-  echo "gtk 2.18 or higher            $gfio"
+  print_config "gtk 2.18 or higher" "$gfio"
  fi
  
  ##########################################
@@ -1299,7 +1303,7 @@ EOF
  if compile_prog "" "" "RUSAGE_THREAD"; then
    rusage_thread="yes"
  fi
-echo "RUSAGE_THREAD                 $rusage_thread"
+print_config "RUSAGE_THREAD" "$rusage_thread"
  
  ##########################################
  # Check whether we have SCHED_IDLE
@@ -1317,7 +1321,7 @@ EOF
  if compile_prog "" "" "SCHED_IDLE"; then
    sched_idle="yes"
  fi
-echo "SCHED_IDLE                    $sched_idle"
+print_config "SCHED_IDLE" "$sched_idle"
  
  ##########################################
  # Check whether we have TCP_NODELAY
@@ -1337,7 +1341,7 @@ EOF
  if compile_prog "" "" "TCP_NODELAY"; then
    tcp_nodelay="yes"
  fi
-echo "TCP_NODELAY                   $tcp_nodelay"
+print_config "TCP_NODELAY" "$tcp_nodelay"
  
  ##########################################
  # Check whether we have SO_SNDBUF
@@ -1358,7 +1362,7 @@ EOF
  if compile_prog "" "" "SO_SNDBUF"; then
    window_size="yes"
  fi
-echo "Net engine window_size        $window_size"
+print_config "Net engine window_size" "$window_size"
  
  ##########################################
  # Check whether we have TCP_MAXSEG
@@ -1380,7 +1384,7 @@ EOF
  if compile_prog "" "" "TCP_MAXSEG"; then
    mss="yes"
  fi
-echo "TCP_MAXSEG                    $mss"
+print_config "TCP_MAXSEG" "$mss"
  
  ##########################################
  # Check whether we have RLIMIT_MEMLOCK
@@ -1399,7 +1403,7 @@ EOF
  if compile_prog "" "" "RLIMIT_MEMLOCK"; then
    rlimit_memlock="yes"
  fi
-echo "RLIMIT_MEMLOCK                $rlimit_memlock"
+print_config "RLIMIT_MEMLOCK" "$rlimit_memlock"
  
  ##########################################
  # Check whether we have pwritev/preadv
@@ -1417,7 +1421,7 @@ EOF
  if compile_prog "" "" "pwritev"; then
    pwritev="yes"
  fi
-echo "pwritev/preadv                $pwritev"
+print_config "pwritev/preadv" "$pwritev"
  
  ##########################################
  # Check whether we have pwritev2/preadv2
@@ -1435,7 +1439,7 @@ EOF
  if compile_prog "" "" "pwritev2"; then
    pwritev2="yes"
  fi
-echo "pwritev2/preadv2              $pwritev2"
+print_config "pwritev2/preadv2" "$pwritev2"
  
  ##########################################
  # Check whether we have the required functions for ipv6
@@ -1464,7 +1468,7 @@ EOF
  if compile_prog "" "" "ipv6"; then
    ipv6="yes"
  fi
-echo "IPv6 helpers                  $ipv6"
+print_config "IPv6 helpers" "$ipv6"
  
  ##########################################
  # check for rbd
@@ -1478,12 +1482,16 @@ int main(int argc, char **argv)
  {
    rados_t cluster;
    rados_ioctx_t io_ctx;
+  const char cluster_name[] = "ceph";
+  const char user_name[] = "client.admin";
    const char pool[] = "rbd";
-
    int major, minor, extra;
-  rbd_version(&major, &minor, &extra);
  
+  rbd_version(&major, &minor, &extra);
+  /* The required rados_create2() signature was only introduced in ceph 0.65 */
+  rados_create2(&cluster, cluster_name, user_name, 0);
    rados_ioctx_create(cluster, pool, &io_ctx);
+
    return 0;
  }
  EOF
@@ -1491,7 +1499,7 @@ if test "$disable_rbd" != "yes"  && compile_prog "" "-lrbd -lrados" "rbd"; then
    LIBS="-lrbd -lrados $LIBS"
    rbd="yes"
  fi
-echo "Rados Block Device engine     $rbd"
+print_config "Rados Block Device engine" "$rbd"
  
  ##########################################
  # check for rbd_poll
@@ -1518,7 +1526,7 @@ EOF
  if compile_prog "" "-lrbd -lrados" "rbd"; then
    rbd_poll="yes"
  fi
-echo "rbd_poll                      $rbd_poll"
+print_config "rbd_poll" "$rbd_poll"
  fi
  
  ##########################################
@@ -1540,7 +1548,7 @@ EOF
  if compile_prog "" "-lrbd -lrados" "rbd"; then
    rbd_inval="yes"
  fi
-echo "rbd_invalidate_cache          $rbd_inval"
+print_config "rbd_invalidate_cache" "$rbd_inval"
  fi
  
  ##########################################
@@ -1571,7 +1579,7 @@ if test "$disable_rbd" != "yes" && test "$disable_rbd_blkin" != "yes" \
    LIBS="-lblkin $LIBS"
    rbd_blkin="yes"
  fi
-echo "rbd blkin tracing             $rbd_blkin"
+print_config "rbd blkin tracing" "$rbd_blkin"
  
  ##########################################
  # Check whether we have setvbuf
@@ -1591,7 +1599,7 @@ EOF
  if compile_prog "" "" "setvbuf"; then
    setvbuf="yes"
  fi
-echo "setvbuf                       $setvbuf"
+print_config "setvbuf" "$setvbuf"
  
  ##########################################
  # check for gfapi
@@ -1612,7 +1620,7 @@ if test "$disable_gfapi" != "yes"  && compile_prog "" "-lgfapi -lglusterfs" "gfa
    LIBS="-lgfapi -lglusterfs $LIBS"
    gfapi="yes"
  fi
- echo "Gluster API engine            $gfapi"
+print_config "Gluster API engine" "$gfapi"
  
  ##########################################
  # check for gfapi fadvise support, initialize with "no" only if $gfapi is set to "yes"
@@ -1632,7 +1640,7 @@ EOF
  if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then
    gf_fadvise="yes"
  fi
-echo "Gluster API use fadvise       $gf_fadvise"
+print_config "Gluster API use fadvise" "$gf_fadvise"
  fi
  
  ##########################################
@@ -1652,7 +1660,7 @@ EOF
  if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then
    gf_trim="yes"
  fi
-echo "Gluster API trim support      $gf_trim"
+print_config "Gluster API trim support" "$gf_trim"
  fi
  
  ##########################################
@@ -1682,11 +1690,11 @@ int main(int argc, char **argv)
  EOF
  if compile_prog "" "" "s390_z196_facilities"; then
    $TMPE
-  if [[ $? -eq 0 ]]; then
+  if [ $? -eq 0 ]; then
         s390_z196_facilities="yes"
    fi
  fi
-echo "s390_z196_facilities          $s390_z196_facilities"
+print_config "s390_z196_facilities" "$s390_z196_facilities"
  
  ##########################################
  # Check if we have required environment variables configured for libhdfs
@@ -1712,7 +1720,7 @@ if test "$libhdfs" = "yes" ; then
      FIO_HDFS_CPU="amd64"
    fi
  fi
-echo "HDFS engine                   $libhdfs"
+print_config "HDFS engine" "$libhdfs"
  
  ##########################################
  # Check whether we have MTD
@@ -1735,7 +1743,7 @@ EOF
  if compile_prog "" "" "mtd"; then
    mtd="yes"
  fi
-echo "MTD                           $mtd"
+print_config "MTD" "$mtd"
  
  ##########################################
  # Check whether we have libpmem
@@ -1755,7 +1763,7 @@ if compile_prog "" "-lpmem" "libpmem"; then
    libpmem="yes"
    LIBS="-lpmem $LIBS"
  fi
-echo "libpmem                       $libpmem"
+print_config "libpmem" "$libpmem"
  
  ##########################################
  # Check whether we have libpmemblk
@@ -1778,7 +1786,7 @@ EOF
      LIBS="-lpmemblk $LIBS"
    fi
  fi
-echo "libpmemblk                    $libpmemblk"
+print_config "libpmemblk" "$libpmemblk"
  
  # Choose the ioengines
  if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
@@ -1790,11 +1798,11 @@ fi
  
  ##########################################
  # Report whether pmemblk engine is enabled
-echo "NVML pmemblk engine           $pmemblk"
+print_config "NVML pmemblk engine" "$pmemblk"
  
  ##########################################
  # Report whether dev-dax engine is enabled
-echo "NVML dev-dax engine           $devdax"
+print_config "NVML dev-dax engine" "$devdax"
  
  ##########################################
  # Check if we have lex/yacc available
@@ -1855,7 +1863,7 @@ fi
  fi
  fi
  
-echo "lex/yacc for arithmetic       $arith"
+print_config "lex/yacc for arithmetic" "$arith"
  
  ##########################################
  # Check whether we have setmntent/getmntent
@@ -1876,7 +1884,7 @@ EOF
  if compile_prog "" "" "getmntent"; then
    getmntent="yes"
  fi
-echo "getmntent                     $getmntent"
+print_config "getmntent" "$getmntent"
  
  ##########################################
  # Check whether we have getmntinfo
@@ -1901,7 +1909,7 @@ EOF
  if compile_prog "-Werror" "" "getmntinfo"; then
    getmntinfo="yes"
  fi
-echo "getmntinfo                    $getmntinfo"
+print_config "getmntinfo" "$getmntinfo"
  
  # getmntinfo(3) for NetBSD.
  if test "$getmntinfo_statvfs" != "yes" ; then
@@ -1919,7 +1927,7 @@ EOF
  # Skip the test if the one with statfs arg is detected.
  if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then
    getmntinfo_statvfs="yes"
-  echo "getmntinfo_statvfs            $getmntinfo_statvfs"
+  print_config "getmntinfo_statvfs" "$getmntinfo_statvfs"
  fi
  
  ##########################################
@@ -1930,16 +1938,7 @@ fi
  cat > $TMPC << EOF
  #include <assert.h>
  #include <stdlib.h>
-#undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
-#else
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
-
-#define container_of(ptr, type, member) ({                     \
-       const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
-       (type *)( (char *)__mptr - offsetof(type,member) );})
+#include <stddef.h>
  
  struct foo {
    int a, b;
@@ -1954,7 +1953,7 @@ EOF
  if compile_prog "" "" "static_assert"; then
      static_assert="yes"
  fi
-echo "Static Assert                 $static_assert"
+print_config "Static Assert" "$static_assert"
  
  ##########################################
  # Check whether we have bool / stdbool.h
@@ -1972,7 +1971,26 @@ EOF
  if compile_prog "" "" "bool"; then
    have_bool="yes"
  fi
-echo "bool                          $have_bool"
+print_config "bool" "$have_bool"
+
+##########################################
+# Check whether we have strndup()
+strndup="no"
+cat > $TMPC << EOF
+#include <string.h>
+#include <stdlib.h>
+int main(int argc, char **argv)
+{
+  char *res = strndup("test string", 8);
+
+  free(res);
+  return 0;
+}
+EOF
+if compile_prog "" "" "strndup"; then
+  strndup="yes"
+fi
+print_config "strndup" "$strndup"
  
  ##########################################
  # check march=armv8-a+crc+crypto
@@ -1995,7 +2013,7 @@ EOF
      CFLAGS="$CFLAGS -march=armv8-a+crc+crypto -DARCH_HAVE_CRC_CRYPTO"
    fi
  fi
-echo "march_armv8_a_crc_crypto      $march_armv8_a_crc_crypto"
+print_config "march_armv8_a_crc_crypto" "$march_armv8_a_crc_crypto"
  
  ##########################################
  # cuda probe
@@ -2013,7 +2031,7 @@ if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
    cuda="yes"
    LIBS="-lcuda $LIBS"
  fi
-echo "cuda                          $cuda"
+print_config "cuda" "$cuda"
  
  #############################################################################
  
@@ -2231,6 +2249,9 @@ fi
  if test "$have_bool" = "yes" ; then
    output_sym "CONFIG_HAVE_BOOL"
  fi
+if test "$strndup" = "yes" ; then
+  output_sym "CONFIG_HAVE_STRNDUP"
+fi
  if test "$disable_opt" = "yes" ; then
    output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
  fi
diff --git a/crc/crc32c-arm64.c b/crc/crc32c-arm64.c

index c3f42c7cd0ace8fbaef7acbbace2e2c9548b7303..08177ba6cff66e78884eaed9148c6909a67097b8 100644 (file)
--- a/crc/crc32c-arm64.c
+++ b/crc/crc32c-arm64.c
@@ -19,7 +19,7 @@
  #define HWCAP_CRC32             (1 << 7)
  #endif /* HWCAP_CRC32 */
  
-int crc32c_arm64_available = 0;
+bool crc32c_arm64_available = false;
  
  #ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
  
@@ -27,7 +27,7 @@ int crc32c_arm64_available = 0;
  #include <arm_acle.h>
  #include <arm_neon.h>
  
-static int crc32c_probed;
+static bool crc32c_probed;
  
  /*
   * Function to calculate reflected crc with PMULL Instruction
@@ -106,9 +106,8 @@ void crc32c_arm64_probe(void)
  
         if (!crc32c_probed) {
                 hwcap = getauxval(AT_HWCAP);
-               if (hwcap & HWCAP_CRC32)
-                       crc32c_arm64_available = 1;
-               crc32c_probed = 1;
+               crc32c_arm64_available = (hwcap & HWCAP_CRC32) != 0;
+               crc32c_probed = true;
         }
  }
  
diff --git a/crc/crc32c-intel.c b/crc/crc32c-intel.c

index 0b0f193c0564a75de8d5d00e80695af40e49dd6a..05a087dcb6060f919deca5dc94699e58eee8bbe5 100644 (file)
--- a/crc/crc32c-intel.c
+++ b/crc/crc32c-intel.c
@@ -18,7 +18,7 @@
   * Volume 2A: Instruction Set Reference, A-M
   */
  
-int crc32c_intel_available = 0;
+bool crc32c_intel_available = false;
  
  #ifdef ARCH_HAVE_SSE4_2
  
@@ -30,7 +30,7 @@ int crc32c_intel_available = 0;
  #define SCALE_F 4
  #endif
  
-static int crc32c_probed;
+static bool crc32c_probed;
  
  static uint32_t crc32c_intel_le_hw_byte(uint32_t crc, unsigned char const *data,
                                         unsigned long length)
@@ -87,7 +87,7 @@ void crc32c_intel_probe(void)
  
                 do_cpuid(&eax, &ebx, &ecx, &edx);
                 crc32c_intel_available = (ecx & (1 << 20)) != 0;
-               crc32c_probed = 1;
+               crc32c_probed = true;
         }
  }
  
diff --git a/crc/crc32c.h b/crc/crc32c.h

index 5d664079b940fcb1691ceb569e7882fee7b57966..d513f3aa5403df89fb9e9ee1b709561bf868681f 100644 (file)
--- a/crc/crc32c.h
+++ b/crc/crc32c.h
@@ -19,10 +19,11 @@
  #define CRC32C_H
  
  #include "../arch/arch.h"
+#include "../lib/types.h"
  
  extern uint32_t crc32c_sw(unsigned char const *, unsigned long);
-extern int crc32c_arm64_available;
-extern int crc32c_intel_available;
+extern bool crc32c_arm64_available;
+extern bool crc32c_intel_available;
  
  #ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
  extern uint32_t crc32c_arm64(unsigned char const *, unsigned long);
diff --git a/crc/test.c b/crc/test.c

index 368229e736bec8d9ef38c745c66bdd534363f66b..b119872625864eae504123faa47157ee61984d4e 100644 (file)
--- a/crc/test.c
+++ b/crc/test.c
@@ -392,7 +392,7 @@ int fio_crctest(const char *type)
         fill_random_buf(&state, buf, CHUNK);
  
         for (i = 0; t[i].name; i++) {
-               struct timeval tv;
+               struct timespec ts;
                 double mb_sec;
                 uint64_t usec;
                 char pre[3];
@@ -409,9 +409,9 @@ int fio_crctest(const char *type)
                         t[i].fn(&t[i], buf, CHUNK);
                 }
  
-               fio_gettime(&tv, NULL);
+               fio_gettime(&ts, NULL);
                 t[i].fn(&t[i], buf, CHUNK);
-               usec = utime_since_now(&tv);
+               usec = utime_since_now(&ts);
  
                 if (usec) {
                         mb_sec = (double) mb / (double) usec;
diff --git a/diskutil.c b/diskutil.c

index dca37483fa96a2401c0e49cffc4909659e79ccf0..618cae8b54f4c3138f55f2ea537e0b9b6ed12a69 100644 (file)
--- a/diskutil.c
+++ b/diskutil.c
@@ -3,6 +3,7 @@
  #include <sys/time.h>
  #include <sys/types.h>
  #include <sys/stat.h>
+#include <sys/sysmacros.h>
  #include <dirent.h>
  #include <libgen.h>
  #include <math.h>
@@ -84,7 +85,7 @@ static int get_io_ticks(struct disk_util *du, struct disk_util_stat *dus)
  static void update_io_tick_disk(struct disk_util *du)
  {
         struct disk_util_stat __dus, *dus, *ldus;
-       struct timeval t;
+       struct timespec t;
  
         if (!du->users)
                 return;
@@ -363,7 +364,7 @@ static int find_block_dir(int majdev, int mindev, char *path, int link_ok)
                 return 0;
  
         while ((dir = readdir(D)) != NULL) {
-               char full_path[256];
+               char full_path[257];
  
                 if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
                         continue;
diff --git a/diskutil.h b/diskutil.h

index f7730667c7cf81a6831acabec904159fd87634b2..91b42020a8077024607a3818c9314e7882189390 100644 (file)
--- a/diskutil.h
+++ b/diskutil.h
@@ -64,7 +64,7 @@ struct disk_util {
          */
         struct flist_head slaves;
  
-       struct timeval time;
+       struct timespec time;
  
         struct fio_mutex *lock;
         unsigned long users;
diff --git a/doc/fio_examples.rst b/doc/fio_examples.rst

index ae0ef6f8b923de7fca5a22d29495f03f9629888c..cff1f39444b5f63f24294b97a9f13436572719a0 100644 (file)
--- a/doc/fio_examples.rst
+++ b/doc/fio_examples.rst
@@ -60,3 +60,13 @@ Fixed rate submission
  
  .. literalinclude:: ../examples/fixed-rate-submission.fio
         :language: ini
+
+Butterfly seek pattern
+-----------------------
+
+.. only:: builder_html
+
+:download:`Download butterfly.fio <../examples/butterfly.fio>`
+
+.. literalinclude:: ../examples/butterfly.fio
+       :language: ini
diff --git a/engines/binject.c b/engines/binject.c

index 932534a03987eb85fbe1241f35944a4fc287dbed..792dbbdd2ee89da71dfb42b3382a3e4adddbc877 100644 (file)
--- a/engines/binject.c
+++ b/engines/binject.c
@@ -59,11 +59,12 @@ static int pollin_events(struct pollfd *pfds, int fds)
         return 0;
  }
  
-static unsigned int binject_read_commands(struct thread_data *td, void *p,
+static unsigned int binject_read_commands(struct thread_data *td, void *buf,
                                           int left, int *err)
  {
         struct fio_file *f;
         int i, ret, events;
+       char *p = buf;
  
  one_more:
         events = 0;
diff --git a/engines/glusterfs.c b/engines/glusterfs.c

index 2abc283fc048f1580271cb257e1831454e5a3f18..981dfa35e03783b6a4c06b4441da9297f21186ea 100644 (file)
--- a/engines/glusterfs.c
+++ b/engines/glusterfs.c
@@ -165,11 +165,11 @@ int fio_gf_open_file(struct thread_data *td, struct fio_file *f)
         if (td_read(td)) {
                 if (glfs_lstat(g->fs, f->file_name, &sb)
                     || sb.st_size < f->real_file_size) {
-                       dprint(FD_FILE, "fio extend file %s from %ld to %ld\n",
-                              f->file_name, sb.st_size, f->real_file_size);
+                       dprint(FD_FILE, "fio extend file %s from %jd to %" PRIu64 "\n",
+                              f->file_name, (intmax_t) sb.st_size, f->real_file_size);
                         ret = glfs_ftruncate(g->fd, f->real_file_size);
                         if (ret) {
-                               log_err("failed fio extend file %s to %ld\n",
+                               log_err("failed fio extend file %s to %" PRIu64 "\n",
                                         f->file_name, f->real_file_size);
                         } else {
                                 unsigned long long left;
@@ -190,7 +190,7 @@ int fio_gf_open_file(struct thread_data *td, struct fio_file *f)
  
                                         r = glfs_write(g->fd, b, bs, 0);
                                         dprint(FD_IO,
-                                              "fio write %d of %ld file %s\n",
+                                              "fio write %d of %" PRIu64 " file %s\n",
                                                r, f->real_file_size,
                                                f->file_name);
  
diff --git a/engines/glusterfs_async.c b/engines/glusterfs_async.c

index f46cb263dd781e9f1180d8b19b8ec5c2eb157f04..97271d67f13927f7c95e7e814df42df662f20058 100644 (file)
--- a/engines/glusterfs_async.c
+++ b/engines/glusterfs_async.c
@@ -92,7 +92,7 @@ static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, void *data)
         struct io_u *io_u = data;
         struct fio_gf_iou *iou = io_u->engine_data;
  
-       dprint(FD_IO, "%s ret %lu\n", __FUNCTION__, ret);
+       dprint(FD_IO, "%s ret %zd\n", __FUNCTION__, ret);
         iou->io_complete = 1;
  }
  
diff --git a/engines/guasi.c b/engines/guasi.c

index eb12c899b0b36f62b03f7213d71f77740852d520..9644ee59d4a4fdd39e1a82cebd2d3ede95fad414 100644 (file)
--- a/engines/guasi.c
+++ b/engines/guasi.c
@@ -132,7 +132,7 @@ static void fio_guasi_queued(struct thread_data *td, struct io_u **io_us, int nr
  {
         int i;
         struct io_u *io_u;
-       struct timeval now;
+       struct timespec now;
  
         if (!fio_fill_issue_time(td))
                 return;
diff --git a/engines/libaio.c b/engines/libaio.c

index e15c519e453015422db3513866fad84844a0e464..e0d7cbbafb780551f35c62c83bb9dd3b26b4b164 100644 (file)
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -220,7 +220,7 @@ static int fio_libaio_queue(struct thread_data *td, struct io_u *io_u)
  static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
                               unsigned int nr)
  {
-       struct timeval now;
+       struct timespec now;
         unsigned int i;
  
         if (!fio_fill_issue_time(td))
@@ -241,7 +241,7 @@ static int fio_libaio_commit(struct thread_data *td)
         struct libaio_data *ld = td->io_ops_data;
         struct iocb **iocbs;
         struct io_u **io_us;
-       struct timeval tv;
+       struct timespec ts;
         int ret, wait_start = 0;
  
         if (!ld->queued)
@@ -282,9 +282,9 @@ static int fio_libaio_commit(struct thread_data *td)
                                 break;
                         }
                         if (!wait_start) {
-                               fio_gettime(&tv, NULL);
+                               fio_gettime(&ts, NULL);
                                 wait_start = 1;
-                       } else if (mtime_since_now(&tv) > 30000) {
+                       } else if (mtime_since_now(&ts) > 30000) {
                                 log_err("fio: aio appears to be stalled, giving up\n");
                                 break;
                         }
diff --git a/engines/mtd.c b/engines/mtd.c

index 3c22a1b15be99032b53d1bb70a27cf1109811cde..b4a660041cf1c9ecf2210e9387aef3e38d9b47da 100644 (file)
--- a/engines/mtd.c
+++ b/engines/mtd.c
@@ -13,6 +13,7 @@
  #include <mtd/mtd-user.h>
  
  #include "../fio.h"
+#include "../optgroup.h"
  #include "../verify.h"
  #include "../oslib/libmtd.h"
  
@@ -22,6 +23,28 @@ struct fio_mtd_data {
         struct mtd_dev_info info;
  };
  
+struct fio_mtd_options {
+       void *pad; /* avoid off1 == 0 */
+       unsigned int skip_bad;
+};
+
+static struct fio_option options[] = {
+       {
+               .name   = "skip_bad",
+               .lname  = "Skip operations against bad blocks",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct fio_mtd_options, skip_bad),
+               .help   = "Skip operations against known bad blocks.",
+               .hide   = 1,
+               .def    = "0",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_MTD,
+       },
+       {
+               .name   = NULL,
+       },
+};
+
  static int fio_mtd_maybe_mark_bad(struct thread_data *td,
                                   struct fio_mtd_data *fmd,
                                   struct io_u *io_u, int eb)
@@ -55,6 +78,7 @@ static int fio_mtd_queue(struct thread_data *td, struct io_u *io_u)
  {
         struct fio_file *f = io_u->file;
         struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+       struct fio_mtd_options *o = td->eo;
         int local_offs = 0;
         int ret;
  
@@ -77,7 +101,7 @@ static int fio_mtd_queue(struct thread_data *td, struct io_u *io_u)
                               (int)fmd->info.eb_size - eb_offs);
                 char *buf = ((char *)io_u->buf) + local_offs;
  
-               if (td->o.skip_bad) {
+               if (o->skip_bad) {
                         ret = fio_mtd_is_bad(td, fmd, io_u, eb);
                         if (ret == -1)
                                 break;
@@ -190,6 +214,8 @@ static struct ioengine_ops ioengine = {
         .close_file     = fio_mtd_close_file,
         .get_file_size  = fio_mtd_get_file_size,
         .flags          = FIO_SYNCIO | FIO_NOEXTEND,
+       .options        = options,
+       .option_struct_size     = sizeof(struct fio_mtd_options),
  };
  
  static void fio_init fio_mtd_register(void)
diff --git a/engines/rbd.c b/engines/rbd.c

index 4bae425cb3d088efb4b926582b550324a19b178d..39501eb00551a1b1572c7a5bdc7921a5684e602c 100644 (file)
--- a/engines/rbd.c
+++ b/engines/rbd.c
@@ -517,6 +517,7 @@ static int fio_rbd_queue(struct thread_data *td, struct io_u *io_u)
         } else {
                 dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
                        io_u->ddir);
+               r = -EINVAL;
                 goto failed_comp;
         }
  
@@ -604,7 +605,7 @@ static int fio_rbd_setup(struct thread_data *td)
                 goto cleanup;
         }
  
-       dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size);
+       dprint(FD_IO, "rbd-engine: image size: %" PRIu64 "\n", info.size);
  
         /* taken from "net" engine. Pretend we deal with files,
          * even if we do not have any ideas about files.
diff --git a/engines/rdma.c b/engines/rdma.c

index 10e60dc8449b7634ee1949e051289f55a606eac6..da00cba8b66b3f6db0bcd3a9cba8b45b9362ffa7 100644 (file)
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -44,7 +44,6 @@
  #include "../optgroup.h"
  
  #include <rdma/rdma_cma.h>
-#include <infiniband/arch.h>
  
  #define FIO_RDMA_MAX_IO_DEPTH    512
  
@@ -216,7 +215,7 @@ static int client_recv(struct thread_data *td, struct ibv_wc *wc)
                 rd->rmt_nr = ntohl(rd->recv_buf.nr);
  
                 for (i = 0; i < rd->rmt_nr; i++) {
-                       rd->rmt_us[i].buf = ntohll(rd->recv_buf.rmt_us[i].buf);
+                       rd->rmt_us[i].buf = be64_to_cpu(rd->recv_buf.rmt_us[i].buf);
                         rd->rmt_us[i].rkey = ntohl(rd->recv_buf.rmt_us[i].rkey);
                         rd->rmt_us[i].size = ntohl(rd->recv_buf.rmt_us[i].size);
  
@@ -802,7 +801,7 @@ static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
                               unsigned int nr)
  {
         struct rdmaio_data *rd = td->io_ops_data;
-       struct timeval now;
+       struct timespec now;
         unsigned int i;
  
         if (!fio_fill_issue_time(td))
@@ -1300,7 +1299,7 @@ static int fio_rdmaio_init(struct thread_data *td)
                 }
  
                 rd->send_buf.rmt_us[i].buf =
-                   htonll((uint64_t) (unsigned long)io_u->buf);
+                   cpu_to_be64((uint64_t) (unsigned long)io_u->buf);
                 rd->send_buf.rmt_us[i].rkey = htonl(io_u->mr->rkey);
                 rd->send_buf.rmt_us[i].size = htonl(max_bs);
  
diff --git a/engines/sg.c b/engines/sg.c

index 2148e87c190f8c1dfc4f648aecb99796888ca777..4540b57354cd1c1d02e84faa429cd6e069aa376c 100644 (file)
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -124,7 +124,7 @@ static int fio_sgio_getevents(struct thread_data *td, unsigned int min,
         }
  
         while (left) {
-               void *p;
+               char *p;
  
                 dprint(FD_IO, "sgio_getevents: sd %p: left=%d\n", sd, left);
  
@@ -184,7 +184,7 @@ re_read:
                         if (hdr->info & SG_INFO_CHECK) {
                                 struct io_u *io_u;
                                 io_u = (struct io_u *)(hdr->usr_ptr);
-                               memcpy((void*)&(io_u->hdr), (void*)hdr, sizeof(struct sg_io_hdr));
+                               memcpy(&io_u->hdr, hdr, sizeof(struct sg_io_hdr));
                                 sd->events[i]->error = EIO;
                         }
                 }
@@ -572,17 +572,17 @@ static char *fio_sgio_errdetails(struct io_u *io_u)
         struct sg_io_hdr *hdr = &io_u->hdr;
  #define MAXERRDETAIL 1024
  #define MAXMSGCHUNK  128
-       char *msg, msgchunk[MAXMSGCHUNK], *ret = NULL;
+       char *msg, msgchunk[MAXMSGCHUNK];
         int i;
  
         msg = calloc(1, MAXERRDETAIL);
+       strcpy(msg, "");
  
         /*
          * can't seem to find sg_err.h, so I'll just echo the define values
          * so others can search on internet to find clearer clues of meaning.
          */
         if (hdr->info & SG_INFO_CHECK) {
-               ret = msg;
                 if (hdr->host_status) {
                         snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status);
                         strlcat(msg, msgchunk, MAXERRDETAIL);
@@ -755,14 +755,14 @@ static char *fio_sgio_errdetails(struct io_u *io_u)
                 if (hdr->resid != 0) {
                         snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len);
                         strlcat(msg, msgchunk, MAXERRDETAIL);
-                       ret = msg;
                 }
         }
  
-       if (!ret)
-               ret = strdup("SG Driver did not report a Host, Driver or Device check");
+       if (!(hdr->info & SG_INFO_CHECK) && !strlen(msg))
+               strncpy(msg, "SG Driver did not report a Host, Driver or Device check",
+                       MAXERRDETAIL - 1);
  
-       return ret;
+       return msg;
  }
  
  /*
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c

index 4bebcc45a9d39fa857233c86cc9a3b6264d88423..56f89f957b5f502ccfe0add680ac80c6ef9d4a85 100644 (file)
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -3,7 +3,8 @@
   *
   * Should be compiled with:
   *
- * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o engine.o engine.c
+ * gcc -Wall -O2 -g -shared -rdynamic -fPIC -o skeleton_external.o skeleton_external.c
+ * (also requires -D_GNU_SOURCE -DCONFIG_STRSEP on Linux)
   *
   */
  #include <stdio.h>
@@ -13,6 +14,7 @@
  #include <assert.h>
  
  #include "../fio.h"
+#include "../optgroup.h"
  
  /*
   * The core of the module is identical to the ones included with fio,
@@ -20,6 +22,32 @@
   * for external modules, they should be gotten through dlsym()
   */
  
+/*
+ * The io engine can define its own options within the io engine source.
+ * The option member must not be at offset 0, due to the way fio parses
+ * the given option. Just add a padding pointer unless the io engine has
+ * something usable.
+ */
+struct fio_skeleton_options {
+       void *pad; /* avoid ->off1 of fio_option becomes 0 */
+       unsigned int dummy;
+};
+
+static struct fio_option options[] = {
+       {
+               .name   = "dummy",
+               .lname  = "ldummy",
+               .type   = FIO_OPT_STR_SET,
+               .off1   = offsetof(struct fio_skeleton_options, dummy),
+               .help   = "Set dummy",
+               .category = FIO_OPT_C_ENGINE, /* always use this */
+               .group  = FIO_OPT_G_INVALID, /* this can be different */
+       },
+       {
+               .name   = NULL,
+       },
+};
+
  /*
   * The ->event() hook is called to match an event number with an io_u.
   * After the core has called ->getevents() and it has returned eg 3,
@@ -140,4 +168,6 @@ struct ioengine_ops ioengine = {
         .cleanup        = fio_skeleton_cleanup,
         .open_file      = fio_skeleton_open,
         .close_file     = fio_skeleton_close,
+       .options        = options,
+       .option_struct_size     = sizeof(struct fio_skeleton_options),
  };
diff --git a/engines/splice.c b/engines/splice.c

index eba093e810ea72934773f08292345799eb5e0e3b..d5d8ab0ebafdac9cdc1c6ebd3617109f97adc13a 100644 (file)
--- a/engines/splice.c
+++ b/engines/splice.c
@@ -32,7 +32,7 @@ static int fio_splice_read_old(struct thread_data *td, struct io_u *io_u)
         struct fio_file *f = io_u->file;
         int ret, ret2, buflen;
         off_t offset;
-       void *p;
+       char *p;
  
         offset = io_u->offset;
         buflen = io_u->xfer_buflen;
@@ -77,7 +77,8 @@ static int fio_splice_read(struct thread_data *td, struct io_u *io_u)
         struct iovec iov;
         int ret , buflen, mmap_len;
         off_t offset;
-       void *p, *map;
+       void *map;
+       char *p;
  
         ret = 0;
         offset = io_u->offset;
diff --git a/engines/sync.c b/engines/sync.c

index e76bbbb49e237b807c855d74dd0e40995ef7c0ad..26b98b60e2c7896e846c6b0daa2a1ad138b8b6fb 100644 (file)
--- a/engines/sync.c
+++ b/engines/sync.c
@@ -14,6 +14,7 @@
  
  #include "../fio.h"
  #include "../optgroup.h"
+#include "../lib/rand.h"
  
  /*
   * Sync engine uses engine_data to store last offset
@@ -30,12 +31,15 @@ struct syncio_data {
         unsigned long long last_offset;
         struct fio_file *last_file;
         enum fio_ddir last_ddir;
+
+       struct frand_state rand_state;
  };
  
  #ifdef FIO_HAVE_PWRITEV2
  struct psyncv2_options {
         void *pad;
         unsigned int hipri;
+       unsigned int hipri_percentage;
  };
  
  static struct fio_option options[] = {
@@ -48,6 +52,18 @@ static struct fio_option options[] = {
                 .category = FIO_OPT_C_ENGINE,
                 .group  = FIO_OPT_G_INVALID,
         },
+       {
+               .name   = "hipri_percentage",
+               .lname  = "RWF_HIPRI_PERCENTAGE",
+               .type   = FIO_OPT_INT,
+               .off1   = offsetof(struct psyncv2_options, hipri_percentage),
+               .minval = 0,
+               .maxval = 100,
+               .def    = "100",
+               .help   = "Probabilistically set RWF_HIPRI for pwritev2/preadv2",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_INVALID,
+       },
         {
                 .name   = NULL,
         },
@@ -132,7 +148,8 @@ static int fio_pvsyncio2_queue(struct thread_data *td, struct io_u *io_u)
  
         fio_ro_check(td, io_u);
  
-       if (o->hipri)
+       if (o->hipri &&
+           (rand32_between(&sd->rand_state, 1, 100) <= o->hipri_percentage))
                 flags |= RWF_HIPRI;
  
         iov->iov_base = io_u->xfer_buf;
@@ -363,6 +380,7 @@ static int fio_vsyncio_init(struct thread_data *td)
         sd->last_offset = -1ULL;
         sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec));
         sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *));
+       init_rand(&sd->rand_state, 0);
  
         td->io_ops_data = sd;
         return 0;
diff --git a/engines/windowsaio.c b/engines/windowsaio.c

index f5cb04838a31f70dc6cf801239e33df16800f54c..314eaadf480c485a00f4753b2cd53206b5964211 100644 (file)
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -35,17 +35,7 @@ struct thread_ctx {
         struct windowsaio_data *wd;
  };
  
-static BOOL timeout_expired(DWORD start_count, DWORD end_count);
-static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-                               unsigned int max, const struct timespec *t);
-static struct io_u *fio_windowsaio_event(struct thread_data *td, int event);
-static int fio_windowsaio_queue(struct thread_data *td,
-                                 struct io_u *io_u);
-static void fio_windowsaio_cleanup(struct thread_data *td);
  static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter);
-static int fio_windowsaio_init(struct thread_data *td);
-static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f);
-static int fio_windowsaio_close_file(struct thread_data fio_unused *td, struct fio_file *f);
  
  static int fio_windowsaio_init(struct thread_data *td)
  {
@@ -152,7 +142,6 @@ static void fio_windowsaio_cleanup(struct thread_data *td)
         }
  }
  
-
  static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f)
  {
         int rc = 0;
@@ -180,13 +169,26 @@ static int fio_windowsaio_open_file(struct thread_data *td, struct fio_file *f)
  
         /*
          * Inform Windows whether we're going to be doing sequential or
-        * random io so it can tune the Cache Manager
+        * random IO so it can tune the Cache Manager
          */
-       if (td->o.td_ddir == TD_DDIR_READ  ||
-               td->o.td_ddir == TD_DDIR_WRITE)
-               flags |= FILE_FLAG_SEQUENTIAL_SCAN;
-       else
+       switch (td->o.fadvise_hint) {
+       case F_ADV_TYPE:
+               if (td_random(td))
+                       flags |= FILE_FLAG_RANDOM_ACCESS;
+               else
+                       flags |= FILE_FLAG_SEQUENTIAL_SCAN;
+               break;
+       case F_ADV_RANDOM:
                 flags |= FILE_FLAG_RANDOM_ACCESS;
+               break;
+       case F_ADV_SEQUENTIAL:
+               flags |= FILE_FLAG_SEQUENTIAL_SCAN;
+               break;
+       case F_ADV_NONE:
+               break;
+       default:
+               log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint);
+       }
  
         if (!td_write(td) || read_only)
                 access = GENERIC_READ;
diff --git a/eta.c b/eta.c

index adf7f94c658607009c85ad49b8348f9160e33fa1..baaa68151dff8c85d6b76c3f885e06a0c4ca102a 100644 (file)
--- a/eta.c
+++ b/eta.c
@@ -358,12 +358,12 @@ bool calc_thread_status(struct jobs_eta *je, int force)
         uint64_t rate_time, disp_time, bw_avg_time, *eta_secs;
         unsigned long long io_bytes[DDIR_RWDIR_CNT];
         unsigned long long io_iops[DDIR_RWDIR_CNT];
-       struct timeval now;
+       struct timespec now;
  
         static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT];
         static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT];
         static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
-       static struct timeval rate_prev_time, disp_prev_time;
+       static struct timespec rate_prev_time, disp_prev_time;
  
         if (!force) {
                 if (!(output_format & FIO_OUTPUT_NORMAL) &&
@@ -511,7 +511,7 @@ bool calc_thread_status(struct jobs_eta *je, int force)
  
  void display_thread_status(struct jobs_eta *je)
  {
-       static struct timeval disp_eta_new_line;
+       static struct timespec disp_eta_new_line;
         static int eta_new_line_init, eta_new_line_pending;
         static int linelen_last;
         static int eta_good;
diff --git a/examples/butterfly.fio b/examples/butterfly.fio

new file mode 100644 (file)

index 0000000..42d253d
--- /dev/null
+++ b/examples/butterfly.fio
@@ -0,0 +1,19 @@
+# Perform a butterfly/funnel seek pattern. This won't always alternate ends on
+# every I/O but it will get close.
+
+[global]
+filename=/tmp/testfile
+bs=4k
+direct=1
+
+[forward]
+rw=read
+flow=2
+# Uncomment the size= and offset= lines to prevent each direction going past
+# the middle of the file
+#size=50%
+
+[backward]
+rw=read:-8k
+flow=-2
+#offset=50%
diff --git a/examples/mtd.fio b/examples/mtd.fio

index ca097352a868781d5e3daa20466095daf511e91e..e5dcea4c04b8e2677ede8a2b9ca822490db2f1ef 100644 (file)
--- a/examples/mtd.fio
+++ b/examples/mtd.fio
@@ -17,5 +17,5 @@ rw=write
  [write]
  stonewall
  block_error_percentiles=1
-rw=writetrim
+rw=trimwrite
  loops=4
diff --git a/exp/README.md b/exp/README.md

deleted file mode 100644 (file)

index 48c11c9..0000000
--- a/exp/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-simple-expression-parser
-========================
-
-A simple expression parser for arithmetic expressions made with bison + flex
-
-To use, see the example test-expression-parser.c
-
diff --git a/file.h b/file.h

index 4c2ebd4adcbdaf4d1cc04b1b71d7e1950027118b..e3864ee505971ed3733efbf055887df92f94e8af 100644 (file)
--- a/file.h
+++ b/file.h
@@ -63,6 +63,7 @@ enum fio_fallocate_mode {
         FIO_FALLOCATE_NONE      = 1,
         FIO_FALLOCATE_POSIX     = 2,
         FIO_FALLOCATE_KEEP_SIZE = 3,
+       FIO_FALLOCATE_NATIVE    = 4,
  };
  
  /*
@@ -216,5 +217,6 @@ extern void filesetup_mem_free(void);
  extern void fio_file_reset(struct thread_data *, struct fio_file *);
  extern bool fio_files_done(struct thread_data *);
  extern bool exists_and_not_regfile(const char *);
+extern int fio_set_directio(struct thread_data *, struct fio_file *);
  
  #endif
diff --git a/filesetup.c b/filesetup.c

index 612e79474dc4b43c0707d6a43dccf3d98a2f9e8f..891a55a1ddb97ab30c7cc375cd9246fed21addd8 100644 (file)
--- a/filesetup.c
+++ b/filesetup.c
@@ -38,12 +38,76 @@ static inline void clear_error(struct thread_data *td)
         td->verror[0] = '\0';
  }
  
+static inline int native_fallocate(struct thread_data *td, struct fio_file *f)
+{
+       bool success;
+
+       success = fio_fallocate(f, 0, f->real_file_size);
+       dprint(FD_FILE, "native fallocate of file %s size %llu was "
+                       "%ssuccessful\n", f->file_name,
+                       (unsigned long long) f->real_file_size,
+                       !success ? "un": "");
+
+       if (success)
+               return 0;
+
+       if (errno == ENOSYS)
+               dprint(FD_FILE, "native fallocate is not implemented\n");
+
+       return -1;
+}
+
+static void fallocate_file(struct thread_data *td, struct fio_file *f)
+{
+       int r;
+
+       if (td->o.fill_device)
+               return;
+
+       switch (td->o.fallocate_mode) {
+       case FIO_FALLOCATE_NATIVE:
+               r = native_fallocate(td, f);
+               if (r != 0 && errno != ENOSYS)
+                       log_err("fio: native_fallocate call failed: %s\n",
+                                       strerror(errno));
+               break;
+       case FIO_FALLOCATE_NONE:
+               break;
+#ifdef CONFIG_POSIX_FALLOCATE
+       case FIO_FALLOCATE_POSIX:
+               dprint(FD_FILE, "posix_fallocate file %s size %llu\n",
+                                f->file_name,
+                                (unsigned long long) f->real_file_size);
+
+               r = posix_fallocate(f->fd, 0, f->real_file_size);
+               if (r > 0)
+                       log_err("fio: posix_fallocate fails: %s\n", strerror(r));
+               break;
+#endif /* CONFIG_POSIX_FALLOCATE */
+#ifdef CONFIG_LINUX_FALLOCATE
+       case FIO_FALLOCATE_KEEP_SIZE:
+               dprint(FD_FILE, "fallocate(FALLOC_FL_KEEP_SIZE) "
+                               "file %s size %llu\n", f->file_name,
+                               (unsigned long long) f->real_file_size);
+
+               r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0, f->real_file_size);
+               if (r != 0)
+                       td_verror(td, errno, "fallocate");
+
+               break;
+#endif /* CONFIG_LINUX_FALLOCATE */
+       default:
+               log_err("fio: unknown fallocate mode: %d\n", td->o.fallocate_mode);
+               assert(0);
+       }
+}
+
  /*
   * Leaves f->fd open on success, caller must close
   */
  static int extend_file(struct thread_data *td, struct fio_file *f)
  {
-       int r, new_layout = 0, unlink_file = 0, flags;
+       int new_layout = 0, unlink_file = 0, flags;
         unsigned long long left;
         unsigned int bs;
         char *b = NULL;
@@ -100,43 +164,7 @@ static int extend_file(struct thread_data *td, struct fio_file *f)
                 return 1;
         }
  
-#ifdef CONFIG_POSIX_FALLOCATE
-       if (!td->o.fill_device) {
-               switch (td->o.fallocate_mode) {
-               case FIO_FALLOCATE_NONE:
-                       break;
-               case FIO_FALLOCATE_POSIX:
-                       dprint(FD_FILE, "posix_fallocate file %s size %llu\n",
-                                f->file_name,
-                                (unsigned long long) f->real_file_size);
-
-                       r = posix_fallocate(f->fd, 0, f->real_file_size);
-                       if (r > 0) {
-                               log_err("fio: posix_fallocate fails: %s\n",
-                                               strerror(r));
-                       }
-                       break;
-#ifdef CONFIG_LINUX_FALLOCATE
-               case FIO_FALLOCATE_KEEP_SIZE:
-                       dprint(FD_FILE,
-                               "fallocate(FALLOC_FL_KEEP_SIZE) "
-                               "file %s size %llu\n", f->file_name,
-                               (unsigned long long) f->real_file_size);
-
-                       r = fallocate(f->fd, FALLOC_FL_KEEP_SIZE, 0,
-                                       f->real_file_size);
-                       if (r != 0)
-                               td_verror(td, errno, "fallocate");
-
-                       break;
-#endif /* CONFIG_LINUX_FALLOCATE */
-               default:
-                       log_err("fio: unknown fallocate mode: %d\n",
-                               td->o.fallocate_mode);
-                       assert(0);
-               }
-       }
-#endif /* CONFIG_POSIX_FALLOCATE */
+       fallocate_file(td, f);
  
         /*
          * If our jobs don't require regular files initially, we're done.
@@ -171,6 +199,8 @@ static int extend_file(struct thread_data *td, struct fio_file *f)
         }
  
         while (left && !td->terminate) {
+               ssize_t r;
+
                 if (bs > left)
                         bs = left;
  
@@ -497,8 +527,6 @@ static int __file_invalidate_cache(struct thread_data *td, struct fio_file *f,
                 }
                 if (ret < 0)
                         errval = errno;
-               else if (ret) /* probably not supported */
-                       errval = ret;
         } else if (f->filetype == FIO_TYPE_CHAR ||
                    f->filetype == FIO_TYPE_PIPE) {
                 dprint(FD_IO, "invalidate not supported %s\n", f->file_name);
@@ -833,12 +861,42 @@ static unsigned long long get_fs_free_counts(struct thread_data *td)
  uint64_t get_start_offset(struct thread_data *td, struct fio_file *f)
  {
         struct thread_options *o = &td->o;
+       unsigned long long align_bs;
+       unsigned long long offset;
  
         if (o->file_append && f->filetype == FIO_TYPE_FILE)
                 return f->real_file_size;
  
-       return td->o.start_offset +
-               td->subjob_number * td->o.offset_increment;
+       if (o->start_offset_percent > 0) {
+               /*
+                * if blockalign is provided, find the min across read, write,
+                * and trim
+                */
+               if (fio_option_is_set(o, ba)) {
+                       align_bs = (unsigned long long) min(o->ba[DDIR_READ], o->ba[DDIR_WRITE]);
+                       align_bs = min((unsigned long long) o->ba[DDIR_TRIM], align_bs);
+               } else {
+                       /* else take the minimum block size */
+                       align_bs = td_min_bs(td);
+               }
+
+               /* calculate the raw offset */
+               offset = (f->real_file_size * o->start_offset_percent / 100) +
+                       (td->subjob_number * o->offset_increment);
+
+               /*
+                * block align the offset at the next available boundary at
+                * ceiling(offset / align_bs) * align_bs
+                */
+               offset = (offset / align_bs + (offset % align_bs != 0)) * align_bs;
+
+       } else {
+               /* start_offset_percent not set */
+               offset = o->start_offset +
+                               td->subjob_number * o->offset_increment;
+       }
+
+       return offset;
  }
  
  /*
@@ -986,7 +1044,14 @@ int setup_files(struct thread_data *td)
                         total_size = -1ULL;
                 else {
                          if (o->size_percent) {
-                               f->io_size = (f->io_size * o->size_percent) / 100;
+                               uint64_t file_size;
+
+                               file_size = f->io_size + f->file_offset;
+                               f->io_size = (file_size *
+                                             o->size_percent) / 100;
+                               if (f->io_size > (file_size - f->file_offset))
+                                       f->io_size = file_size - f->file_offset;
+
                                 f->io_size -= (f->io_size % td_min_bs(td));
                         }
                         total_size += f->io_size;
@@ -1778,3 +1843,32 @@ void filesetup_mem_free(void)
  {
         free_already_allocated();
  }
+
+/*
+ * This function is for platforms which support direct I/O but not O_DIRECT.
+ */
+int fio_set_directio(struct thread_data *td, struct fio_file *f)
+{
+#ifdef FIO_OS_DIRECTIO
+       int ret = fio_set_odirect(f);
+
+       if (ret) {
+               td_verror(td, ret, "fio_set_directio");
+#if defined(__sun__)
+               if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */
+                       log_err("fio: doing directIO to RAW devices or ZFS not supported\n");
+               } else {
+                       log_err("fio: the file system does not seem to support direct IO\n");
+               }
+#else
+               log_err("fio: the file system does not seem to support direct IO\n");
+#endif
+               return -1;
+       }
+
+       return 0;
+#else
+       log_err("fio: direct IO is not supported on this host operating system\n");
+       return -1;
+#endif
+}
diff --git a/fio.1 b/fio.1

index 138bcbb988178b4edf1eebcd26bf5915113646ce..b943db2289d66c87ced5bd3b7112cf9b580db8ab 100644 (file)
--- a/fio.1
+++ b/fio.1
@@ -1,4 +1,4 @@
-.TH fio 1 "March 2017" "User Manual"
+.TH fio 1 "August 2017" "User Manual"
  .SH NAME
  fio \- flexible I/O tester
  .SH SYNOPSIS
@@ -13,275 +13,549 @@ one wants to simulate.
  .SH OPTIONS
  .TP
  .BI \-\-debug \fR=\fPtype
-Enable verbose tracing of various fio actions. May be `all' for all types
-or individual types separated by a comma (eg \-\-debug=io,file). `help' will
-list all available tracing options.
+Enable verbose tracing \fItype\fR of various fio actions. May be `all' for all \fItype\fRs
+or individual types separated by a comma (e.g. `\-\-debug=file,mem' will enable
+file and memory debugging). `help' will list all available tracing options.
+.TP
+.BI \-\-parse\-only
+Parse options only, don't start any I/O.
  .TP
  .BI \-\-output \fR=\fPfilename
  Write output to \fIfilename\fR.
  .TP
-.BI \-\-output-format \fR=\fPformat
-Set the reporting format to \fInormal\fR, \fIterse\fR, \fIjson\fR, or
-\fIjson+\fR. Multiple formats can be selected, separate by a comma. \fIterse\fR
-is a CSV based format. \fIjson+\fR is like \fIjson\fR, except it adds a full
+.BI \-\-output\-format \fR=\fPformat
+Set the reporting \fIformat\fR to `normal', `terse', `json', or
+`json+'. Multiple formats can be selected, separated by a comma. `terse'
+is a CSV based format. `json+' is like `json', except it adds a full
  dump of the latency buckets.
  .TP
-.BI \-\-runtime \fR=\fPruntime
-Limit run time to \fIruntime\fR seconds.
-.TP
-.B \-\-bandwidth\-log
+.BI \-\-bandwidth\-log
  Generate aggregate bandwidth logs.
  .TP
-.B \-\-minimal
-Print statistics in a terse, semicolon-delimited format.
+.BI \-\-minimal
+Print statistics in a terse, semicolon\-delimited format.
  .TP
-.B \-\-append-terse
-Print statistics in selected mode AND terse, semicolon-delimited format.
-Deprecated, use \-\-output-format instead to select multiple formats.
-.TP
-.B \-\-version
-Display version information and exit.
+.BI \-\-append\-terse
+Print statistics in selected mode AND terse, semicolon\-delimited format.
+\fBDeprecated\fR, use \fB\-\-output\-format\fR instead to select multiple formats.
  .TP
  .BI \-\-terse\-version \fR=\fPversion
-Set terse version output format (Current version 3, or older version 2).
+Set terse \fIversion\fR output format (default `3', or `2', `4', `5').
+.TP
+.BI \-\-version
+Print version information and exit.
  .TP
-.B \-\-help
-Display usage information and exit.
+.BI \-\-help
+Print a summary of the command line options and exit.
  .TP
-.B \-\-cpuclock-test
-Perform test and validation of internal CPU clock
+.BI \-\-cpuclock\-test
+Perform test and validation of internal CPU clock.
  .TP
-.BI \-\-crctest[\fR=\fPtest]
-Test the speed of the builtin checksumming functions. If no argument is given,
-all of them are tested. Or a comma separated list can be passed, in which
+.BI \-\-crctest \fR=\fP[test]
+Test the speed of the built\-in checksumming functions. If no argument is given,
+all of them are tested. Alternatively, a comma separated list can be passed, in which
  case the given ones are tested.
  .TP
  .BI \-\-cmdhelp \fR=\fPcommand
-Print help information for \fIcommand\fR.  May be `all' for all commands.
+Print help information for \fIcommand\fR. May be `all' for all commands.
  .TP
-.BI \-\-enghelp \fR=\fPioengine[,command]
-List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR defined by \fIioengine\fR.
+.BI \-\-enghelp \fR=\fP[ioengine[,command]]
+List all commands defined by \fIioengine\fR, or print help for \fIcommand\fR
+defined by \fIioengine\fR. If no \fIioengine\fR is given, list all
+available ioengines.
  .TP
  .BI \-\-showcmd \fR=\fPjobfile
-Convert \fIjobfile\fR to a set of command-line options.
+Convert \fIjobfile\fR to a set of command\-line options.
+.TP
+.BI \-\-readonly
+Turn on safety read\-only checks, preventing writes. The \fB\-\-readonly\fR
+option is an extra safety guard to prevent users from accidentally starting
+a write workload when that is not desired. Fio will only write if
+`rw=write/randwrite/rw/randrw' is given. This extra safety net can be used
+as an extra precaution as \fB\-\-readonly\fR will also enable a write check in
+the I/O engine core to prevent writes due to unknown user space bug(s).
  .TP
  .BI \-\-eta \fR=\fPwhen
-Specifies when real-time ETA estimate should be printed.  \fIwhen\fR may
-be one of `always', `never' or `auto'.
+Specifies when real\-time ETA estimate should be printed. \fIwhen\fR may
+be `always', `never' or `auto'.
  .TP
  .BI \-\-eta\-newline \fR=\fPtime
-Force an ETA newline for every `time` period passed.
+Force a new line for every \fItime\fR period passed. When the unit is omitted,
+the value is interpreted in seconds.
  .TP
  .BI \-\-status\-interval \fR=\fPtime
-Report full output status every `time` period passed.
-.TP
-.BI \-\-readonly
-Turn on safety read-only checks, preventing any attempted write.
-.TP
-.BI \-\-section \fR=\fPsec
-Only run section \fIsec\fR from job file. This option can be used multiple times to add more sections to run.
+Force a full status dump of cumulative (from job start) values at \fItime\fR
+intervals. This option does *not* provide per-period measurements. So
+values such as bandwidth are running averages. When the time unit is omitted,
+\fItime\fR is interpreted in seconds.
+.TP
+.BI \-\-section \fR=\fPname
+Only run specified section \fIname\fR in job file. Multiple sections can be specified.
+The \fB\-\-section\fR option allows one to combine related jobs into one file.
+E.g. one job file could define light, moderate, and heavy sections. Tell
+fio to run only the "heavy" section by giving `\-\-section=heavy'
+command line option. One can also specify the "write" operations in one
+section and "verify" operation in another section. The \fB\-\-section\fR option
+only applies to job sections. The reserved *global* section is always
+parsed and used.
  .TP
  .BI \-\-alloc\-size \fR=\fPkb
-Set the internal smalloc pool size to \fIkb\fP kilobytes.
+Set the internal smalloc pool size to \fIkb\fR in KiB. The
+\fB\-\-alloc\-size\fR switch allows one to use a larger pool size for smalloc.
+If running large jobs with randommap enabled, fio can run out of memory.
+Smalloc is an internal allocator for shared structures from a fixed size
+memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+NOTE: While running `.fio_smalloc.*' backing store files are visible
+in `/tmp'.
  .TP
  .BI \-\-warnings\-fatal
  All fio parser warnings are fatal, causing fio to exit with an error.
  .TP
  .BI \-\-max\-jobs \fR=\fPnr
-Set the maximum allowed number of jobs (threads/processes) to support.
+Set the maximum number of threads/processes to support to \fInr\fR.
  .TP
  .BI \-\-server \fR=\fPargs
-Start a backend server, with \fIargs\fP specifying what to listen to. See client/server section.
+Start a backend server, with \fIargs\fR specifying what to listen to.
+See \fBCLIENT/SERVER\fR section.
  .TP
  .BI \-\-daemonize \fR=\fPpidfile
-Background a fio server, writing the pid to the given pid file.
+Background a fio server, writing the pid to the given \fIpidfile\fR file.
+.TP
+.BI \-\-client \fR=\fPhostname
+Instead of running the jobs locally, send and run them on the given \fIhostname\fR
+or set of \fIhostname\fRs. See \fBCLIENT/SERVER\fR section.
  .TP
-.BI \-\-client \fR=\fPhost
-Instead of running the jobs locally, send and run them on the given host or set of hosts.  See client/server section.
+.BI \-\-remote\-config \fR=\fPfile
+Tell fio server to load this local \fIfile\fR.
  .TP
  .BI \-\-idle\-prof \fR=\fPoption
-Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate).
-.SH "JOB FILE FORMAT"
-Job files are in `ini' format. They consist of one or more
-job definitions, which begin with a job name in square brackets and
-extend to the next job name.  The job name can be any ASCII string
-except `global', which has a special meaning.  Following the job name is
-a sequence of zero or more parameters, one per line, that define the
-behavior of the job.  Any line starting with a `;' or `#' character is
-considered a comment and ignored.
-.P
-If \fIjobfile\fR is specified as `-', the job file will be read from
-standard input.
-.SS "Global Section"
-The global section contains default parameters for jobs specified in the
-job file.  A job is only affected by global sections residing above it,
-and there may be any number of global sections.  Specific job definitions
-may override any parameter set in global sections.
-.SH "JOB PARAMETERS"
-.SS Types
-Some parameters may take arguments of a specific type.
-Anywhere a numeric value is required, an arithmetic expression may be used,
-provided it is surrounded by parentheses. Supported operators are:
+Report CPU idleness. \fIoption\fR is one of the following:
  .RS
  .RS
  .TP
-.B addition (+)
+.B calibrate
+Run unit work calibration only and exit.
  .TP
-.B subtraction (-)
+.B system
+Show aggregate system idleness and unit work.
  .TP
-.B multiplication (*)
+.B percpu
+As \fBsystem\fR but also show per CPU idleness.
+.RE
+.RE
  .TP
-.B division (/)
+.BI \-\-inflate\-log \fR=\fPlog
+Inflate and output compressed \fIlog\fR.
  .TP
-.B modulus (%)
+.BI \-\-trigger\-file \fR=\fPfile
+Execute trigger command when \fIfile\fR exists.
+.TP
+.BI \-\-trigger\-timeout \fR=\fPtime
+Execute trigger at this \fItime\fR.
+.TP
+.BI \-\-trigger \fR=\fPcommand
+Set this \fIcommand\fR as local trigger.
  .TP
+.BI \-\-trigger\-remote \fR=\fPcommand
+Set this \fIcommand\fR as remote trigger.
+.TP
+.BI \-\-aux\-path \fR=\fPpath
+Use this \fIpath\fR for fio state generated files.
+.SH "JOB FILE FORMAT"
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will \fBstonewall\fR execution
+between each group.
+
+Fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global* which has special meaning. Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The \fB\-\-cmdhelp\fR option also lists all options. If used with a \fIcommand\fR
+argument, \fB\-\-cmdhelp\fR will detail the given \fIcommand\fR.
+
+See the `examples/' directory for inspiration on how to write job files. Note
+the copyright and license requirements currently apply to
+`examples/' files.
+.SH "JOB FILE PARAMETERS"
+Some parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+.RS
+.P
+.B addition (+)
+.P
+.B subtraction (\-)
+.P
+.B multiplication (*)
+.P
+.B division (/)
+.P
+.B modulus (%)
+.P
  .B exponentiation (^)
  .RE
-.RE
  .P
  For time values in expressions, units are microseconds by default. This is
  different than for time values not in expressions (not enclosed in
-parentheses). The types used are:
+parentheses).
+.SH "PARAMETER TYPES"
+The following parameter types are used.
  .TP
  .I str
-String: a sequence of alphanumeric characters.
+String. A sequence of alphanumeric characters.
+.TP
+.I time
+Integer with possible time suffix. Without a unit, the value is interpreted as
+seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for
+hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and 'us'
+(or 'usec') for microseconds. For example, use 10m for 10 minutes.
  .TP
  .I int
  Integer. A whole number value, which may contain an integer prefix
  and an integer suffix.
-
-[integer prefix]number[integer suffix]
-
-The optional integer prefix specifies the number's base. The default
-is decimal. 0x specifies hexadecimal.
-
-The optional integer suffix specifies the number's units, and includes
-an optional unit prefix and an optional unit.  For quantities
-of data, the default unit is bytes. For quantities of time,
-the default unit is seconds.
-
-With \fBkb_base=1000\fR, fio follows international standards for unit prefixes.
-To specify power-of-10 decimal values defined in the International
-System of Units (SI):
-.nf
-ki means kilo (K) or 1000
-mi means mega (M) or 1000**2
-gi means giga (G) or 1000**3
-ti means tera (T) or 1000**4
-pi means peta (P) or 1000**5
-.fi
-
-To specify power-of-2 binary values defined in IEC 80000-13:
-.nf
-k means kibi (Ki) or 1024
-m means mebi (Mi) or 1024**2
-g means gibi (Gi) or 1024**3
-t means tebi (Ti) or 1024**4
-p means pebi (Pi) or 1024**5
-.fi
-
-With \fBkb_base=1024\fR (the default), the unit prefixes are opposite from
-those specified in the SI and IEC 80000-13 standards to provide
-compatibility with old scripts.  For example, 4k means 4096.
-
-.nf
-Examples with \fBkb_base=1000\fR:
+.RS
+.RS
+.P
+[*integer prefix*] **number** [*integer suffix*]
+.RE
+.P
+The optional *integer prefix* specifies the number's base. The default
+is decimal. *0x* specifies hexadecimal.
+.P
+The optional *integer suffix* specifies the number's units, and includes an
+optional unit prefix and an optional unit. For quantities of data, the
+default unit is bytes. For quantities of time, the default unit is seconds
+unless otherwise specified.
+.P
+With `kb_base=1000', fio follows international standards for unit
+prefixes. To specify power\-of\-10 decimal values defined in the
+International System of Units (SI):
+.RS
+.P
+.PD 0
+K means kilo (K) or 1000
+.P
+M means mega (M) or 1000**2
+.P
+G means giga (G) or 1000**3
+.P
+T means tera (T) or 1000**4
+.P
+P means peta (P) or 1000**5
+.PD
+.RE
+.P
+To specify power\-of\-2 binary values defined in IEC 80000\-13:
+.RS
+.P
+.PD 0
+Ki means kibi (Ki) or 1024
+.P
+Mi means mebi (Mi) or 1024**2
+.P
+Gi means gibi (Gi) or 1024**3
+.P
+Ti means tebi (Ti) or 1024**4
+.P
+Pi means pebi (Pi) or 1024**5
+.PD
+.RE
+.P
+With `kb_base=1024' (the default), the unit prefixes are opposite
+from those specified in the SI and IEC 80000\-13 standards to provide
+compatibility with old scripts. For example, 4k means 4096.
+.P
+For quantities of data, an optional unit of 'B' may be included
+(e.g., 'kB' is the same as 'k').
+.P
+The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+not milli). 'b' and 'B' both mean byte, not bit.
+.P
+Examples with `kb_base=1000':
+.RS
+.P
+.PD 0
  4 KiB: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+.P
  1 MiB: 1048576, 1mi, 1024ki
+.P
  1 MB: 1000000, 1m, 1000k
+.P
  1 TiB: 1099511627776, 1ti, 1024gi, 1048576mi
+.P
  1 TB: 1000000000000, 1t, 1000g, 1000000m
-.fi
-
-.nf
-Examples with \fBkb_base=1024\fR (default):
+.PD
+.RE
+.P
+Examples with `kb_base=1024' (default):
+.RS
+.P
+.PD 0
  4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+.P
  1 MiB: 1048576, 1m, 1024k
+.P
  1 MB: 1000000, 1mi, 1000ki
+.P
  1 TiB: 1099511627776, 1t, 1024g, 1048576m
+.P
  1 TB: 1000000000000, 1ti, 1000gi, 1000000mi
-.fi
-
-For quantities of data, an optional unit of 'B' may be included
-(e.g.,  'kb' is the same as 'k').
-
-The integer suffix is not case sensitive (e.g., m/mi mean mebi/mega,
-not milli). 'b' and 'B' both mean byte, not bit.
-
+.PD
+.RE
+.P
  To specify times (units are not case sensitive):
-.nf
+.RS
+.P
+.PD 0
  D means days
+.P
  H means hours
+.P
  M means minutes
+.P
  s or sec means seconds (default)
+.P
  ms or msec means milliseconds
+.P
  us or usec means microseconds
-.fi
-
+.PD
+.RE
+.P
+If the option accepts an upper and lower range, use a colon ':' or
+minus '\-' to separate such values. See \fIirange\fR parameter type.
+If the lower value specified happens to be larger than the upper value
+the two values are swapped.
+.RE
  .TP
  .I bool
-Boolean: a true or false value. `0' denotes false, `1' denotes true.
+Boolean. Usually parsed as an integer, however only defined for
+true and false (1 and 0).
  .TP
  .I irange
-Integer range: a range of integers specified in the format
-\fIlower\fR:\fIupper\fR or \fIlower\fR\-\fIupper\fR. \fIlower\fR and
-\fIupper\fR may contain a suffix as described above.  If an option allows two
-sets of ranges, they are separated with a `,' or `/' character. For example:
-`8\-8k/8M\-4G'.
+Integer range with suffix. Allows value range to be given, such as
+1024\-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+option allows two sets of ranges, they can be specified with a ',' or '/'
+delimiter: 1k\-4k/8k\-32k. Also see \fIint\fR parameter type.
  .TP
  .I float_list
-List of floating numbers: A list of floating numbers, separated by
-a ':' character.
-.SS "Parameter List"
+A list of floating point numbers, separated by a ':' character.
+.SH "JOB PARAMETERS"
+With the above in mind, here follows the complete list of fio job parameters.
+.SS "Units"
  .TP
-.BI name \fR=\fPstr
-May be used to override the job name.  On the command line, this parameter
-has the special purpose of signalling the start of a new job.
+.BI kb_base \fR=\fPint
+Select the interpretation of unit prefixes in input parameters.
+.RS
+.RS
  .TP
-.BI wait_for \fR=\fPstr
-Specifies the name of the already defined job to wait for. Single waitee name
-only may be specified. If set, the job won't be started until all workers of
-the waitee job are done.  Wait_for operates on the job name basis, so there are
-a few limitations. First, the waitee must be defined prior to the waiter job
-(meaning no forward references). Second, if a job is being referenced as a
-waitee, it must have a unique name (no duplicate waitees).
+.B 1000
+Inputs comply with IEC 80000\-13 and the International
+System of Units (SI). Use:
+.RS
+.P
+.PD 0
+\- power\-of\-2 values with IEC prefixes (e.g., KiB)
+.P
+\- power\-of\-10 values with SI prefixes (e.g., kB)
+.PD
+.RE
+.TP
+.B 1024
+Compatibility mode (default). To avoid breaking old scripts:
+.P
+.RS
+.PD 0
+\- power\-of\-2 values with SI prefixes
+.P
+\- power\-of\-10 values with IEC prefixes
+.PD
+.RE
+.RE
+.P
+See \fBbs\fR for more details on input parameters.
+.P
+Outputs always use correct prefixes. Most outputs include both
+side\-by\-side, like:
+.P
+.RS
+bw=2383.3kB/s (2327.4KiB/s)
+.RE
+.P
+If only one value is reported, then kb_base selects the one to use:
+.P
+.RS
+.PD 0
+1000 \-\- SI prefixes
+.P
+1024 \-\- IEC prefixes
+.PD
+.RE
+.RE
+.TP
+.BI unit_base \fR=\fPint
+Base unit for reporting. Allowed values are:
+.RS
+.RS
+.TP
+.B 0
+Use auto\-detection (default).
+.TP
+.B 8
+Byte based.
+.TP
+.B 1
+Bit based.
+.RE
+.RE
+.SS "Job description"
+.TP
+.BI name \fR=\fPstr
+ASCII name of the job. This may be used to override the name printed by fio
+for this job. Otherwise the job name is used. On the command line this
+parameter has the special purpose of also signaling the start of a new job.
  .TP
  .BI description \fR=\fPstr
-Human-readable description of the job. It is printed when the job is run, but
-otherwise has no special purpose.
+Text description of the job. Doesn't do anything except dump this text
+description when this job is run. It's not parsed.
+.TP
+.BI loops \fR=\fPint
+Run the specified number of iterations of this job. Used to repeat the same
+workload a given number of times. Defaults to 1.
+.TP
+.BI numjobs \fR=\fPint
+Create the specified number of clones of this job. Each clone of job
+is spawned as an independent thread or process. May be used to set up a
+larger number of threads/processes doing the same thing. Each thread is
+reported separately; to see statistics for all clones as a whole, use
+\fBgroup_reporting\fR in conjunction with \fBnew_group\fR.
+See \fB\-\-max\-jobs\fR. Default: 1.
+.SS "Time related parameters"
+.TP
+.BI runtime \fR=\fPtime
+Tell fio to terminate processing after the specified period of time. It
+can be quite hard to determine for how long a specified job will run, so
+this parameter is handy to cap the total runtime to a given time. When
+the unit is omitted, the value is interpreted in seconds.
+.TP
+.BI time_based
+If set, fio will run for the duration of the \fBruntime\fR specified
+even if the file(s) are completely read or written. It will simply loop over
+the same workload as many times as the \fBruntime\fR allows.
+.TP
+.BI startdelay \fR=\fPirange(int)
+Delay the start of job for the specified amount of time. Can be a single
+value or a range. When given as a range, each thread will choose a value
+randomly from within the range. Value is in seconds if a unit is omitted.
+.TP
+.BI ramp_time \fR=\fPtime
+If set, fio will run the specified workload for this amount of time before
+logging any performance numbers. Useful for letting performance settle
+before logging results, thus minimizing the runtime required for stable
+results. Note that the \fBramp_time\fR is considered lead in time for a job,
+thus it will increase the total runtime if a special timeout or
+\fBruntime\fR is specified. When the unit is omitted, the value is
+given in seconds.
+.TP
+.BI clocksource \fR=\fPstr
+Use the given clocksource as the base of timing. The supported options are:
+.RS
+.RS
+.TP
+.B gettimeofday
+\fBgettimeofday\fR\|(2)
+.TP
+.B clock_gettime
+\fBclock_gettime\fR\|(2)
+.TP
+.B cpu
+Internal CPU clock source
+.RE
+.P
+\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast (and
+fio is heavy on time calls). Fio will automatically use this clocksource if
+it's supported and considered reliable on the system it is running on,
+unless another clocksource is specifically set. For x86/x86\-64 CPUs, this
+means supporting TSC Invariant.
+.RE
+.TP
+.BI gtod_reduce \fR=\fPbool
+Enable all of the \fBgettimeofday\fR\|(2) reducing options
+(\fBdisable_clat\fR, \fBdisable_slat\fR, \fBdisable_bw_measurement\fR) plus
+reduce precision of the timeout somewhat to really shrink the
+\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do
+about 0.4% of the \fBgettimeofday\fR\|(2) calls we would have done if all
+time keeping was enabled.
+.TP
+.BI gtod_cpu \fR=\fPint
+Sometimes it's cheaper to dedicate a single thread of execution to just
+getting the current time. Fio (and databases, for instance) are very
+intensive on \fBgettimeofday\fR\|(2) calls. With this option, you can set
+one CPU aside for doing nothing but logging current time to a shared memory
+location. Then the other threads/processes that run I/O workloads need only
+copy that segment, instead of entering the kernel with a
+\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time
+calls will be excluded from other uses. Fio will manually clear it from the
+CPU mask of other jobs.
+.SS "Target file/device"
  .TP
  .BI directory \fR=\fPstr
-Prefix filenames with this directory.  Used to place files in a location other
-than `./'.
-You can specify a number of directories by separating the names with a ':'
-character. These directories will be assigned equally distributed to job clones
-creates with \fInumjobs\fR as long as they are using generated filenames.
-If specific \fIfilename(s)\fR are set fio will use the first listed directory,
-and thereby matching the  \fIfilename\fR semantic which generates a file each
-clone if not specified, but let all clones use the same if set. See
-\fIfilename\fR for considerations regarding escaping certain characters on
-some platforms.
+Prefix \fBfilename\fRs with this directory. Used to place files in a different
+location than `./'. You can specify a number of directories by
+separating the names with a ':' character. These directories will be
+assigned equally distributed to job clones created by \fBnumjobs\fR as
+long as they are using generated filenames. If specific \fBfilename\fR(s) are
+set fio will use the first listed directory, and thereby matching the
+\fBfilename\fR semantic which generates a file for each clone if not specified, but
+let all clones use the same if set.
+.RS
+.P
+See the \fBfilename\fR option for information on how to escape ':' and '\\'
+characters within the directory path itself.
+.RE
  .TP
  .BI filename \fR=\fPstr
-.B fio
-normally makes up a file name based on the job name, thread number, and file
-number. If you want to share files between threads in a job or several jobs,
-specify a \fIfilename\fR for each of them to override the default.
-If the I/O engine is file-based, you can specify
-a number of files by separating the names with a `:' character. `\-' is a
-reserved name, meaning stdin or stdout, depending on the read/write direction
-set. On Windows, disk devices are accessed as \\.\PhysicalDrive0 for the first
-device, \\.\PhysicalDrive1 for the second etc. Note: Windows and FreeBSD
-prevent write access to areas of the disk containing in-use data
-(e.g. filesystems). If the wanted filename does need to include a colon, then
-escape that with a '\\' character. For instance, if the filename is
-"/dev/dsk/foo@3,0:c", then you would use filename="/dev/dsk/foo@3,0\\:c".
+Fio normally makes up a \fBfilename\fR based on the job name, thread number, and
+file number (see \fBfilename_format\fR). If you want to share files
+between threads in a job or several
+jobs with fixed file paths, specify a \fBfilename\fR for each of them to override
+the default. If the ioengine is file based, you can specify a number of files
+by separating the names with a ':' colon. So if you wanted a job to open
+`/dev/sda' and `/dev/sdb' as the two working files, you would use
+`filename=/dev/sda:/dev/sdb'. This also means that whenever this option is
+specified, \fBnrfiles\fR is ignored. The size of regular files specified
+by this option will be \fBsize\fR divided by number of files unless an
+explicit size is specified by \fBfilesize\fR.
+.RS
+.P
+Each colon and backslash in the wanted path must be escaped with a '\\'
+character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you
+would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is
+`F:\\\\filename' then you would use `filename=F\\:\\\\filename'.
+.P
+On Windows, disk devices are accessed as `\\\\\\\\.\\\\PhysicalDrive0' for
+the first device, `\\\\\\\\.\\\\PhysicalDrive1' for the second etc.
+Note: Windows and FreeBSD prevent write access to areas
+of the disk containing in\-use data (e.g. filesystems).
+.P
+The filename `\-' is a reserved name, meaning *stdin* or *stdout*. Which
+of the two depends on the read/write direction set.
+.RE
  .TP
  .BI filename_format \fR=\fPstr
-If sharing multiple files between jobs, it is usually necessary to have
-fio generate the exact names that you want. By default, fio will name a file
+If sharing multiple files between jobs, it is usually necessary to have fio
+generate the exact names that you want. By default, fio will name a file
  based on the default file format specification of
-\fBjobname.jobnumber.filenumber\fP. With this option, that can be
+`jobname.jobnumber.filenumber'. With this option, that can be
  customized. Fio will recognize and replace the following keywords in this
  string:
  .RS
@@ -297,130 +571,250 @@ The incremental number of the worker thread or process.
  The incremental number of the file for that worker thread or process.
  .RE
  .P
-To have dependent jobs share a set of files, this option can be set to
-have fio generate filenames that are shared between the two. For instance,
-if \fBtestfiles.$filenum\fR is specified, file number 4 for any job will
-be named \fBtestfiles.4\fR. The default of \fB$jobname.$jobnum.$filenum\fR
+To have dependent jobs share a set of files, this option can be set to have
+fio generate filenames that are shared between the two. For instance, if
+`testfiles.$filenum' is specified, file number 4 for any job will be
+named `testfiles.4'. The default of `$jobname.$jobnum.$filenum'
  will be used if no other format specifier is given.
  .RE
-.P
  .TP
  .BI unique_filename \fR=\fPbool
-To avoid collisions between networked clients, fio defaults to prefixing
-any generated filenames (with a directory specified) with the source of
-the client connecting. To disable this behavior, set this option to 0.
+To avoid collisions between networked clients, fio defaults to prefixing any
+generated filenames (with a directory specified) with the source of the
+client connecting. To disable this behavior, set this option to 0.
+.TP
+.BI opendir \fR=\fPstr
+Recursively open any files below directory \fIstr\fR.
  .TP
  .BI lockfile \fR=\fPstr
-Fio defaults to not locking any files before it does IO to them. If a file or
-file descriptor is shared, fio can serialize IO to that file to make the end
-result consistent. This is usual for emulating real workloads that share files.
-The lock modes are:
+Fio defaults to not locking any files before it does I/O to them. If a file
+or file descriptor is shared, fio can serialize I/O to that file to make the
+end result consistent. This is usual for emulating real workloads that share
+files. The lock modes are:
  .RS
  .RS
  .TP
  .B none
-No locking. This is the default.
+No locking. The default.
  .TP
  .B exclusive
-Only one thread or process may do IO at a time, excluding all others.
+Only one thread or process may do I/O at a time, excluding all others.
  .TP
  .B readwrite
-Read-write locking on the file. Many readers may access the file at the same
-time, but writes get exclusive access.
+Read\-write locking on the file. Many readers may
+access the file at the same time, but writes get exclusive access.
  .RE
  .RE
-.P
-.BI opendir \fR=\fPstr
-Recursively open any files below directory \fIstr\fR.
  .TP
-.BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr
-Type of I/O pattern.  Accepted values are:
-.RS
-.RS
+.BI nrfiles \fR=\fPint
+Number of files to use for this job. Defaults to 1. The size of files
+will be \fBsize\fR divided by this unless explicit size is specified by
+\fBfilesize\fR. Files are created for each thread separately, and each
+file will have a file number within its name by default, as explained in
+\fBfilename\fR section.
  .TP
-.B read
-Sequential reads.
+.BI openfiles \fR=\fPint
+Number of files to keep open at the same time. Defaults to the same as
+\fBnrfiles\fR, can be set smaller to limit the number of simultaneous
+opens.
  .TP
-.B write
-Sequential writes.
+.BI file_service_type \fR=\fPstr
+Defines how fio decides which file from a job to service next. The following
+types are defined:
+.RS
+.RS
  .TP
-.B trim
-Sequential trims (Linux block devices only).
+.B random
+Choose a file at random.
  .TP
-.B randread
-Random reads.
+.B roundrobin
+Round robin over opened files. This is the default.
  .TP
-.B randwrite
-Random writes.
+.B sequential
+Finish one file before moving on to the next. Multiple files can
+still be open depending on \fBopenfiles\fR.
  .TP
-.B randtrim
-Random trims (Linux block devices only).
+.B zipf
+Use a Zipf distribution to decide what file to access.
  .TP
-.B rw, readwrite
-Mixed sequential reads and writes.
+.B pareto
+Use a Pareto distribution to decide what file to access.
  .TP
-.B randrw
-Mixed random reads and writes.
+.B normal
+Use a Gaussian (normal) distribution to decide what file to access.
  .TP
-.B trimwrite
-Sequential trim and write mixed workload. Blocks will be trimmed first, then
-the same blocks will be written to.
+.B gauss
+Alias for normal.
  .RE
  .P
-Fio defaults to read if the option is not specified.
-For mixed I/O, the default split is 50/50. For certain types of io the result
-may still be skewed a bit, since the speed may be different. It is possible to
-specify a number of IO's to do before getting a new offset, this is done by
-appending a `:\fI<nr>\fR to the end of the string given. For a random read, it
-would look like \fBrw=randread:8\fR for passing in an offset modifier with a
-value of 8. If the postfix is used with a sequential IO pattern, then the value
-specified will be added to the generated offset for each IO. For instance,
-using \fBrw=write:4k\fR will skip 4k for every write. It turns sequential IO
-into sequential IO with holes. See the \fBrw_sequencer\fR option.
+For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be appended to
+tell fio how many I/Os to issue before switching to a new file. For example,
+specifying `file_service_type=random:8' would cause fio to issue
+8 I/Os before selecting a new file at random. For the non\-uniform
+distributions, a floating point postfix can be given to influence how the
+distribution is skewed. See \fBrandom_distribution\fR for a description
+of how that would work.
  .RE
  .TP
-.BI rw_sequencer \fR=\fPstr
-If an offset modifier is given by appending a number to the \fBrw=<str>\fR line,
-then this option controls how that number modifies the IO offset being
-generated. Accepted values are:
-.RS
-.RS
+.BI ioscheduler \fR=\fPstr
+Attempt to switch the device hosting the file to the specified I/O scheduler
+before running.
  .TP
-.B sequential
-Generate sequential offset
+.BI create_serialize \fR=\fPbool
+If true, serialize the file creation for the jobs. This may be handy to
+avoid interleaving of data files, which may greatly depend on the filesystem
+used and even the number of processors in the system. Default: true.
  .TP
-.B identical
-Generate the same offset
-.RE
-.P
-\fBsequential\fR is only useful for random IO, where fio would normally
-generate a new random offset for every IO. If you append eg 8 to randread, you
-would get a new random offset for every 8 IO's. The result would be a seek for
-only every 8 IO's, instead of for every IO. Use \fBrw=randread:8\fR to specify
-that. As sequential IO is already sequential, setting \fBsequential\fR for that
-would not result in any differences.  \fBidentical\fR behaves in a similar
-fashion, except it sends the same offset 8 number of times before generating a
-new offset.
+.BI create_fsync \fR=\fPbool
+\fBfsync\fR\|(2) the data file after creation. This is the default.
+.TP
+.BI create_on_open \fR=\fPbool
+If true, don't pre\-create files but allow the job's open() to create a file
+when it's time to do I/O. Default: false \-\- pre\-create all necessary files
+when the job starts.
+.TP
+.BI create_only \fR=\fPbool
+If true, fio will only run the setup phase of the job. If files need to be
+laid out or updated on disk, only that will be done \-\- the actual job contents
+are not executed. Default: false.
+.TP
+.BI allow_file_create \fR=\fPbool
+If true, fio is permitted to create files as part of its workload. If this
+option is false, then fio will error out if
+the files it needs to use don't already exist. Default: true.
+.TP
+.BI allow_mounted_write \fR=\fPbool
+If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+to what appears to be a mounted device or partition. This should help catch
+creating inadvertently destructive tests, not realizing that the test will
+destroy data on the mounted file system. Note that some platforms don't allow
+writing against a mounted device regardless of this option. Default: false.
+.TP
+.BI pre_read \fR=\fPbool
+If this is given, files will be pre\-read into memory before starting the
+given I/O operation. This will also clear the \fBinvalidate\fR flag,
+since it is pointless to pre\-read and then drop the cache. This will only
+work for I/O engines that are seek\-able, since they allow you to read the
+same data multiple times. Thus it will not work on non\-seekable I/O engines
+(e.g. network, splice). Default: false.
+.TP
+.BI unlink \fR=\fPbool
+Unlink the job files when done. Not the default, as repeated runs of that
+job would then waste time recreating the file set again and again. Default:
+false.
+.TP
+.BI unlink_each_loop \fR=\fPbool
+Unlink job files after each iteration or loop. Default: false.
+.TP
+.BI zonesize \fR=\fPint
+Divide a file into zones of the specified size. See \fBzoneskip\fR.
+.TP
+.BI zonerange \fR=\fPint
+Give size of an I/O zone. See \fBzoneskip\fR.
+.TP
+.BI zoneskip \fR=\fPint
+Skip the specified number of bytes when \fBzonesize\fR data has been
+read. The two zone options can be used to only do I/O on zones of a file.
+.SS "I/O type"
+.TP
+.BI direct \fR=\fPbool
+If value is true, use non\-buffered I/O. This is usually O_DIRECT. Note that
+OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
+ioengines don't support direct I/O. Default: false.
+.TP
+.BI atomic \fR=\fPbool
+If value is true, attempt to use atomic direct I/O. Atomic writes are
+guaranteed to be stable once acknowledged by the operating system. Only
+Linux supports O_ATOMIC right now.
+.TP
+.BI buffered \fR=\fPbool
+If value is true, use buffered I/O. This is the opposite of the
+\fBdirect\fR option. Defaults to true.
+.TP
+.BI readwrite \fR=\fPstr "\fR,\fP rw" \fR=\fPstr
+Type of I/O pattern. Accepted values are:
+.RS
+.RS
+.TP
+.B read
+Sequential reads.
+.TP
+.B write
+Sequential writes.
+.TP
+.B trim
+Sequential trims (Linux block devices only).
+.TP
+.B randread
+Random reads.
+.TP
+.B randwrite
+Random writes.
+.TP
+.B randtrim
+Random trims (Linux block devices only).
+.TP
+.B rw,readwrite
+Sequential mixed reads and writes.
+.TP
+.B randrw
+Random mixed reads and writes.
+.TP
+.B trimwrite
+Sequential trim+write sequences. Blocks will be trimmed first,
+then the same blocks will be written to.
  .RE
  .P
+Fio defaults to read if the option is not specified. For the mixed I/O
+types, the default is to split them 50/50. For certain types of I/O the
+result may still be skewed a bit, since the speed may be different.
+.P
+It is possible to specify the number of I/Os to do before getting a new
+offset by appending `:<nr>' to the end of the string given. For a
+random read, it would look like `rw=randread:8' for passing in an offset
+modifier with a value of 8. If the suffix is used with a sequential I/O
+pattern, then the `<nr>' value specified will be added to the generated
+offset for each I/O turning sequential I/O into sequential I/O with holes.
+For instance, using `rw=write:4k' will skip 4k for every write. Also see
+the \fBrw_sequencer\fR option.
+.RE
  .TP
-.BI kb_base \fR=\fPint
-The base unit for a kilobyte. The defacto base is 2^10, 1024.  Storage
-manufacturers like to use 10^3 or 1000 as a base ten unit instead, for obvious
-reasons. Allowed values are 1024 or 1000, with 1024 being the default.
+.BI rw_sequencer \fR=\fPstr
+If an offset modifier is given by appending a number to the `rw=\fIstr\fR'
+line, then this option controls how that number modifies the I/O offset
+being generated. Accepted values are:
+.RS
+.RS
+.TP
+.B sequential
+Generate sequential offset.
+.TP
+.B identical
+Generate the same offset.
+.RE
+.P
+\fBsequential\fR is only useful for random I/O, where fio would normally
+generate a new random offset for every I/O. If you append e.g. 8 to randread,
+you would get a new random offset for every 8 I/Os. The result would be a
+seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8'
+to specify that. As sequential I/O is already sequential, setting
+\fBsequential\fR for that would not result in any differences. \fBidentical\fR
+behaves in a similar fashion, except it sends the same offset 8
+times before generating a new offset.
+.RE
  .TP
  .BI unified_rw_reporting \fR=\fPbool
  Fio normally reports statistics on a per data direction basis, meaning that
-reads, writes, and trims are accounted and reported separately. If this option is
-set fio sums the results and reports them as "mixed" instead.
+reads, writes, and trims are accounted and reported separately. If this
+option is set, fio sums the results and reports them as "mixed" instead.
  .TP
  .BI randrepeat \fR=\fPbool
-Seed the random number generator used for random I/O patterns in a predictable
-way so the pattern is repeatable across runs.  Default: true.
+Seed the random number generator used for random I/O patterns in a
+predictable way so the pattern is repeatable across runs. Default: true.
  .TP
  .BI allrandrepeat \fR=\fPbool
  Seed all random number generators in a predictable way so results are
-repeatable across runs.  Default: false.
+repeatable across runs. Default: false.
  .TP
  .BI randseed \fR=\fPint
  Seed the random number generators based on this seed value, to be able to
@@ -428,30 +822,36 @@ control what sequence of output is being generated. If not set, the random
  sequence depends on the \fBrandrepeat\fR setting.
  .TP
  .BI fallocate \fR=\fPstr
-Whether pre-allocation is performed when laying down files. Accepted values
-are:
+Whether pre\-allocation is performed when laying down files.
+Accepted values are:
  .RS
  .RS
  .TP
  .B none
-Do not pre-allocate space.
+Do not pre\-allocate space.
+.TP
+.B native
+Use a platform's native pre\-allocation call but fall back to
+\fBnone\fR behavior if it fails/is not implemented.
  .TP
  .B posix
-Pre-allocate via \fBposix_fallocate\fR\|(3).
+Pre\-allocate via \fBposix_fallocate\fR\|(3).
  .TP
  .B keep
-Pre-allocate via \fBfallocate\fR\|(2) with FALLOC_FL_KEEP_SIZE set.
+Pre\-allocate via \fBfallocate\fR\|(2) with
+FALLOC_FL_KEEP_SIZE set.
  .TP
  .B 0
-Backward-compatible alias for 'none'.
+Backward\-compatible alias for \fBnone\fR.
  .TP
  .B 1
-Backward-compatible alias for 'posix'.
+Backward\-compatible alias for \fBposix\fR.
  .RE
  .P
-May not be available on all supported platforms. 'keep' is only
-available on Linux. If using ZFS on Solaris this must be set to 'none'
-because ZFS doesn't support it. Default: 'posix'.
+May not be available on all supported platforms. \fBkeep\fR is only available
+on Linux. If using ZFS on Solaris this cannot be set to \fBposix\fR
+because ZFS doesn't support pre\-allocation. Default: \fBnative\fR if any
+pre\-allocation methods are available, \fBnone\fR if not.
  .RE
  .TP
  .BI fadvise_hint \fR=\fPstr
@@ -465,239 +865,569 @@ Backwards compatible hint for "no hint".
  .TP
  .B 1
  Backwards compatible hint for "advise with fio workload type". This
-uses \fBFADV_RANDOM\fR for a random workload, and \fBFADV_SEQUENTIAL\fR
+uses FADV_RANDOM for a random workload, and FADV_SEQUENTIAL
  for a sequential workload.
  .TP
  .B sequential
-Advise using \fBFADV_SEQUENTIAL\fR
+Advise using FADV_SEQUENTIAL.
  .TP
  .B random
-Advise using \fBFADV_RANDOM\fR
+Advise using FADV_RANDOM.
  .RE
  .RE
  .TP
-.BI fadvise_stream \fR=\fPint
-Use \fBposix_fadvise\fR\|(2) to advise the kernel what stream ID the
-writes issued belong to. Only supported on Linux. Note, this option
-may change going forward.
+.BI write_hint \fR=\fPstr
+Use \fBfcntl\fR\|(2) to advise the kernel what life time to expect
+from a write. Only supported on Linux, as of version 4.13. Accepted
+values are:
+.RS
+.RS
  .TP
-.BI size \fR=\fPint
-Total size of I/O for this job.  \fBfio\fR will run until this many bytes have
-been transferred, unless limited by other options (\fBruntime\fR, for instance,
-or increased/descreased by \fBio_size\fR). Unless \fBnrfiles\fR and
-\fBfilesize\fR options are given, this amount will be divided between the
-available files for the job. If not set, fio will use the full size of the
-given files or devices. If the files do not exist, size must be given. It is
-also possible to give size as a percentage between 1 and 100. If size=20% is
-given, fio will use 20% of the full size of the given files or devices.
+.B none
+No particular life time associated with this file.
  .TP
-.BI io_size \fR=\fPint "\fR,\fB io_limit \fR=\fPint
-Normally fio operates within the region set by \fBsize\fR, which means that
-the \fBsize\fR option sets both the region and size of IO to be performed.
-Sometimes that is not what you want. With this option, it is possible to
-define just the amount of IO that fio should do. For instance, if \fBsize\fR
-is set to 20G and \fBio_limit\fR is set to 5G, fio will perform IO within
-the first 20G but exit when 5G have been done. The opposite is also
-possible - if \fBsize\fR is set to 20G, and \fBio_size\fR is set to 40G, then
-fio will do 40G of IO within the 0..20G region.
+.B short
+Data written to this file has a short life time.
  .TP
-.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
-Sets size to something really large and waits for ENOSPC (no space left on
-device) as the terminating condition. Only makes sense with sequential write.
-For a read workload, the mount point will be filled first then IO started on
-the result. This option doesn't make sense if operating on a raw device node,
-since the size of that is already known by the file system. Additionally,
-writing beyond end-of-device will not return ENOSPC there.
-.TP
-.BI filesize \fR=\fPirange
-Individual file sizes. May be a range, in which case \fBfio\fR will select sizes
-for files at random within the given range, limited to \fBsize\fR in total (if
-that is given). If \fBfilesize\fR is not specified, each created file is the
-same size.
+.B medium
+Data written to this file has a medium life time.
  .TP
-.BI file_append \fR=\fPbool
-Perform IO after the end of the file. Normally fio will operate within the
-size of a file. If this option is set, then fio will append to the file
-instead. This has identical behavior to setting \fRoffset\fP to the size
-of a file. This option is ignored on non-regular files.
+.B long
+Data written to this file has a long life time.
+.TP
+.B extreme
+Data written to this file has a very long life time.
+.RE
+.P
+The values are all relative to each other, and no absolute meaning
+should be associated with them.
+.RE
+.TP
+.BI offset \fR=\fPint
+Start I/O at the provided offset in the file, given as either a fixed size in
+bytes or a percentage. If a percentage is given, the next \fBblockalign\fR\-ed
+offset will be used. Data before the given offset will not be touched. This
+effectively caps the file size at `real_size \- offset'. Can be combined with
+\fBsize\fR to constrain the start and end range of the I/O workload.
+A percentage can be specified by a number between 1 and 100 followed by '%',
+for example, `offset=20%' to specify 20%.
+.TP
+.BI offset_increment \fR=\fPint
+If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
+* thread_number', where the thread number is a counter that starts at 0 and
+is incremented for each sub\-job (i.e. when \fBnumjobs\fR option is
+specified). This option is useful if there are several jobs which are
+intended to operate on a file in parallel disjoint segments, with even
+spacing between the starting points.
+.TP
+.BI number_ios \fR=\fPint
+Fio will normally perform I/Os until it has exhausted the size of the region
+set by \fBsize\fR, or if it exhausts the allocated time (or hits an error
+condition). With this setting, the range/size can be set independently of
+the number of I/Os to perform. When fio reaches this number, it will exit
+normally and report status. Note that this does not extend the amount of I/O
+that will be done, it will only stop fio if this condition is met before
+other end\-of\-job criteria.
+.TP
+.BI fsync \fR=\fPint
+If writing to a file, issue an \fBfsync\fR\|(2) (or its equivalent) of
+the dirty data for every number of blocks given. For example, if you give 32
+as a parameter, fio will sync the file after every 32 writes issued. If fio is
+using non\-buffered I/O, we may not sync the file. The exception is the sg
+I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
+means fio does not periodically issue and wait for a sync to complete. Also
+see \fBend_fsync\fR and \fBfsync_on_close\fR.
+.TP
+.BI fdatasync \fR=\fPint
+Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
+not metadata blocks. In Windows, FreeBSD, and DragonFlyBSD there is no
+\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
+Defaults to 0, which means fio does not periodically issue and wait for a
+data\-only sync to complete.
+.TP
+.BI write_barrier \fR=\fPint
+Make every N\-th write a barrier write.
+.TP
+.BI sync_file_range \fR=\fPstr:int
+Use \fBsync_file_range\fR\|(2) for every \fIint\fR number of write
+operations. Fio will track range of writes that have happened since the last
+\fBsync_file_range\fR\|(2) call. \fIstr\fR can currently be one or more of:
+.RS
+.RS
+.TP
+.B wait_before
+SYNC_FILE_RANGE_WAIT_BEFORE
+.TP
+.B write
+SYNC_FILE_RANGE_WRITE
+.TP
+.B wait_after
+SYNC_FILE_RANGE_WAIT_AFTER
+.RE
+.P
+So if you do `sync_file_range=wait_before,write:8', fio would use
+`SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE' for every 8
+writes. Also see the \fBsync_file_range\fR\|(2) man page. This option is
+Linux specific.
+.RE
+.TP
+.BI overwrite \fR=\fPbool
+If true, writes to a file will always overwrite existing data. If the file
+doesn't already exist, it will be created before the write phase begins. If
+the file exists and is large enough for the specified write phase, nothing
+will be done. Default: false.
+.TP
+.BI end_fsync \fR=\fPbool
+If true, \fBfsync\fR\|(2) file contents when a write stage has completed.
+Default: false.
+.TP
+.BI fsync_on_close \fR=\fPbool
+If true, fio will \fBfsync\fR\|(2) a dirty file on close. This differs
+from \fBend_fsync\fR in that it will happen on every file close, not
+just at the end of the job. Default: false.
+.TP
+.BI rwmixread \fR=\fPint
+Percentage of a mixed workload that should be reads. Default: 50.
+.TP
+.BI rwmixwrite \fR=\fPint
+Percentage of a mixed workload that should be writes. If both
+\fBrwmixread\fR and \fBrwmixwrite\fR are given and the values do not
+add up to 100%, the latter of the two will be used to override the
+first. This may interfere with a given rate setting, if fio is asked to
+limit reads or writes to a certain rate. If that is the case, then the
+distribution may be skewed. Default: 50.
+.TP
+.BI random_distribution \fR=\fPstr:float[,str:float][,str:float]
+By default, fio will use a completely uniform random distribution when asked
+to perform random I/O. Sometimes it is useful to skew the distribution in
+specific ways, ensuring that some parts of the data are hotter than others.
+fio includes the following distribution models:
+.RS
+.RS
+.TP
+.B random
+Uniform random distribution
+.TP
+.B zipf
+Zipf distribution
+.TP
+.B pareto
+Pareto distribution
+.TP
+.B normal
+Normal (Gaussian) distribution
+.TP
+.B zoned
+Zoned random distribution
+.RE
+.P
+When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
+needed to define the access pattern. For \fBzipf\fR, this is the `Zipf theta'.
+For \fBpareto\fR, it's the `Pareto power'. Fio includes a test
+program, \fBfio\-genzipf\fR, that can be used to visualize what the given input
+values will yield in terms of hit rates. If you wanted to use \fBzipf\fR with
+a `theta' of 1.2, you would use `random_distribution=zipf:1.2' as the
+option. If a non\-uniform model is used, fio will disable use of the random
+map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is
+supplied as a value between 0 and 100.
+.P
+For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
+access that should fall within what range of the file or device. For
+example, given criteria of:
+.RS
+.P
+.PD 0
+60% of accesses should be to the first 10%
+.P
+30% of accesses should be to the next 20%
+.P
+8% of accesses should be to the next 30%
+.P
+2% of accesses should be to the next 40%
+.PD
+.RE
+.P
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.RS
+.P
+random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+similarly to how \fBbssplit\fR works for setting ranges and percentages
+of block sizes. Like \fBbssplit\fR, it's possible to specify separate
+zones for reads, writes, and trims. If just one set is given, it'll apply to
+all of them.
+.RE
+.TP
+.BI percentage_random \fR=\fPint[,int][,int]
+For a random workload, set how big a percentage should be random. This
+defaults to 100%, in which case the workload is fully random. It can be set
+anywhere from 0 to 100. Setting it to 0 would make the workload fully
+sequential. Any setting in between will result in a random mix of sequential
+and random I/O, at the given percentages. Comma\-separated values may be
+specified for reads, writes, and trims as described in \fBblocksize\fR.
+.TP
+.BI norandommap
+Normally fio will cover every block of the file when doing random I/O. If
+this option is given, fio will just get a new random offset without looking
+at past I/O history. This means that some blocks may not be read or written,
+and that some blocks may be read/written more than once. If this option is
+used with \fBverify\fR and multiple blocksizes (via \fBbsrange\fR),
+only intact blocks are verified, i.e., partially\-overwritten blocks are
+ignored.
+.TP
+.BI softrandommap \fR=\fPbool
+See \fBnorandommap\fR. If fio runs with the random block map enabled and
+it fails to allocate the map, if this option is set it will continue without
+a random block map. As coverage will not be as complete as with random maps,
+this option is disabled by default.
+.TP
+.BI random_generator \fR=\fPstr
+Fio supports the following engines for generating I/O offsets for random I/O:
+.RS
+.RS
+.TP
+.B tausworthe
+Strong 2^88 cycle random number generator.
+.TP
+.B lfsr
+Linear feedback shift register generator.
+.TP
+.B tausworthe64
+Strong 64\-bit 2^258 cycle random number generator.
+.RE
+.P
+\fBtausworthe\fR is a strong random number generator, but it requires tracking
+on the side if we want to ensure that blocks are only read or written
+once. \fBlfsr\fR guarantees that we never generate the same offset twice, and
+it's also less computationally expensive. It's not a true random generator,
+however, though for I/O purposes it's typically good enough. \fBlfsr\fR only
+works with single block sizes, not with workloads that use multiple block
+sizes. If used with such a workload, fio may read or write some blocks
+multiple times. The default value is \fBtausworthe\fR, unless the required
+space exceeds 2^32 blocks. If it does, then \fBtausworthe64\fR is
+selected automatically.
+.RE
+.SS "Block size"
  .TP
  .BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int]
-The block size in bytes for I/O units.  Default: 4096.
-A single value applies to reads, writes, and trims.
-Comma-separated values may be specified for reads, writes, and trims.
-Empty values separated by commas use the default value. A value not
-terminated in a comma applies to subsequent types.
-.nf
-Examples:
-bs=256k    means 256k for reads, writes and trims
-bs=8k,32k  means 8k for reads, 32k for writes and trims
-bs=8k,32k, means 8k for reads, 32k for writes, and default for trims
-bs=,8k     means default for reads, 8k for writes and trims
-bs=,8k,    means default for reads, 8k for writes, and default for writes
-.fi
+The block size in bytes used for I/O units. Default: 4096. A single value
+applies to reads, writes, and trims. Comma\-separated values may be
+specified for reads, writes, and trims. A value not terminated in a comma
+applies to subsequent types. Examples:
+.RS
+.RS
+.P
+.PD 0
+bs=256k        means 256k for reads, writes and trims.
+.P
+bs=8k,32k      means 8k for reads, 32k for writes and trims.
+.P
+bs=8k,32k,     means 8k for reads, 32k for writes, and default for trims.
+.P
+bs=,8k         means default for reads, 8k for writes and trims.
+.P
+bs=,8k,        means default for reads, 8k for writes, and default for trims.
+.PD
+.RE
+.RE
  .TP
  .BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange]
-A range of block sizes in bytes for I/O units.
-The issued I/O unit will always be a multiple of the minimum size, unless
+A range of block sizes in bytes for I/O units. The issued I/O unit will
+always be a multiple of the minimum size, unless
  \fBblocksize_unaligned\fR is set.
-Comma-separated ranges may be specified for reads, writes, and trims
-as described in \fBblocksize\fR.
-.nf
-Example: bsrange=1k-4k,2k-8k.
-.fi
+Comma\-separated ranges may be specified for reads, writes, and trims as
+described in \fBblocksize\fR. Example:
+.RS
+.RS
+.P
+bsrange=1k\-4k,2k\-8k
+.RE
+.RE
  .TP
  .BI bssplit \fR=\fPstr[,str][,str]
-This option allows even finer grained control of the block sizes issued,
-not just even splits between them. With this option, you can weight various
-block sizes for exact control of the issued IO for a job that has mixed
-block sizes. The format of the option is bssplit=blocksize/percentage,
-optionally adding as many definitions as needed separated by a colon.
-Example: bssplit=4k/10:64k/50:32k/40 would issue 50% 64k blocks, 10% 4k
-blocks and 40% 32k blocks. \fBbssplit\fR also supports giving separate
-splits to reads, writes, and trims.
-Comma-separated values may be specified for reads, writes, and trims
-as described in \fBblocksize\fR.
-.TP
-.B blocksize_unaligned\fR,\fB bs_unaligned
-If set, fio will issue I/O units with any size within \fBblocksize_range\fR,
-not just multiples of the minimum size.  This typically won't
-work with direct I/O, as that normally requires sector alignment.
+Sometimes you want even finer grained control of the block sizes issued, not
+just an even split between them. This option allows you to weight various
+block sizes, so that you are able to define a specific amount of block sizes
+issued. The format for this option is:
+.RS
+.RS
+.P
+bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+for as many block sizes as needed. So if you want to define a workload that
+has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write:
+.RS
+.P
+bssplit=4k/10:64k/50:32k/40
+.RE
+.P
+Ordering does not matter. If the percentage is left blank, fio will fill in
+the remaining values evenly. So a bssplit option like this one:
+.RS
+.P
+bssplit=4k/50:1k/:32k/
+.RE
+.P
+would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always add up
+to 100, if bssplit is given a range that adds up to more, it will error out.
+.P
+Comma\-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
+.P
+If you want a workload that has 50% 2k reads and 50% 4k reads, while having
+90% 4k writes and 10% 8k writes, you would specify:
+.RS
+.P
+bssplit=2k/50:4k/50,4k/90:8k/10
+.RE
+.RE
+.TP
+.BI blocksize_unaligned "\fR,\fB bs_unaligned"
+If set, fio will issue I/O units with any size within
+\fBblocksize_range\fR, not just multiples of the minimum size. This
+typically won't work with direct I/O, as that normally requires sector
+alignment.
  .TP
  .BI bs_is_seq_rand \fR=\fPbool
-If this option is set, fio will use the normal read,write blocksize settings as
-sequential,random blocksize settings instead. Any random read or write will
-use the WRITE blocksize settings, and any sequential read or write will use
-the READ blocksize settings.
+If this option is set, fio will use the normal read,write blocksize settings
+as sequential,random blocksize settings instead. Any random read or write
+will use the WRITE blocksize settings, and any sequential read or write will
+use the READ blocksize settings.
  .TP
  .BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int]
-Boundary to which fio will align random I/O units. Default: \fBblocksize\fR.
-Minimum alignment is typically 512b for using direct IO, though it usually
-depends on the hardware block size.  This option is mutually exclusive with
-using a random map for files, so it will turn off that option.
-Comma-separated values may be specified for reads, writes, and trims
-as described in \fBblocksize\fR.
-.TP
-.B zero_buffers
+Boundary to which fio will align random I/O units. Default:
+\fBblocksize\fR. Minimum alignment is typically 512b for using direct
+I/O, though it usually depends on the hardware block size. This option is
+mutually exclusive with using a random map for files, so it will turn off
+that option. Comma\-separated values may be specified for reads, writes, and
+trims as described in \fBblocksize\fR.
+.SS "Buffers and memory"
+.TP
+.BI zero_buffers
  Initialize buffers with all zeros. Default: fill buffers with random data.
  .TP
-.B refill_buffers
-If this option is given, fio will refill the IO buffers on every submit. The
-default is to only fill it at init time and reuse that data. Only makes sense
-if zero_buffers isn't specified, naturally. If data verification is enabled,
-refill_buffers is also automatically enabled.
+.BI refill_buffers
+If this option is given, fio will refill the I/O buffers on every
+submit. The default is to only fill it at init time and reuse that
+data. Only makes sense if zero_buffers isn't specified, naturally. If data
+verification is enabled, \fBrefill_buffers\fR is also automatically enabled.
  .TP
  .BI scramble_buffers \fR=\fPbool
  If \fBrefill_buffers\fR is too costly and the target is using data
-deduplication, then setting this option will slightly modify the IO buffer
-contents to defeat normal de-dupe attempts. This is not enough to defeat
-more clever block compression attempts, but it will stop naive dedupe
-of blocks. Default: true.
+deduplication, then setting this option will slightly modify the I/O buffer
+contents to defeat normal de\-dupe attempts. This is not enough to defeat
+more clever block compression attempts, but it will stop naive dedupe of
+blocks. Default: true.
  .TP
  .BI buffer_compress_percentage \fR=\fPint
-If this is set, then fio will attempt to provide IO buffer content (on WRITEs)
-that compress to the specified level. Fio does this by providing a mix of
-random data and a fixed pattern. The fixed pattern is either zeroes, or the
-pattern specified by \fBbuffer_pattern\fR. If the pattern option is used, it
-might skew the compression ratio slightly. Note that this is per block size
-unit, for file/disk wide compression level that matches this setting. Note
-that this is per block size unit, for file/disk wide compression level that
-matches this setting, you'll also want to set refill_buffers.
+If this is set, then fio will attempt to provide I/O buffer content (on
+WRITEs) that compresses to the specified level. Fio does this by providing a
+mix of random data and a fixed pattern. The fixed pattern is either zeros,
+or the pattern specified by \fBbuffer_pattern\fR. If the pattern option
+is used, it might skew the compression ratio slightly. Note that this is per
+block size unit; for a file/disk wide compression level that matches this
+setting, you'll also want to set \fBrefill_buffers\fR.
  .TP
  .BI buffer_compress_chunk \fR=\fPint
-See \fBbuffer_compress_percentage\fR. This setting allows fio to manage how
-big the ranges of random data and zeroed data is. Without this set, fio will
-provide \fBbuffer_compress_percentage\fR of blocksize random data, followed by
-the remaining zeroed. With this set to some chunk size smaller than the block
-size, fio can alternate random and zeroed data throughout the IO buffer.
+See \fBbuffer_compress_percentage\fR. This setting allows fio to manage
+how big the ranges of random data and zeroed data are. Without this set, fio
+will provide \fBbuffer_compress_percentage\fR of blocksize random data,
+followed by the remaining zeroed. With this set to some chunk size smaller
+than the block size, fio can alternate random and zeroed data throughout the
+I/O buffer.
  .TP
  .BI buffer_pattern \fR=\fPstr
-If set, fio will fill the IO buffers with this pattern. If not set, the contents
-of IO buffers is defined by the other options related to buffer contents. The
-setting can be any pattern of bytes, and can be prefixed with 0x for hex
-values. It may also be a string, where the string must then be wrapped with
-"", e.g.:
-.RS
-.RS
-\fBbuffer_pattern\fR="abcd"
+If set, fio will fill the I/O buffers with this pattern or with the contents
+of a file. If not set, the contents of I/O buffers are defined by the other
+options related to buffer contents. The setting can be any pattern of bytes,
+and can be prefixed with 0x for hex values. It may also be a string, where
+the string must then be wrapped with "". Or it may also be a filename,
+where the filename must be wrapped with '' in which case the file is
+opened and read. Note that not all the file contents will be read if that
+would cause the buffers to overflow. So, for example:
  .RS
-or
-.RE
-\fBbuffer_pattern\fR=-12
  .RS
-or
-.RE
-\fBbuffer_pattern\fR=0xdeadface
+.P
+.PD 0
+buffer_pattern='filename'
+.P
+or:
+.P
+buffer_pattern="abcd"
+.P
+or:
+.P
+buffer_pattern=\-12
+.P
+or:
+.P
+buffer_pattern=0xdeadface
+.PD
  .RE
-.LP
+.P
  Also you can combine everything together in any order:
-.LP
  .RS
-\fBbuffer_pattern\fR=0xdeadface"abcd"-12
+.P
+buffer_pattern=0xdeadface"abcd"\-12'filename'
  .RE
  .RE
  .TP
  .BI dedupe_percentage \fR=\fPint
-If set, fio will generate this percentage of identical buffers when writing.
-These buffers will be naturally dedupable. The contents of the buffers depend
-on what other buffer compression settings have been set. It's possible to have
-the individual buffers either fully compressible, or not at all. This option
-only controls the distribution of unique buffers.
+If set, fio will generate this percentage of identical buffers when
+writing. These buffers will be naturally dedupable. The contents of the
+buffers depend on what other buffer compression settings have been set. It's
+possible to have the individual buffers either fully compressible, or not at
+all. This option only controls the distribution of unique buffers.
  .TP
-.BI nrfiles \fR=\fPint
-Number of files to use for this job.  Default: 1.
+.BI invalidate \fR=\fPbool
+Invalidate the buffer/page cache parts of the files to be used prior to
+starting I/O if the platform and file type support it. Defaults to true.
+This will be ignored if \fBpre_read\fR is also specified for the
+same job.
  .TP
-.BI openfiles \fR=\fPint
-Number of files to keep open at the same time.  Default: \fBnrfiles\fR.
+.BI sync \fR=\fPbool
+Use synchronous I/O for buffered writes. For the majority of I/O engines,
+this means using O_SYNC. Default: false.
  .TP
-.BI file_service_type \fR=\fPstr
-Defines how files to service are selected.  The following types are defined:
+.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
+Fio can use various types of memory as the I/O unit buffer. The allowed
+values are:
  .RS
  .RS
  .TP
-.B random
-Choose a file at random.
+.B malloc
+Use memory from \fBmalloc\fR\|(3) as the buffers. Default memory type.
  .TP
-.B roundrobin
-Round robin over opened files (default).
+.B shm
+Use shared memory as the buffers. Allocated through \fBshmget\fR\|(2).
  .TP
-.B sequential
-Do each file in the set sequentially.
+.B shmhuge
+Same as \fBshm\fR, but use huge pages as backing.
  .TP
-.B zipf
-Use a zipfian distribution to decide what file to access.
+.B mmap
+Use \fBmmap\fR\|(2) to allocate buffers. May either be anonymous memory, or can
+be file backed if a filename is given after the option. The format
+is `mem=mmap:/path/to/file'.
  .TP
-.B pareto
-Use a pareto distribution to decide what file to access.
+.B mmaphuge
+Use a memory mapped huge file as the buffer backing. Append filename
+after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file'.
  .TP
-.B gauss
-Use a gaussian (normal) distribution to decide what file to access.
+.B mmapshared
+Same as \fBmmap\fR, but use a MAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+The \fBioengine\fR must be \fBrdma\fR.
  .RE
  .P
-For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be
-appended to tell fio how many I/Os to issue before switching to a new file.
-For example, specifying \fBfile_service_type=random:8\fR would cause fio to
-issue \fI8\fR I/Os before selecting a new file at random. For the non-uniform
-distributions, a floating point postfix can be given to influence how the
-distribution is skewed. See \fBrandom_distribution\fR for a description of how
-that would work.
+The area allocated is a function of the maximum allowed bs size for the job,
+multiplied by the I/O depth given. Note that for \fBshmhuge\fR and
+\fBmmaphuge\fR to work, the system must have free huge pages allocated. This
+can normally be checked and set by reading/writing
+`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page
+is 4MiB in size. So to calculate the number of huge pages you need for a
+given job file, add up the I/O depth of all jobs (normally one unless
+\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide
+that number by the huge page size. You can see the size of the huge pages in
+`/proc/meminfo'. If no huge pages are allocated by having a non\-zero
+number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also
+see \fBhugepage\-size\fR.
+.P
+\fBmmaphuge\fR also needs to have hugetlbfs mounted and the file location
+should point there. So if it's mounted in `/huge', you would use
+`mem=mmaphuge:/huge/somefile'.
  .RE
  .TP
+.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint
+This indicates the memory alignment of the I/O memory buffers. Note that
+the given alignment is applied to the first I/O unit buffer, if using
+\fBiodepth\fR the alignment of the following buffers is given by the
+\fBbs\fR used. In other words, if using a \fBbs\fR that is a
+multiple of the page size in the system, all buffers will be aligned to
+this value. If using a \fBbs\fR that is not page aligned, the alignment
+of subsequent I/O memory buffers is the sum of the \fBiomem_align\fR and
+\fBbs\fR used.
+.TP
+.BI hugepage\-size \fR=\fPint
+Defines the size of a huge page. Must at least be equal to the system
+setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably
+always be a multiple of megabytes, so using `hugepage\-size=Xm' is the
+preferred way to set this to avoid setting a non\-pow\-2 bad value.
+.TP
+.BI lockmem \fR=\fPint
+Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
+simulate a smaller amount of memory. The amount specified is per worker.
+.SS "I/O size"
+.TP
+.BI size \fR=\fPint
+The total size of file I/O for each thread of this job. Fio will run until
+this many bytes have been transferred, unless runtime is limited by other options
+(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
+Fio will divide this size between the available files determined by options
+such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is
+specified by the job. If the result of division happens to be 0, the size is
+set to the physical size of the given files or devices if they exist.
+If this option is not specified, fio will use the full size of the given
+files or devices. If the files do not exist, size must be given. It is also
+possible to give size as a percentage between 1 and 100. If `size=20%' is
+given, fio will use 20% of the full size of the given files or devices.
+Can be combined with \fBoffset\fR to constrain the start and end range
+that I/O will be done within.
+.TP
+.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+Normally fio operates within the region set by \fBsize\fR, which means
+that the \fBsize\fR option sets both the region and size of I/O to be
+performed. Sometimes that is not what you want. With this option, it is
+possible to define just the amount of I/O that fio should do. For instance,
+if \fBsize\fR is set to 20GiB and \fBio_size\fR is set to 5GiB, fio
+will perform I/O within the first 20GiB but exit when 5GiB have been
+done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB,
+and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within
+the 0..20GiB region.
+.TP
+.BI filesize \fR=\fPirange(int)
+Individual file sizes. May be a range, in which case fio will select sizes
+for files at random within the given range and limited to \fBsize\fR in
+total (if that is given). If not given, each created file is the same size.
+This option overrides \fBsize\fR in terms of file size, which means
+this value is used as a fixed size or possible range of each file.
+.TP
+.BI file_append \fR=\fPbool
+Perform I/O after the end of the file. Normally fio will operate within the
+size of a file. If this option is set, then fio will append to the file
+instead. This has identical behavior to setting \fBoffset\fR to the size
+of a file. This option is ignored on non\-regular files.
+.TP
+.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
+Sets size to something really large and waits for ENOSPC (no space left on
+device) as the terminating condition. Only makes sense with sequential
+write. For a read workload, the mount point will be filled first then I/O
+started on the result. This option doesn't make sense if operating on a raw
+device node, since the size of that is already known by the file system.
+Additionally, writing beyond end\-of\-device will not return ENOSPC there.
+.SS "I/O engine"
+.TP
  .BI ioengine \fR=\fPstr
-Defines how the job issues I/O.  The following types are defined:
+Defines how the job issues I/O to the file. The following types are defined:
  .RS
  .RS
  .TP
  .B sync
-Basic \fBread\fR\|(2) or \fBwrite\fR\|(2) I/O.  \fBfseek\fR\|(2) is used to
-position the I/O location.
+Basic \fBread\fR\|(2) or \fBwrite\fR\|(2)
+I/O. \fBlseek\fR\|(2) is used to position the I/O location.
+See \fBfsync\fR and \fBfdatasync\fR for syncing write I/Os.
  .TP
  .B psync
-Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O.
-Default on all supported operating systems except for Windows.
+Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O. Default on
+all supported operating systems except for Windows.
  .TP
  .B vsync
-Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate queuing by
-coalescing adjacent IOs into a single submission.
+Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate
+queuing by coalescing adjacent I/Os into a single submission.
  .TP
  .B pvsync
  Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
@@ -706,10 +1436,14 @@ Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
  Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
  .TP
  .B libaio
-Linux native asynchronous I/O. This ioengine defines engine specific options.
+Linux native asynchronous I/O. Note that Linux may only support
+queued behavior with non\-buffered I/O (set `direct=1' or
+`buffered=0').
+This engine defines engine specific options.
  .TP
  .B posixaio
-POSIX asynchronous I/O using \fBaio_read\fR\|(3) and \fBaio_write\fR\|(3).
+POSIX asynchronous I/O using \fBaio_read\fR\|(3) and
+\fBaio_write\fR\|(3).
  .TP
  .B solarisaio
  Solaris native asynchronous I/O.
@@ -718,462 +1452,554 @@ Solaris native asynchronous I/O.
  Windows native asynchronous I/O. Default on Windows.
  .TP
  .B mmap
-File is memory mapped with \fBmmap\fR\|(2) and data copied using
-\fBmemcpy\fR\|(3).
+File is memory mapped with \fBmmap\fR\|(2) and data copied
+to/from using \fBmemcpy\fR\|(3).
  .TP
  .B splice
-\fBsplice\fR\|(2) is used to transfer the data and \fBvmsplice\fR\|(2) to
-transfer data from user-space to the kernel.
+\fBsplice\fR\|(2) is used to transfer the data and
+\fBvmsplice\fR\|(2) to transfer data from user space to the
+kernel.
  .TP
  .B sg
-SCSI generic sg v3 I/O. May be either synchronous using the SG_IO ioctl, or if
-the target is an sg character device, we use \fBread\fR\|(2) and
-\fBwrite\fR\|(2) for asynchronous I/O.
+SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+ioctl, or if the target is an sg character device we use
+\fBread\fR\|(2) and \fBwrite\fR\|(2) for asynchronous
+I/O. Requires \fBfilename\fR option to specify either block or
+character devices.
  .TP
  .B null
-Doesn't transfer any data, just pretends to.  Mainly used to exercise \fBfio\fR
-itself and for debugging and testing purposes.
+Doesn't transfer any data, just pretends to. This is mainly used to
+exercise fio itself and for debugging/testing purposes.
  .TP
  .B net
-Transfer over the network.  The protocol to be used can be defined with the
-\fBprotocol\fR parameter.  Depending on the protocol, \fBfilename\fR,
-\fBhostname\fR, \fBport\fR, or \fBlisten\fR must be specified.
-This ioengine defines engine specific options.
+Transfer over the network to given `host:port'. Depending on the
+\fBprotocol\fR used, the \fBhostname\fR, \fBport\fR,
+\fBlisten\fR and \fBfilename\fR options are used to specify
+what sort of connection to make, while the \fBprotocol\fR option
+determines which protocol will be used. This engine defines engine
+specific options.
  .TP
  .B netsplice
-Like \fBnet\fR, but uses \fBsplice\fR\|(2) and \fBvmsplice\fR\|(2) to map data
-and send/receive. This ioengine defines engine specific options.
+Like \fBnet\fR, but uses \fBsplice\fR\|(2) and
+\fBvmsplice\fR\|(2) to map data and send/receive.
+This engine defines engine specific options.
  .TP
  .B cpuio
-Doesn't transfer any data, but burns CPU cycles according to \fBcpuload\fR and
-\fBcpuchunks\fR parameters. A job never finishes unless there is at least one
-non-cpuio job.
+Doesn't transfer any data, but burns CPU cycles according to the
+\fBcpuload\fR and \fBcpuchunks\fR options. Setting
+\fBcpuload\fR=85 will cause that job to do nothing but burn 85%
+of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpu>'
+to get desired CPU usage, as the cpuload only loads a
+single CPU at the desired rate. A job never finishes unless there is
+at least one non\-cpuio job.
  .TP
  .B guasi
-The GUASI I/O engine is the Generic Userspace Asynchronous Syscall Interface
-approach to asynchronous I/O.
-.br
-See <http://www.xmailserver.org/guasi\-lib.html>.
+The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+Interface approach to async I/O. See \fIhttp://www.xmailserver.org/guasi\-lib.html\fR
+for more info on GUASI.
  .TP
  .B rdma
-The RDMA I/O engine supports both RDMA memory semantics (RDMA_WRITE/RDMA_READ)
-and channel semantics (Send/Recv) for the InfiniBand, RoCE and iWARP protocols.
-.TP
-.B external
-Loads an external I/O engine object file.  Append the engine filename as
-`:\fIenginepath\fR'.
+The RDMA I/O engine supports both RDMA memory semantics
+(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+InfiniBand, RoCE and iWARP protocols.
  .TP
  .B falloc
-   IO engine that does regular linux native fallocate call to simulate data
-transfer as fio ioengine
-.br
-  DDIR_READ  does fallocate(,mode = FALLOC_FL_KEEP_SIZE,)
-.br
-  DIR_WRITE does fallocate(,mode = 0)
-.br
-  DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE)
+I/O engine that does regular fallocate to simulate data transfer as
+fio ioengine.
+.RS
+.P
+.PD 0
+DDIR_READ      does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+.P
+DDIR_WRITE     does fallocate(,mode = 0).
+.P
+DDIR_TRIM      does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+.PD
+.RE
+.TP
+.B ftruncate
+I/O engine that sends \fBftruncate\fR\|(2) operations in response
+to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+size to the current block offset. \fBblocksize\fR is ignored.
  .TP
  .B e4defrag
-IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity
-request to DDIR_WRITE event
+I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+defragment activity in request to DDIR_WRITE event.
  .TP
  .B rbd
-IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd
-without the need to use the kernel rbd driver. This ioengine defines engine specific
-options.
+I/O engine supporting direct access to Ceph Rados Block Devices
+(RBD) via librbd without the need to use the kernel rbd driver. This
+ioengine defines engine specific options.
  .TP
  .B gfapi
-Using Glusterfs libgfapi sync interface to direct access to Glusterfs volumes without
-having to go through FUSE. This ioengine defines engine specific
-options.
+Using GlusterFS libgfapi sync interface to direct access to
+GlusterFS volumes without having to go through FUSE. This ioengine
+defines engine specific options.
  .TP
  .B gfapi_async
-Using Glusterfs libgfapi async interface to direct access to Glusterfs volumes without
-having to go through FUSE. This ioengine defines engine specific
-options.
+Using GlusterFS libgfapi async interface to direct access to
+GlusterFS volumes without having to go through FUSE. This ioengine
+defines engine specific options.
  .TP
  .B libhdfs
-Read and write through Hadoop (HDFS).  The \fBfilename\fR option is used to
-specify host,port of the hdfs name-node to connect. This engine interprets
-offsets a little differently. In HDFS, files once created cannot be modified.
-So random writes are not possible. To imitate this, libhdfs engine expects
-bunch of small files to be created over HDFS, and engine will randomly pick a
-file out of those files based on the offset generated by fio backend. (see the
-example job file to create such files, use rw=write option). Please note, you
-might want to set necessary environment variables to work with hdfs/libhdfs
-properly.
+Read and write through Hadoop (HDFS). The \fBfilename\fR option
+is used to specify host,port of the hdfs name\-node to connect. This
+engine interprets offsets a little differently. In HDFS, files once
+created cannot be modified so random writes are not possible. To
+imitate this the libhdfs engine expects a bunch of small files to be
+created over HDFS and will randomly pick a file from them
+based on the offset generated by fio backend (see the example
+job file to create such files, use `rw=write' option). Please
+note, it may be necessary to set environment variables to work
+with HDFS/libhdfs properly. Each job uses its own connection to
+HDFS.
  .TP
  .B mtd
-Read, write and erase an MTD character device (e.g., /dev/mtd0). Discards are
-treated as erases. Depending on the underlying device type, the I/O may have
-to go in a certain pattern, e.g., on NAND, writing sequentially to erase blocks
-and discarding before overwriting. The trimwrite mode works well for this
+Read, write and erase an MTD character device (e.g.,
+`/dev/mtd0'). Discards are treated as erases. Depending on the
+underlying device type, the I/O may have to go in a certain pattern,
+e.g., on NAND, writing sequentially to erase blocks and discarding
+before overwriting. The \fBtrimwrite\fR mode works well for this
  constraint.
  .TP
  .B pmemblk
-Read and write using filesystem DAX to a file on a filesystem mounted with
-DAX on a persistent memory device through the NVML libpmemblk library.
-.TP
-.B dev-dax
-Read and write using device DAX to a persistent memory device
-(e.g., /dev/dax0.0) through the NVML libpmem library.
-.RE
-.P
-.RE
-.TP
-.BI iodepth \fR=\fPint
-Number of I/O units to keep in flight against the file. Note that increasing
-iodepth beyond 1 will not affect synchronous ioengines (except for small
-degress when verify_async is in use). Even async engines may impose OS
-restrictions causing the desired depth not to be achieved.  This may happen on
-Linux when using libaio and not setting \fBdirect\fR=1, since buffered IO is
-not async on that OS. Keep an eye on the IO depth distribution in the
-fio output to verify that the achieved depth is as expected. Default: 1.
-.TP
-.BI iodepth_batch \fR=\fPint "\fR,\fP iodepth_batch_submit" \fR=\fPint
-This defines how many pieces of IO to submit at once. It defaults to 1
-which means that we submit each IO as soon as it is available, but can
-be raised to submit bigger batches of IO at the time. If it is set to 0
-the \fBiodepth\fR value will be used.
-.TP
-.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
-This defines how many pieces of IO to retrieve at once. It defaults to 1 which
- means that we'll ask for a minimum of 1 IO in the retrieval process from the
-kernel. The IO retrieval will go on until we hit the limit set by
-\fBiodepth_low\fR. If this variable is set to 0, then fio will always check for
-completed events before queuing more IO. This helps reduce IO latency, at the
-cost of more retrieval system calls.
-.TP
-.BI iodepth_batch_complete_max \fR=\fPint
-This defines maximum pieces of IO to
-retrieve at once. This variable should be used along with
-\fBiodepth_batch_complete_min\fR=int variable, specifying the range
-of min and max amount of IO which should be retrieved. By default
-it is equal to \fBiodepth_batch_complete_min\fR value.
-
-Example #1:
-.RS
-.RS
-\fBiodepth_batch_complete_min\fR=1
-.LP
-\fBiodepth_batch_complete_max\fR=<iodepth>
-.RE
-
-which means that we will retrieve at least 1 IO and up to the
-whole submitted queue depth. If none of IO has been completed
-yet, we will wait.
-
-Example #2:
-.RS
-\fBiodepth_batch_complete_min\fR=0
-.LP
-\fBiodepth_batch_complete_max\fR=<iodepth>
-.RE
-
-which means that we can retrieve up to the whole submitted
-queue depth, but if none of IO has been completed yet, we will
-NOT wait and immediately exit the system call. In this example
-we simply do polling.
-.RE
-.TP
-.BI iodepth_low \fR=\fPint
-Low watermark indicating when to start filling the queue again.  Default:
-\fBiodepth\fR.
-.TP
-.BI io_submit_mode \fR=\fPstr
-This option controls how fio submits the IO to the IO engine. The default is
-\fBinline\fR, which means that the fio job threads submit and reap IO directly.
-If set to \fBoffload\fR, the job threads will offload IO submission to a
-dedicated pool of IO threads. This requires some coordination and thus has a
-bit of extra overhead, especially for lower queue depth IO where it can
-increase latencies. The benefit is that fio can manage submission rates
-independently of the device completion rates. This avoids skewed latency
-reporting if IO gets back up on the device side (the coordinated omission
-problem).
-.TP
-.BI direct \fR=\fPbool
-If true, use non-buffered I/O (usually O_DIRECT).  Default: false.
-.TP
-.BI atomic \fR=\fPbool
-If value is true, attempt to use atomic direct IO. Atomic writes are guaranteed
-to be stable once acknowledged by the operating system. Only Linux supports
-O_ATOMIC right now.
-.TP
-.BI buffered \fR=\fPbool
-If true, use buffered I/O.  This is the opposite of the \fBdirect\fR parameter.
-Default: true.
+Read and write using filesystem DAX to a file on a filesystem
+mounted with DAX on a persistent memory device through the NVML
+libpmemblk library.
  .TP
-.BI offset \fR=\fPint
-Offset in the file to start I/O. Data before the offset will not be touched.
-.TP
-.BI offset_increment \fR=\fPint
-If this is provided, then the real offset becomes the
-offset + offset_increment * thread_number, where the thread number is a
-counter that starts at 0 and is incremented for each sub-job (i.e. when
-numjobs option is specified). This option is useful if there are several jobs
-which are intended to operate on a file in parallel disjoint segments, with
-even spacing between the starting points.
-.TP
-.BI number_ios \fR=\fPint
-Fio will normally perform IOs until it has exhausted the size of the region
-set by \fBsize\fR, or if it exhaust the allocated time (or hits an error
-condition). With this setting, the range/size can be set independently of
-the number of IOs to perform. When fio reaches this number, it will exit
-normally and report status. Note that this does not extend the amount
-of IO that will be done, it will only stop fio if this condition is met
-before other end-of-job criteria.
+.B dev\-dax
+Read and write using device DAX to a persistent memory device (e.g.,
+/dev/dax0.0) through the NVML libpmem library.
  .TP
-.BI fsync \fR=\fPint
-How many I/Os to perform before issuing an \fBfsync\fR\|(2) of dirty data.  If
-0, don't sync.  Default: 0.
+.B external
+Prefix to specify loading an external I/O engine object file. Append
+the engine filename, e.g. `ioengine=external:/tmp/foo.o' to load
+ioengine `foo.o' in `/tmp'. The path can be either
+absolute or relative. See `engines/skeleton_external.c' in the fio source for
+details of writing an external I/O engine.
+.SS "I/O engine specific parameters"
+In addition, there are some parameters which are only valid when a specific
+\fBioengine\fR is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
+\fBioengine\fR that defines them is selected.
  .TP
-.BI fdatasync \fR=\fPint
-Like \fBfsync\fR, but uses \fBfdatasync\fR\|(2) instead to only sync the
-data parts of the file. Default: 0.
+.BI (libaio)userspace_reap
+Normally, with the libaio engine in use, fio will use the
+\fBio_getevents\fR\|(3) system call to reap newly returned events. With
+this flag turned on, the AIO ring will be read directly from user\-space to
+reap events. The reaping mode is only enabled when polling for a minimum of
+0 events (e.g. when `iodepth_batch_complete=0').
  .TP
-.BI write_barrier \fR=\fPint
-Make every Nth write a barrier write.
+.BI (pvsync2)hipri
+Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+than normal.
  .TP
-.BI sync_file_range \fR=\fPstr:int
-Use \fBsync_file_range\fR\|(2) for every \fRval\fP number of write operations. Fio will
-track range of writes that have happened since the last \fBsync_file_range\fR\|(2) call.
-\fRstr\fP can currently be one or more of:
-.RS
+.BI (pvsync2)hipri_percentage
+When hipri is set this determines the probability of a pvsync2 I/O being high
+priority. The default is 100%.
  .TP
-.B wait_before
-SYNC_FILE_RANGE_WAIT_BEFORE
+.BI (cpuio)cpuload \fR=\fPint
+Attempt to use the specified percentage of CPU cycles. This is a mandatory
+option when using cpuio I/O engine.
  .TP
-.B write
-SYNC_FILE_RANGE_WRITE
+.BI (cpuio)cpuchunks \fR=\fPint
+Split the load into cycles of the given time. In microseconds.
  .TP
-.B wait_after
-SYNC_FILE_RANGE_WRITE
+.BI (cpuio)exit_on_io_done \fR=\fPbool
+Detect when I/O threads are done, then exit.
  .TP
-.RE
-.P
-So if you do sync_file_range=wait_before,write:8, fio would use
-\fBSYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE\fP for every 8 writes.
-Also see the \fBsync_file_range\fR\|(2) man page.  This option is Linux specific.
+.BI (libhdfs)namenode \fR=\fPstr
+The hostname or IP address of an HDFS cluster namenode to contact.
  .TP
-.BI overwrite \fR=\fPbool
-If writing, setup the file first and do overwrites.  Default: false.
+.BI (libhdfs)port
+The listening port of the HDFS cluster namenode.
  .TP
-.BI end_fsync \fR=\fPbool
-Sync file contents when a write stage has completed.  Default: false.
+.BI (netsplice,net)port
+The TCP or UDP port to bind to or connect to. If this is used with
+\fBnumjobs\fR to spawn multiple instances of the same job type, then
+this will be the starting port number since fio will use a range of
+ports.
  .TP
-.BI fsync_on_close \fR=\fPbool
-If true, sync file contents on close.  This differs from \fBend_fsync\fR in that
-it will happen on every close, not just at the end of the job.  Default: false.
+.BI (netsplice,net)hostname \fR=\fPstr
+The hostname or IP address to use for TCP or UDP based I/O. If the job is
+a TCP listener or UDP reader, the hostname is not used and must be omitted
+unless it is a valid UDP multicast address.
  .TP
-.BI rwmixread \fR=\fPint
-Percentage of a mixed workload that should be reads. Default: 50.
+.BI (netsplice,net)interface \fR=\fPstr
+The IP address of the network interface used to send or receive UDP
+multicast.
  .TP
-.BI rwmixwrite \fR=\fPint
-Percentage of a mixed workload that should be writes.  If \fBrwmixread\fR and
-\fBrwmixwrite\fR are given and do not sum to 100%, the latter of the two
-overrides the first. This may interfere with a given rate setting, if fio is
-asked to limit reads or writes to a certain rate. If that is the case, then
-the distribution may be skewed. Default: 50.
+.BI (netsplice,net)ttl \fR=\fPint
+Time\-to\-live value for outgoing UDP multicast packets. Default: 1.
  .TP
-.BI random_distribution \fR=\fPstr:float
-By default, fio will use a completely uniform random distribution when asked
-to perform random IO. Sometimes it is useful to skew the distribution in
-specific ways, ensuring that some parts of the data is more hot than others.
-Fio includes the following distribution models:
-.RS
+.BI (netsplice,net)nodelay \fR=\fPbool
+Set TCP_NODELAY on TCP connections.
  .TP
-.B random
-Uniform random distribution
+.BI (netsplice,net)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr
+The network protocol to use. Accepted values are:
+.RS
+.RS
  .TP
-.B zipf
-Zipf distribution
+.B tcp
+Transmission control protocol.
  .TP
-.B pareto
-Pareto distribution
+.B tcpv6
+Transmission control protocol V6.
  .TP
-.B gauss
-Normal (gaussian) distribution
+.B udp
+User datagram protocol.
  .TP
-.B zoned
-Zoned random distribution
+.B udpv6
+User datagram protocol V6.
  .TP
+.B unix
+UNIX domain socket.
  .RE
-When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
-needed to define the access pattern. For \fBzipf\fR, this is the zipf theta.
-For \fBpareto\fR, it's the pareto power. Fio includes a test program, genzipf,
-that can be used visualize what the given input values will yield in terms of
-hit rates. If you wanted to use \fBzipf\fR with a theta of 1.2, you would use
-random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
-fio will disable use of the random map. For the \fBgauss\fR distribution, a
-normal deviation is supplied as a value between 0 and 100.
-.P
-.RS
-For a \fBzoned\fR distribution, fio supports specifying percentages of IO
-access that should fall within what range of the file or device. For example,
-given a criteria of:
  .P
+When the protocol is TCP or UDP, the port must also be given, as well as the
+hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+normal \fBfilename\fR option should be used and the port is invalid.
+.RE
+.TP
+.BI (netsplice,net)listen
+For TCP network connections, tell fio to listen for incoming connections
+rather than initiating an outgoing connection. The \fBhostname\fR must
+be omitted if this option is used.
+.TP
+.BI (netsplice,net)pingpong
+Normally a network writer will just continue writing data, and a network
+reader will just consume packets. If `pingpong=1' is set, a writer will
+send its normal payload to the reader, then wait for the reader to send the
+same payload back. This allows fio to measure network latencies. The
+submission and completion latencies then measure local time spent sending or
+receiving, and the completion latency measures how long it took for the
+other end to receive and send back. For UDP multicast traffic
+`pingpong=1' should only be set for a single reader when multiple readers
+are listening to the same address.
+.TP
+.BI (netsplice,net)window_size \fR=\fPint
+Set the desired socket buffer size for the connection.
+.TP
+.BI (netsplice,net)mss \fR=\fPint
+Set the TCP maximum segment size (TCP_MAXSEG).
+.TP
+.BI (e4defrag)donorname \fR=\fPstr
+File will be used as a block donor (swap extents between files).
+.TP
+.BI (e4defrag)inplace \fR=\fPint
+Configure donor file blocks allocation strategy:
  .RS
-60% of accesses should be to the first 10%
-.RE
-.RS
-30% of accesses should be to the next 20%
-.RE
-.RS
-8% of accesses should be to to the next 30%
-.RE
-.RS
-2% of accesses should be to the next 40%
-.RE
-.P
-we can define that through zoning of the random accesses. For the above
-example, the user would do:
-.P
  .RS
-.B random_distribution=zoned:60/10:30/20:8/30:2/40
+.TP
+.B 0
+Default. Preallocate donor's file on init.
+.TP
+.B 1
+Allocate space immediately inside defragment event, and free right
+after event.
  .RE
-.P
-similarly to how \fBbssplit\fR works for setting ranges and percentages of block
-sizes. Like \fBbssplit\fR, it's possible to specify separate zones for reads,
-writes, and trims. If just one set is given, it'll apply to all of them.
  .RE
  .TP
-.BI percentage_random \fR=\fPint[,int][,int]
-For a random workload, set how big a percentage should be random. This defaults
-to 100%, in which case the workload is fully random. It can be set from
-anywhere from 0 to 100.  Setting it to 0 would make the workload fully
-sequential. It is possible to set different values for reads, writes, and
-trim. To do so, simply use a comma separated list. See \fBblocksize\fR.
+.BI (rbd)clustername \fR=\fPstr
+Specifies the name of the Ceph cluster.
  .TP
-.B norandommap
-Normally \fBfio\fR will cover every block of the file when doing random I/O. If
-this parameter is given, a new offset will be chosen without looking at past
-I/O history.  This parameter is mutually exclusive with \fBverify\fR.
+.BI (rbd)rbdname \fR=\fPstr
+Specifies the name of the RBD.
  .TP
-.BI softrandommap \fR=\fPbool
-See \fBnorandommap\fR. If fio runs with the random block map enabled and it
-fails to allocate the map, if this option is set it will continue without a
-random block map. As coverage will not be as complete as with random maps, this
-option is disabled by default.
+.BI (rbd)pool \fR=\fPstr
+Specifies the name of the Ceph pool containing RBD.
  .TP
-.BI random_generator \fR=\fPstr
-Fio supports the following engines for generating IO offsets for random IO:
-.RS
+.BI (rbd)clientname \fR=\fPstr
+Specifies the username (without the 'client.' prefix) used to access the
+Ceph cluster. If the \fBclustername\fR is specified, the \fBclientname\fR shall be
+the full \fItype.id\fR string. If no type. prefix is given, fio will add 'client.'
+by default.
  .TP
-.B tausworthe
-Strong 2^88 cycle random number generator
+.BI (mtd)skip_bad \fR=\fPbool
+Skip operations against known bad blocks.
  .TP
-.B lfsr
-Linear feedback shift register generator
+.BI (libhdfs)hdfsdirectory
+libhdfs will create chunks in this HDFS directory.
  .TP
-.B tausworthe64
-Strong 64-bit 2^258 cycle random number generator
+.BI (libhdfs)chunk_size
+The size of the chunk to use for each file.
+.SS "I/O depth"
+.TP
+.BI iodepth \fR=\fPint
+Number of I/O units to keep in flight against the file. Note that
+increasing \fBiodepth\fR beyond 1 will not affect synchronous ioengines (except
+for small degrees when \fBverify_async\fR is in use). Even async
+engines may impose OS restrictions causing the desired depth not to be
+achieved. This may happen on Linux when using libaio and not setting
+`direct=1', since buffered I/O is not async on that OS. Keep an
+eye on the I/O depth distribution in the fio output to verify that the
+achieved depth is as expected. Default: 1.
+.TP
+.BI iodepth_batch_submit \fR=\fPint "\fR,\fP iodepth_batch" \fR=\fPint
+This defines how many pieces of I/O to submit at once. It defaults to 1
+which means that we submit each I/O as soon as it is available, but can be
+raised to submit bigger batches of I/O at a time. If it is set to 0 the
+\fBiodepth\fR value will be used.
+.TP
+.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
+This defines how many pieces of I/O to retrieve at once. It defaults to 1
+which means that we'll ask for a minimum of 1 I/O in the retrieval process
+from the kernel. The I/O retrieval will go on until we hit the limit set by
+\fBiodepth_low\fR. If this variable is set to 0, then fio will always
+check for completed events before queuing more I/O. This helps reduce I/O
+latency, at the cost of more retrieval system calls.
  .TP
+.BI iodepth_batch_complete_max \fR=\fPint
+This defines maximum pieces of I/O to retrieve at once. This variable should
+be used along with \fBiodepth_batch_complete_min\fR=\fIint\fR variable,
+specifying the range of min and max amount of I/O which should be
+retrieved. By default it is equal to \fBiodepth_batch_complete_min\fR
+value. Example #1:
+.RS
+.RS
+.P
+.PD 0
+iodepth_batch_complete_min=1
+.P
+iodepth_batch_complete_max=<iodepth>
+.PD
  .RE
  .P
-Tausworthe is a strong random number generator, but it requires tracking on the
-side if we want to ensure that blocks are only read or written once. LFSR
-guarantees that we never generate the same offset twice, and it's also less
-computationally expensive. It's not a true random generator, however, though
-for IO purposes it's typically good enough. LFSR only works with single block
-sizes, not with workloads that use multiple block sizes. If used with such a
-workload, fio may read or write some blocks multiple times. The default
-value is tausworthe, unless the required space exceeds 2^32 blocks. If it does,
-then tausworthe64 is selected automatically.
-.TP
-.BI nice \fR=\fPint
-Run job with given nice value.  See \fBnice\fR\|(2).
+which means that we will retrieve at least 1 I/O and up to the whole
+submitted queue depth. If none of I/O has been completed yet, we will wait.
+Example #2:
+.RS
+.P
+.PD 0
+iodepth_batch_complete_min=0
+.P
+iodepth_batch_complete_max=<iodepth>
+.PD
+.RE
+.P
+which means that we can retrieve up to the whole submitted queue depth, but
+if none of I/O has been completed yet, we will NOT wait and immediately exit
+the system call. In this example we simply do polling.
+.RE
  .TP
-.BI prio \fR=\fPint
-Set I/O priority value of this job between 0 (highest) and 7 (lowest).  See
-\fBionice\fR\|(1).
+.BI iodepth_low \fR=\fPint
+The low water mark indicating when to start filling the queue
+again. Defaults to the same as \fBiodepth\fR, meaning that fio will
+attempt to keep the queue full at all times. If \fBiodepth\fR is set to
+e.g. 16 and \fBiodepth_low\fR is set to 4, then after fio has filled the queue of
+16 requests, it will let the depth drain down to 4 before starting to fill
+it again.
+.TP
+.BI serialize_overlap \fR=\fPbool
+Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+When two or more I/Os are submitted simultaneously, there is no guarantee that
+the I/Os will be processed or completed in the submitted order. Further, if
+two or more of those I/Os are writes, any overlapping region between them can
+become indeterminate/undefined on certain storage. These issues can cause
+verification to fail erratically when at least one of the racing I/Os is
+changing data and the overlapping region has a non-zero size. Setting
+\fBserialize_overlap\fR tells fio to avoid provoking this behavior by explicitly
+serializing in-flight I/Os that have a non-zero overlap. Note that setting
+this option can reduce both performance and the \fBiodepth\fR achieved.
+Additionally this option does not work when \fBio_submit_mode\fR is set to
+offload. Default: false.
  .TP
-.BI prioclass \fR=\fPint
-Set I/O priority class.  See \fBionice\fR\|(1).
+.BI io_submit_mode \fR=\fPstr
+This option controls how fio submits the I/O to the I/O engine. The default
+is `inline', which means that the fio job threads submit and reap I/O
+directly. If set to `offload', the job threads will offload I/O submission
+to a dedicated pool of I/O threads. This requires some coordination and thus
+has a bit of extra overhead, especially for lower queue depth I/O where it
+can increase latencies. The benefit is that fio can manage submission rates
+independently of the device completion rates. This avoids skewed latency
+reporting if I/O gets backed up on the device side (the coordinated omission
+problem).
+.SS "I/O rate"
  .TP
-.BI thinktime \fR=\fPint
-Stall job for given number of microseconds between issuing I/Os.
+.BI thinktime \fR=\fPtime
+Stall the job for the specified period of time after an I/O has completed before issuing the
+next. May be used to simulate processing being done by an application.
+When the unit is omitted, the value is interpreted in microseconds. See
+\fBthinktime_blocks\fR and \fBthinktime_spin\fR.
  .TP
-.BI thinktime_spin \fR=\fPint
-Pretend to spend CPU time for given number of microseconds, sleeping the rest
-of the time specified by \fBthinktime\fR.  Only valid if \fBthinktime\fR is set.
+.BI thinktime_spin \fR=\fPtime
+Only valid if \fBthinktime\fR is set \- pretend to spend CPU time doing
+something with the data received, before falling back to sleeping for the
+rest of the period specified by \fBthinktime\fR. When the unit is
+omitted, the value is interpreted in microseconds.
  .TP
  .BI thinktime_blocks \fR=\fPint
-Only valid if thinktime is set - control how many blocks to issue, before
-waiting \fBthinktime\fR microseconds. If not set, defaults to 1 which will
-make fio wait \fBthinktime\fR microseconds after every block. This
-effectively makes any queue depth setting redundant, since no more than 1 IO
-will be queued before we have to complete it and do our thinktime. In other
-words, this setting effectively caps the queue depth if the latter is larger.
-Default: 1.
+Only valid if \fBthinktime\fR is set \- control how many blocks to issue,
+before waiting \fBthinktime\fR usecs. If not set, defaults to 1 which will make
+fio wait \fBthinktime\fR usecs after every block. This effectively makes any
+queue depth setting redundant, since no more than 1 I/O will be queued
+before we have to complete it and do our \fBthinktime\fR. In other words, this
+setting effectively caps the queue depth if the latter is larger.
  .TP
  .BI rate \fR=\fPint[,int][,int]
-Cap bandwidth used by this job. The number is in bytes/sec, the normal postfix
-rules apply. You can use \fBrate\fR=500k to limit reads and writes to 500k each,
-or you can specify reads, write, and trim limits separately.
-Using \fBrate\fR=1m,500k would
-limit reads to 1MiB/sec and writes to 500KiB/sec. Capping only reads or writes
-can be done with \fBrate\fR=,500k or \fBrate\fR=500k,. The former will only
-limit writes (to 500KiB/sec), the latter will only limit reads.
+Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+suffix rules apply. Comma\-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
+.RS
+.P
+For example, using `rate=1m,500k' would limit reads to 1MiB/sec and writes to
+500KiB/sec. Capping only reads or writes can be done with `rate=,500k' or
+`rate=500k,' where the former will only limit writes (to 500KiB/sec) and the
+latter will only limit reads.
+.RE
  .TP
  .BI rate_min \fR=\fPint[,int][,int]
-Tell \fBfio\fR to do whatever it can to maintain at least the given bandwidth.
-Failing to meet this requirement will cause the job to exit. The same format
-as \fBrate\fR is used for read vs write vs trim separation.
+Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+to meet this requirement will cause the job to exit. Comma\-separated values
+may be specified for reads, writes, and trims as described in
+\fBblocksize\fR.
  .TP
  .BI rate_iops \fR=\fPint[,int][,int]
-Cap the bandwidth to this number of IOPS. Basically the same as rate, just
-specified independently of bandwidth. The same format as \fBrate\fR is used for
-read vs write vs trim separation. If \fBblocksize\fR is a range, the smallest block
-size is used as the metric.
+Cap the bandwidth to this number of IOPS. Basically the same as
+\fBrate\fR, just specified independently of bandwidth. If the job is
+given a block size range instead of a fixed value, the smallest block size
+is used as the metric. Comma\-separated values may be specified for reads,
+writes, and trims as described in \fBblocksize\fR.
  .TP
  .BI rate_iops_min \fR=\fPint[,int][,int]
-If this rate of I/O is not met, the job will exit. The same format as \fBrate\fR
-is used for read vs write vs trim separation.
+If fio doesn't meet this rate of I/O, it will cause the job to exit.
+Comma\-separated values may be specified for reads, writes, and trims as
+described in \fBblocksize\fR.
  .TP
  .BI rate_process \fR=\fPstr
-This option controls how fio manages rated IO submissions. The default is
-\fBlinear\fR, which submits IO in a linear fashion with fixed delays between
-IOs that gets adjusted based on IO completion rates. If this is set to
-\fBpoisson\fR, fio will submit IO based on a more real world random request
+This option controls how fio manages rated I/O submissions. The default is
+`linear', which submits I/O in a linear fashion with fixed delays between
+I/Os that gets adjusted based on I/O completion rates. If this is set to
+`poisson', fio will submit I/O based on a more real world random request
  flow, known as the Poisson process
-(https://en.wikipedia.org/wiki/Poisson_process). The lambda will be
+(\fIhttps://en.wikipedia.org/wiki/Poisson_point_process\fR). The lambda will be
  10^6 / IOPS for the given workload.
+.SS "I/O latency"
  .TP
-.BI rate_cycle \fR=\fPint
-Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number of
-milliseconds.  Default: 1000ms.
-.TP
-.BI latency_target \fR=\fPint
+.BI latency_target \fR=\fPtime
  If set, fio will attempt to find the max performance point that the given
-workload will run at while maintaining a latency below this target. The
-values is given in microseconds. See \fBlatency_window\fR and
-\fBlatency_percentile\fR.
+workload will run at while maintaining a latency below this target. When
+the unit is omitted, the value is interpreted in microseconds. See
+\fBlatency_window\fR and \fBlatency_percentile\fR.
  .TP
-.BI latency_window \fR=\fPint
+.BI latency_window \fR=\fPtime
  Used with \fBlatency_target\fR to specify the sample window that the job
-is run at varying queue depths to test the performance. The value is given
-in microseconds.
+is run at varying queue depths to test the performance. When the unit is
+omitted, the value is interpreted in microseconds.
  .TP
  .BI latency_percentile \fR=\fPfloat
-The percentage of IOs that must fall within the criteria specified by
-\fBlatency_target\fR and \fBlatency_window\fR. If not set, this defaults
-to 100.0, meaning that all IOs must be equal or below to the value set
-by \fBlatency_target\fR.
+The percentage of I/Os that must fall within the criteria specified by
+\fBlatency_target\fR and \fBlatency_window\fR. If not set, this
+defaults to 100.0, meaning that all I/Os must be equal to or below the value
+set by \fBlatency_target\fR.
+.TP
+.BI max_latency \fR=\fPtime
+If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+maximum latency. When the unit is omitted, the value is interpreted in
+microseconds.
+.TP
+.BI rate_cycle \fR=\fPint
+Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number
+of milliseconds. Defaults to 1000.
+.SS "I/O replay"
+.TP
+.BI write_iolog \fR=\fPstr
+Write the issued I/O patterns to the specified file. See
+\fBread_iolog\fR. Specify a separate file for each job, otherwise the
+iologs will be interspersed and the file may be corrupt.
+.TP
+.BI read_iolog \fR=\fPstr
+Open an iolog with the specified filename and replay the I/O patterns it
+contains. This can be used to store a workload and replay it sometime
+later. The iolog given may also be a blktrace binary file, which allows fio
+to replay a workload captured by blktrace. See
+\fBblktrace\fR\|(8) for how to capture such logging data. For blktrace
+replay, the file needs to be turned into a blkparse binary data file first
+(`blkparse <device> \-o /dev/null \-d file_for_fio.bin').
+.TP
+.BI replay_no_stall \fR=\fPbool
+When replaying I/O with \fBread_iolog\fR the default behavior is to
+attempt to respect the timestamps within the log and replay them with the
+appropriate delay between IOPS. By setting this variable fio will not
+respect the timestamps and attempt to replay them as fast as possible while
+still respecting ordering. The result is the same I/O pattern to a given
+device, but different timings.
+.TP
+.BI replay_redirect \fR=\fPstr
+While replaying I/O patterns using \fBread_iolog\fR the default behavior
+is to replay the IOPS onto the major/minor device that each IOP was recorded
+from. This is sometimes undesirable because on a different machine those
+major/minor numbers can map to a different device. Changing hardware on the
+same system can also result in a different major/minor mapping.
+\fBreplay_redirect\fR causes all I/Os to be replayed onto the single specified
+device regardless of the device it was recorded
+from. i.e. `replay_redirect=/dev/sdc' would cause all I/O
+in the blktrace or iolog to be replayed onto `/dev/sdc'. This means
+multiple devices will be replayed onto a single device, if the trace
+contains multiple devices. If you want multiple devices to be replayed
+concurrently to multiple redirected devices you must blkparse your trace
+into separate traces and replay them with independent fio invocations.
+Unfortunately this also breaks the strict time ordering between multiple
+device accesses.
+.TP
+.BI replay_align \fR=\fPint
+Force alignment of I/O offsets and lengths in a trace to this power of 2
+value.
+.TP
+.BI replay_scale \fR=\fPint
+Scale sector offsets down by this factor when replaying traces.
+.SS "Threads, processes and job synchronization"
+.TP
+.BI thread
+Fio defaults to creating jobs by using fork, however if this option is
+given, fio will create jobs by using POSIX Threads' function
+\fBpthread_create\fR\|(3) to create threads instead.
+.TP
+.BI wait_for \fR=\fPstr
+If set, the current job won't be started until all workers of the specified
+waitee job are done.
+.\" ignore blank line here from HOWTO as it looks normal without it
+\fBwait_for\fR operates on the job name basis, so there are a few
+limitations. First, the waitee must be defined prior to the waiter job
+(meaning no forward references). Second, if a job is being referenced as a
+waitee, it must have a unique name (no duplicate waitees).
+.TP
+.BI nice \fR=\fPint
+Run the job with the given nice value. See man \fBnice\fR\|(2).
+.\" ignore blank line here from HOWTO as it looks normal without it
+On Windows, values less than \-15 set the process class to "High"; \-1 through
+\-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+priority class.
+.TP
+.BI prio \fR=\fPint
+Set the I/O priority value of this job. Linux limits us to a positive value
+between 0 and 7, with 0 being the highest. See man
+\fBionice\fR\|(1). Refer to an appropriate manpage for other operating
+systems since meaning of priority may differ.
  .TP
-.BI max_latency \fR=\fPint
-If set, fio will exit the job if it exceeds this maximum latency. It will exit
-with an ETIME error.
+.BI prioclass \fR=\fPint
+Set the I/O priority class. See man \fBionice\fR\|(1).
  .TP
  .BI cpumask \fR=\fPint
-Set CPU affinity for this job. \fIint\fR is a bitmask of allowed CPUs the job
-may run on.  See \fBsched_setaffinity\fR\|(2).
+Set the CPU affinity of this job. The parameter given is a bit mask of
+allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+\fBsched_setaffinity\fR\|(2). This may not work on all supported
+operating systems or kernel versions. This option doesn't work well for a
+higher CPU count than what you can store in an integer mask, so it can only
+control cpus 1\-32. For boxes with larger CPU counts, use
+\fBcpus_allowed\fR.
  .TP
  .BI cpus_allowed \fR=\fPstr
-Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
+Controls the same options as \fBcpumask\fR, but accepts a textual
+specification of the permitted CPUs instead. So to use CPUs 1 and 5 you
+would specify `cpus_allowed=1,5'. This option also allows a range of CPUs
+to be specified \-\- say you wanted a binding to CPUs 1, 5, and 8 to 15, you
+would set `cpus_allowed=1,5,8\-15'.
  .TP
  .BI cpus_allowed_policy \fR=\fPstr
-Set the policy of how fio distributes the CPUs specified by \fBcpus_allowed\fR
-or \fBcpumask\fR. Two policies are supported:
+Set the policy of how fio distributes the CPUs specified by
+\fBcpus_allowed\fR or \fBcpumask\fR. Two policies are supported:
  .RS
  .RS
  .TP
@@ -1184,834 +2010,711 @@ All jobs will share the CPU set specified.
  Each job will get a unique CPU from the CPU set.
  .RE
  .P
-\fBshared\fR is the default behaviour, if the option isn't specified. If
-\fBsplit\fR is specified, then fio will assign one cpu per job. If not enough
-CPUs are given for the jobs listed, then fio will roundrobin the CPUs in
-the set.
+\fBshared\fR is the default behavior, if the option isn't specified. If
+\fBsplit\fR is specified, then fio will assign one cpu per job. If not
+enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
+in the set.
  .RE
-.P
  .TP
  .BI numa_cpu_nodes \fR=\fPstr
  Set this job running on specified NUMA nodes' CPUs. The arguments allow
-comma delimited list of cpu numbers, A-B ranges, or 'all'.
+comma delimited list of cpu numbers, A\-B ranges, or `all'. Note, to enable
+NUMA options support, fio must be built on a system with libnuma\-dev(el)
+installed.
  .TP
  .BI numa_mem_policy \fR=\fPstr
-Set this job's memory policy and corresponding NUMA nodes. Format of
-the arguments:
+Set this job's memory policy and corresponding NUMA nodes. Format of the
+arguments:
  .RS
-.TP
-.B <mode>[:<nodelist>]
-.TP
-.B mode
-is one of the following memory policy:
-.TP
-.B default, prefer, bind, interleave, local
-.TP
+.RS
+.P
+<mode>[:<nodelist>]
+.RE
+.P
+`mode' is one of the following memory policies: `default', `prefer',
+`bind', `interleave' or `local'. For `default' and `local' memory
+policies, no node needs to be specified. For `prefer', only one node is
+allowed. For `bind' and `interleave' the `nodelist' may be as
+follows: a comma delimited list of numbers, A\-B ranges, or `all'.
  .RE
-For \fBdefault\fR and \fBlocal\fR memory policy, no \fBnodelist\fR is
-needed to be specified. For \fBprefer\fR, only one node is
-allowed. For \fBbind\fR and \fBinterleave\fR, \fBnodelist\fR allows
-comma delimited list of numbers, A-B ranges, or 'all'.
-.TP
-.BI startdelay \fR=\fPirange
-Delay start of job for the specified number of seconds. Supports all time
-suffixes to allow specification of hours, minutes, seconds and
-milliseconds - seconds are the default if a unit is omitted.
-Can be given as a range which causes each thread to choose randomly out of the
-range.
-.TP
-.BI runtime \fR=\fPint
-Terminate processing after the specified number of seconds.
-.TP
-.B time_based
-If given, run for the specified \fBruntime\fR duration even if the files are
-completely read or written. The same workload will be repeated as many times
-as \fBruntime\fR allows.
-.TP
-.BI ramp_time \fR=\fPint
-If set, fio will run the specified workload for this amount of time before
-logging any performance numbers. Useful for letting performance settle before
-logging results, thus minimizing the runtime required for stable results. Note
-that the \fBramp_time\fR is considered lead in time for a job, thus it will
-increase the total runtime if a special timeout or runtime is specified.
  .TP
-.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
-Define the criterion and limit for assessing steady state performance. The
-first parameter designates the criterion whereas the second parameter sets the
-threshold. When the criterion falls below the threshold for the specified
-duration, the job will stop. For example, iops_slope:0.1% will direct fio
-to terminate the job when the least squares regression slope falls below 0.1%
-of the mean IOPS. If group_reporting is enabled this will apply to all jobs in
-the group. All assessments are carried out using only data from the rolling
-collection window. Threshold limits can be expressed as a fixed value or as a
-percentage of the mean in the collection window. Below are the available steady
-state assessment criteria.
+.BI cgroup \fR=\fPstr
+Add job to this control group. If it doesn't exist, it will be created. The
+system must have a mounted cgroup blkio mount point for this to work. If
+your system doesn't have it mounted, you can do so with:
  .RS
  .RS
-.TP
-.B iops
-Collect IOPS data. Stop the job if all individual IOPS measurements are within
-the specified limit of the mean IOPS (e.g., iops:2 means that all individual
-IOPS values must be within 2 of the mean, whereas iops:0.2% means that all
-individual IOPS values must be within 0.2% of the mean IOPS to terminate the
-job).
-.TP
-.B iops_slope
-Collect IOPS data and calculate the least squares regression slope. Stop the
-job if the slope falls below the specified limit.
-.TP
-.B bw
-Collect bandwidth data. Stop the job if all individual bandwidth measurements
-are within the specified limit of the mean bandwidth.
-.TP
-.B bw_slope
-Collect bandwidth data and calculate the least squares regression slope. Stop
-the job if the slope falls below the specified limit.
+.P
+# mount \-t cgroup \-o blkio none /cgroup
  .RE
  .RE
  .TP
-.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
-A rolling window of this duration will be used to judge whether steady state
-has been reached. Data will be collected once per second. The default is 0
-which disables steady state detection.
-.TP
-.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
-Allow the job to run for the specified duration before beginning data collection
-for checking the steady state job termination criterion. The default is 0.
-.TP
-.BI invalidate \fR=\fPbool
-Invalidate buffer-cache for the file prior to starting I/O.  Default: true.
+.BI cgroup_weight \fR=\fPint
+Set the weight of the cgroup to this value. See the documentation that comes
+with the kernel, allowed values are in the range of 100..1000.
  .TP
-.BI sync \fR=\fPbool
-Use synchronous I/O for buffered writes.  For the majority of I/O engines,
-this means using O_SYNC.  Default: false.
+.BI cgroup_nodelete \fR=\fPbool
+Normally fio will delete the cgroups it has created after the job
+completion. To override this behavior and to leave cgroups around after the
+job completion, set `cgroup_nodelete=1'. This can be useful if one wants
+to inspect various cgroup files after job completion. Default: false.
  .TP
-.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
-Allocation method for I/O unit buffer.  Allowed values are:
-.RS
-.RS
+.BI flow_id \fR=\fPint
+The ID of the flow. If not specified, it defaults to being a global
+flow. See \fBflow\fR.
  .TP
-.B malloc
-Allocate memory with \fBmalloc\fR\|(3). Default memory type.
+.BI flow \fR=\fPint
+Weight in token\-based flow control. If this value is used, then there is
+a 'flow counter' which is used to regulate the proportion of activity between
+two or more jobs. Fio attempts to keep this flow counter near zero. The
+\fBflow\fR parameter stands for how much should be added or subtracted to the
+flow counter on each iteration of the main I/O loop. That is, if one job has
+`flow=8' and another job has `flow=\-1', then there will be a roughly 1:8
+ratio in how much one runs vs the other.
  .TP
-.B shm
-Use shared memory buffers allocated through \fBshmget\fR\|(2).
+.BI flow_watermark \fR=\fPint
+The maximum value that the absolute value of the flow counter is allowed to
+reach before the job must wait for a lower value of the counter.
  .TP
-.B shmhuge
-Same as \fBshm\fR, but use huge pages as backing.
+.BI flow_sleep \fR=\fPint
+The period of time, in microseconds, to wait after the flow watermark has
+been exceeded before retrying operations.
  .TP
-.B mmap
-Use \fBmmap\fR\|(2) for allocation.  Uses anonymous memory unless a filename
-is given after the option in the format `:\fIfile\fR'.
+.BI stonewall "\fR,\fB wait_for_previous"
+Wait for preceding jobs in the job file to exit, before starting this
+one. Can be used to insert serialization points in the job file. A stone
+wall also implies starting a new reporting group, see
+\fBgroup_reporting\fR.
  .TP
-.B mmaphuge
-Same as \fBmmap\fR, but use huge files as backing.
+.BI exitall
+By default, fio will continue running all other jobs when one job finishes
+but sometimes this is not the desired action. Setting \fBexitall\fR will
+instead make fio terminate all other jobs when one job finishes.
  .TP
-.B mmapshared
-Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
+.BI exec_prerun \fR=\fPstr
+Before running this job, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected in a file called `jobname.prerun.txt'.
  .TP
-.B cudamalloc
-Use GPU memory as the buffers for GPUDirect RDMA benchmark. The ioengine must be \fBrdma\fR.
-.RE
-.P
-The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
-job multiplied by \fBiodepth\fR.  For \fBshmhuge\fR or \fBmmaphuge\fR to work,
-the system must have free huge pages allocated.  \fBmmaphuge\fR also needs to
-have hugetlbfs mounted, and \fIfile\fR must point there. At least on Linux,
-huge pages must be manually allocated. See \fB/proc/sys/vm/nr_hugehages\fR
-and the documentation for that. Normally you just need to echo an appropriate
-number, eg echoing 8 will ensure that the OS has 8 huge pages ready for
-use.
-.RE
+.BI exec_postrun \fR=\fPstr
+After the job completes, issue the command specified through
+\fBsystem\fR\|(3). Output is redirected in a file called `jobname.postrun.txt'.
  .TP
-.BI iomem_align \fR=\fPint "\fR,\fP mem_align" \fR=\fPint
-This indicates the memory alignment of the IO memory buffers. Note that the
-given alignment is applied to the first IO unit buffer, if using \fBiodepth\fR
-the alignment of the following buffers are given by the \fBbs\fR used. In
-other words, if using a \fBbs\fR that is a multiple of the page sized in the
-system, all buffers will be aligned to this value. If using a \fBbs\fR that
-is not page aligned, the alignment of subsequent IO memory buffers is the
-sum of the \fBiomem_align\fR and \fBbs\fR used.
+.BI uid \fR=\fPint
+Instead of running as the invoking user, set the user ID to this value
+before the thread/process does any work.
  .TP
-.BI hugepage\-size \fR=\fPint
-Defines the size of a huge page.  Must be at least equal to the system setting.
-Should be a multiple of 1MiB. Default: 4MiB.
+.BI gid \fR=\fPint
+Set group ID, see \fBuid\fR.
+.SS "Verification"
  .TP
-.B exitall
-Terminate all jobs when one finishes.  Default: wait for each job to finish.
+.BI verify_only
+Do not perform specified workload, only verify data still matches previous
+invocation of this workload. This option allows one to check data multiple
+times at a later date without overwriting it. This option makes sense only
+for workloads that write data, and does not support workloads with the
+\fBtime_based\fR option set.
  .TP
-.B exitall_on_error \fR=\fPbool
-Terminate all jobs if one job finishes in error.  Default: wait for each job
-to finish.
+.BI do_verify \fR=\fPbool
+Run the verify phase after a write phase. Only valid if \fBverify\fR is
+set. Default: true.
  .TP
-.BI bwavgtime \fR=\fPint
-Average bandwidth calculations over the given time in milliseconds. If the job
-also does bandwidth logging through \fBwrite_bw_log\fR, then the minimum of
-this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
+.BI verify \fR=\fPstr
+If writing to a file, fio can verify the file contents after each iteration
+of the job. Each verification method also implies verification of special
+header, which is written to the beginning of each block. This header also
+includes meta information, like offset of the block, block number, timestamp
+when block was written, etc. \fBverify\fR can be combined with
+\fBverify_pattern\fR option. The allowed values are:
+.RS
+.RS
  .TP
-.BI iopsavgtime \fR=\fPint
-Average IOPS calculations over the given time in milliseconds. If the job
-also does IOPS logging through \fBwrite_iops_log\fR, then the minimum of
-this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
+.B md5
+Use an md5 sum of the data area and store it in the header of
+each block.
  .TP
-.BI create_serialize \fR=\fPbool
-If true, serialize file creation for the jobs.  Default: true.
+.B crc64
+Use an experimental crc64 sum of the data area and store it in the
+header of each block.
  .TP
-.BI create_fsync \fR=\fPbool
-\fBfsync\fR\|(2) data file after creation.  Default: true.
+.B crc32c
+Use a crc32c sum of the data area and store it in the header of
+each block. This will automatically use hardware acceleration
+(e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+fall back to software crc32c if none is found. Generally the
+fastest checksum fio supports when hardware accelerated.
  .TP
-.BI create_on_open \fR=\fPbool
-If true, the files are not created until they are opened for IO by the job.
+.B crc32c\-intel
+Synonym for crc32c.
  .TP
-.BI create_only \fR=\fPbool
-If true, fio will only run the setup phase of the job. If files need to be
-laid out or updated on disk, only that will be done. The actual job contents
-are not executed.
+.B crc32
+Use a crc32 sum of the data area and store it in the header of each
+block.
  .TP
-.BI allow_file_create \fR=\fPbool
-If true, fio is permitted to create files as part of its workload. This is
-the default behavior. If this option is false, then fio will error out if the
-files it needs to use don't already exist. Default: true.
+.B crc16
+Use a crc16 sum of the data area and store it in the header of each
+block.
  .TP
-.BI allow_mounted_write \fR=\fPbool
-If this isn't set, fio will abort jobs that are destructive (eg that write)
-to what appears to be a mounted device or partition. This should help catch
-creating inadvertently destructive tests, not realizing that the test will
-destroy data on the mounted file system. Default: false.
+.B crc7
+Use a crc7 sum of the data area and store it in the header of each
+block.
  .TP
-.BI pre_read \fR=\fPbool
-If this is given, files will be pre-read into memory before starting the given
-IO operation. This will also clear the \fR \fBinvalidate\fR flag, since it is
-pointless to pre-read and then drop the cache. This will only work for IO
-engines that are seekable, since they allow you to read the same data
-multiple times. Thus it will not work on eg network or splice IO.
+.B xxhash
+Use xxhash as the checksum function. Generally the fastest software
+checksum that fio supports.
  .TP
-.BI unlink \fR=\fPbool
-Unlink job files when done.  Default: false.
+.B sha512
+Use sha512 as the checksum function.
  .TP
-.BI unlink_each_loop \fR=\fPbool
-Unlink job files after each iteration or loop.  Default: false.
+.B sha256
+Use sha256 as the checksum function.
  .TP
-.BI loops \fR=\fPint
-Specifies the number of iterations (runs of the same workload) of this job.
-Default: 1.
+.B sha1
+Use optimized sha1 as the checksum function.
  .TP
-.BI verify_only \fR=\fPbool
-Do not perform the specified workload, only verify data still matches previous
-invocation of this workload. This option allows one to check data multiple
-times at a later date without overwriting it. This option makes sense only for
-workloads that write data, and does not support workloads with the
-\fBtime_based\fR option set.
+.B sha3\-224
+Use optimized sha3\-224 as the checksum function.
  .TP
-.BI do_verify \fR=\fPbool
-Run the verify phase after a write phase.  Only valid if \fBverify\fR is set.
-Default: true.
+.B sha3\-256
+Use optimized sha3\-256 as the checksum function.
  .TP
-.BI verify \fR=\fPstr
-Method of verifying file contents after each iteration of the job. Each
-verification method also implies verification of special header, which is
-written to the beginning of each block. This header also includes meta
-information, like offset of the block, block number, timestamp when block
-was written, etc.  \fBverify\fR=str can be combined with \fBverify_pattern\fR=str
-option.  The allowed values are:
-.RS
-.RS
+.B sha3\-384
+Use optimized sha3\-384 as the checksum function.
  .TP
-.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 sha3-224 sha3-256 sha3-384 sha3-512 xxhash
-Store appropriate checksum in the header of each block. crc32c-intel is
-hardware accelerated SSE4.2 driven, falls back to regular crc32c if
-not supported by the system.
+.B sha3\-512
+Use optimized sha3\-512 as the checksum function.
  .TP
  .B meta
-This option is deprecated, since now meta information is included in generic
-verification header and meta verification happens by default.  For detailed
-information see the description of the \fBverify\fR=str setting. This option
-is kept because of compatibility's sake with old configurations. Do not use it.
+This option is deprecated, since now meta information is included in
+generic verification header and meta verification happens by
+default. For detailed information see the description of the
+\fBverify\fR setting. This option is kept because of
+compatibility's sake with old configurations. Do not use it.
  .TP
  .B pattern
-Verify a strict pattern. Normally fio includes a header with some basic
-information and checksumming, but if this option is set, only the
-specific pattern set with \fBverify_pattern\fR is verified.
+Verify a strict pattern. Normally fio includes a header with some
+basic information and checksumming, but if this option is set, only
+the specific pattern set with \fBverify_pattern\fR is verified.
  .TP
  .B null
-Pretend to verify.  Used for testing internals.
+Only pretend to verify. Useful for testing internals with
+`ioengine=null', not for much else.
  .RE
-
-This option can be used for repeated burn-in tests of a system to make sure
-that the written data is also correctly read back. If the data direction given
-is a read or random read, fio will assume that it should verify a previously
-written file. If the data direction includes any form of write, the verify will
-be of the newly written data.
+.P
+This option can be used for repeated burn\-in tests of a system to make sure
+that the written data is also correctly read back. If the data direction
+given is a read or random read, fio will assume that it should verify a
+previously written file. If the data direction includes any form of write,
+the verify will be of the newly written data.
  .RE
  .TP
  .BI verifysort \fR=\fPbool
-If true, written verify blocks are sorted if \fBfio\fR deems it to be faster to
-read them back in a sorted manner.  Default: true.
+If true, fio will sort written verify blocks when it deems it faster to read
+them back in a sorted manner. This is often the case when overwriting an
+existing file, since the blocks are already laid out in the file system. You
+can ignore this option unless doing huge amounts of really fast I/O where
+the red\-black tree sorting CPU time becomes significant. Default: true.
  .TP
  .BI verifysort_nr \fR=\fPint
-Pre-load and sort verify blocks for a read workload.
+Pre\-load and sort verify blocks for a read workload.
  .TP
  .BI verify_offset \fR=\fPint
  Swap the verification header with data somewhere else in the block before
-writing.  It is swapped back before verifying.
+writing. It is swapped back before verifying.
  .TP
  .BI verify_interval \fR=\fPint
-Write the verification header for this number of bytes, which should divide
-\fBblocksize\fR.  Default: \fBblocksize\fR.
+Write the verification header at a finer granularity than the
+\fBblocksize\fR. It will be written for chunks the size of
+\fBverify_interval\fR, which should divide \fBblocksize\fR evenly.
  .TP
  .BI verify_pattern \fR=\fPstr
-If set, fio will fill the io buffers with this pattern. Fio defaults to filling
-with totally random bytes, but sometimes it's interesting to fill with a known
-pattern for io verification purposes. Depending on the width of the pattern,
-fio will fill 1/2/3/4 bytes of the buffer at the time(it can be either a
-decimal or a hex number). The verify_pattern if larger than a 32-bit quantity
-has to be a hex number that starts with either "0x" or "0X". Use with
-\fBverify\fP=str. Also, verify_pattern supports %o format, which means that for
-each block offset will be written and then verified back, e.g.:
+If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+filling with totally random bytes, but sometimes it's interesting to fill
+with a known pattern for I/O verification purposes. Depending on the width
+of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+be either a decimal or a hex number). The \fBverify_pattern\fR if larger than
+a 32\-bit quantity has to be a hex number that starts with either "0x" or
+"0X". Use with \fBverify\fR. Also, \fBverify_pattern\fR supports %o
+format, which means that for each block, its offset will be written and then
+verified back, e.g.:
  .RS
  .RS
-\fBverify_pattern\fR=%o
+.P
+verify_pattern=%o
  .RE
+.P
  Or use combination of everything:
-.LP
  .RS
-\fBverify_pattern\fR=0xff%o"abcd"-21
+.P
+verify_pattern=0xff%o"abcd"\-12
  .RE
  .RE
  .TP
  .BI verify_fatal \fR=\fPbool
-If true, exit the job on the first observed verification failure.  Default:
-false.
+Normally fio will keep checking the entire contents before quitting on a
+block verification failure. If this option is set, fio will exit the job on
+the first observed failure. Default: false.
  .TP
  .BI verify_dump \fR=\fPbool
-If set, dump the contents of both the original data block and the data block we
-read off disk to files. This allows later analysis to inspect just what kind of
-data corruption occurred. Off by default.
+If set, dump the contents of both the original data block and the data block
+we read off disk to files. This allows later analysis to inspect just what
+kind of data corruption occurred. Off by default.
  .TP
  .BI verify_async \fR=\fPint
-Fio will normally verify IO inline from the submitting thread. This option
-takes an integer describing how many async offload threads to create for IO
-verification instead, causing fio to offload the duty of verifying IO contents
-to one or more separate threads.  If using this offload option, even sync IO
-engines can benefit from using an \fBiodepth\fR setting higher than 1, as it
-allows them to have IO in flight while verifies are running.
+Fio will normally verify I/O inline from the submitting thread. This option
+takes an integer describing how many async offload threads to create for I/O
+verification instead, causing fio to offload the duty of verifying I/O
+contents to one or more separate threads. If using this offload option, even
+sync I/O engines can benefit from using an \fBiodepth\fR setting higher
+than 1, as it allows them to have I/O in flight while verifies are running.
+Defaults to 0 async threads, i.e. verification is not asynchronous.
  .TP
  .BI verify_async_cpus \fR=\fPstr
-Tell fio to set the given CPU affinity on the async IO verification threads.
-See \fBcpus_allowed\fP for the format used.
+Tell fio to set the given CPU affinity on the async I/O verification
+threads. See \fBcpus_allowed\fR for the format used.
  .TP
  .BI verify_backlog \fR=\fPint
  Fio will normally verify the written contents of a job that utilizes verify
  once that job has completed. In other words, everything is written then
  everything is read back and verified. You may want to verify continually
-instead for a variety of reasons. Fio stores the meta data associated with an
-IO block in memory, so for large verify workloads, quite a bit of memory would
-be used up holding this meta data. If this option is enabled, fio will write
-only N blocks before verifying these blocks.
+instead for a variety of reasons. Fio stores the meta data associated with
+an I/O block in memory, so for large verify workloads, quite a bit of memory
+would be used up holding this meta data. If this option is enabled, fio will
+write only N blocks before verifying these blocks.
  .TP
  .BI verify_backlog_batch \fR=\fPint
-Control how many blocks fio will verify if verify_backlog is set. If not set,
-will default to the value of \fBverify_backlog\fR (meaning the entire queue is
-read back and verified).  If \fBverify_backlog_batch\fR is less than
-\fBverify_backlog\fR then not all blocks will be verified,  if
-\fBverify_backlog_batch\fR is larger than \fBverify_backlog\fR,  some blocks
-will be verified more than once.
-.TP
-.BI trim_percentage \fR=\fPint
-Number of verify blocks to discard/trim.
-.TP
-.BI trim_verify_zero \fR=\fPbool
-Verify that trim/discarded blocks are returned as zeroes.
-.TP
-.BI trim_backlog \fR=\fPint
-Trim after this number of blocks are written.
-.TP
-.BI trim_backlog_batch \fR=\fPint
-Trim this number of IO blocks.
-.TP
-.BI experimental_verify \fR=\fPbool
-Enable experimental verification.
+Control how many blocks fio will verify if \fBverify_backlog\fR is
+set. If not set, will default to the value of \fBverify_backlog\fR
+(meaning the entire queue is read back and verified). If
+\fBverify_backlog_batch\fR is less than \fBverify_backlog\fR then not all
+blocks will be verified; if \fBverify_backlog_batch\fR is larger than
+\fBverify_backlog\fR, some blocks will be verified more than once.
  .TP
  .BI verify_state_save \fR=\fPbool
  When a job exits during the write phase of a verify workload, save its
-current state. This allows fio to replay up until that point, if the
-verify state is loaded for the verify read phase.
+current state. This allows fio to replay up until that point, if the verify
+state is loaded for the verify read phase. The format of the filename is,
+roughly:
+.RS
+.RS
+.P
+<type>\-<jobname>\-<jobindex>\-verify.state.
+.RE
+.P
+<type> is "local" for a local run, "sock" for a client/server socket
+connection, and "ip" (192.168.0.1, for instance) for a networked
+client/server connection. Defaults to true.
+.RE
  .TP
  .BI verify_state_load \fR=\fPbool
-If a verify termination trigger was used, fio stores the current write
-state of each thread. This can be used at verification time so that fio
-knows how far it should verify. Without this information, fio will run
-a full verification pass, according to the settings in the job file used.
-.TP
-.B stonewall "\fR,\fP wait_for_previous"
-Wait for preceding jobs in the job file to exit before starting this one.
-\fBstonewall\fR implies \fBnew_group\fR.
-.TP
-.B new_group
-Start a new reporting group.  If not given, all jobs in a file will be part
-of the same reporting group, unless separated by a stonewall.
-.TP
-.BI stats \fR=\fPbool
-By default, fio collects and shows final output results for all jobs that run.
-If this option is set to 0, then fio will ignore it in the final stat output.
+If a verify termination trigger was used, fio stores the current write state
+of each thread. This can be used at verification time so that fio knows how
+far it should verify. Without this information, fio will run a full
+verification pass, according to the settings in the job file used. Default:
+false.
  .TP
-.BI numjobs \fR=\fPint
-Number of clones (processes/threads performing the same workload) of this job.
-Default: 1.
+.BI trim_percentage \fR=\fPint
+Number of verify blocks to discard/trim.
  .TP
-.B group_reporting
-If set, display per-group reports instead of per-job when \fBnumjobs\fR is
-specified.
+.BI trim_verify_zero \fR=\fPbool
+Verify that trim/discarded blocks are returned as zeros.
  .TP
-.B thread
-Use threads created with \fBpthread_create\fR\|(3) instead of processes created
-with \fBfork\fR\|(2).
+.BI trim_backlog \fR=\fPint
+Trim after this number of blocks are written.
  .TP
-.BI zonesize \fR=\fPint
-Divide file into zones of the specified size in bytes.  See \fBzoneskip\fR.
+.BI trim_backlog_batch \fR=\fPint
+Trim this number of I/O blocks.
  .TP
-.BI zonerange \fR=\fPint
-Give size of an IO zone.  See \fBzoneskip\fR.
+.BI experimental_verify \fR=\fPbool
+Enable experimental verification.
+.SS "Steady state"
  .TP
-.BI zoneskip \fR=\fPint
-Skip the specified number of bytes when \fBzonesize\fR bytes of data have been
-read.
+.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
+Define the criterion and limit for assessing steady state performance. The
+first parameter designates the criterion whereas the second parameter sets
+the threshold. When the criterion falls below the threshold for the
+specified duration, the job will stop. For example, `iops_slope:0.1%' will
+direct fio to terminate the job when the least squares regression slope
+falls below 0.1% of the mean IOPS. If \fBgroup_reporting\fR is enabled
+this will apply to all jobs in the group. Below is the list of available
+steady state assessment criteria. All assessments are carried out using only
+data from the rolling collection window. Threshold limits can be expressed
+as a fixed value or as a percentage of the mean in the collection window.
+.RS
+.RS
  .TP
-.BI write_iolog \fR=\fPstr
-Write the issued I/O patterns to the specified file.  Specify a separate file
-for each job, otherwise the iologs will be interspersed and the file may be
-corrupt.
+.B iops
+Collect IOPS data. Stop the job if all individual IOPS measurements
+are within the specified limit of the mean IOPS (e.g., `iops:2'
+means that all individual IOPS values must be within 2 of the mean,
+whereas `iops:0.2%' means that all individual IOPS values must be
+within 0.2% of the mean IOPS to terminate the job).
  .TP
-.BI read_iolog \fR=\fPstr
-Replay the I/O patterns contained in the specified file generated by
-\fBwrite_iolog\fR, or may be a \fBblktrace\fR binary file.
+.B iops_slope
+Collect IOPS data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
  .TP
-.BI replay_no_stall \fR=\fPint
-While replaying I/O patterns using \fBread_iolog\fR the default behavior
-attempts to respect timing information between I/Os.  Enabling
-\fBreplay_no_stall\fR causes I/Os to be replayed as fast as possible while
-still respecting ordering.
+.B bw
+Collect bandwidth data. Stop the job if all individual bandwidth
+measurements are within the specified limit of the mean bandwidth.
  .TP
-.BI replay_redirect \fR=\fPstr
-While replaying I/O patterns using \fBread_iolog\fR the default behavior
-is to replay the IOPS onto the major/minor device that each IOP was recorded
-from.  Setting \fBreplay_redirect\fR causes all IOPS to be replayed onto the
-single specified device regardless of the device it was recorded from.
+.B bw_slope
+Collect bandwidth data and calculate the least squares regression
+slope. Stop the job if the slope falls below the specified limit.
+.RE
+.RE
  .TP
-.BI replay_align \fR=\fPint
-Force alignment of IO offsets and lengths in a trace to this power of 2 value.
+.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
+A rolling window of this duration will be used to judge whether steady state
+has been reached. Data will be collected once per second. The default is 0
+which disables steady state detection. When the unit is omitted, the
+value is interpreted in seconds.
  .TP
-.BI replay_scale \fR=\fPint
-Scale sector offsets down by this factor when replaying traces.
+.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
+Allow the job to run for the specified duration before beginning data
+collection for checking the steady state job termination criterion. The
+default is 0. When the unit is omitted, the value is interpreted in seconds.
+.SS "Measurements and reporting"
  .TP
  .BI per_job_logs \fR=\fPbool
  If set, this generates bw/clat/iops log with per file private filenames. If
-not set, jobs with identical names will share the log filename. Default: true.
+not set, jobs with identical names will share the log filename. Default:
+true.
+.TP
+.BI group_reporting
+It may sometimes be interesting to display statistics for groups of jobs as
+a whole instead of for each individual job. This is especially true if
+\fBnumjobs\fR is used; looking at individual thread/process output
+quickly becomes unwieldy. To see the final report per\-group instead of
+per\-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the
+same reporting group, unless separated by a \fBstonewall\fR, or by
+using \fBnew_group\fR.
+.TP
+.BI new_group
+Start a new reporting group. See: \fBgroup_reporting\fR. If not given,
+all jobs in a file will be part of the same reporting group, unless
+separated by a \fBstonewall\fR.
+.TP
+.BI stats \fR=\fPbool
+By default, fio collects and shows final output results for all jobs
+that run. If this option is set to 0, then fio will ignore this job in
+the final stat output.
  .TP
  .BI write_bw_log \fR=\fPstr
-If given, write a bandwidth log for this job. Can be used to store data of the
-bandwidth of the jobs in their lifetime. The included fio_generate_plots script
-uses gnuplot to turn these text files into nice graphs. See \fBwrite_lat_log\fR
-for behaviour of given filename. For this option, the postfix is _bw.x.log,
-where x is the index of the job (1..N, where N is the number of jobs). If
-\fBper_job_logs\fR is false, then the filename will not include the job index.
-See the \fBLOG FILE FORMATS\fR
-section.
+If given, write a bandwidth log for this job. Can be used to store data of
+the bandwidth of the jobs in their lifetime. The included
+\fBfio_generate_plots\fR script uses gnuplot to turn these
+text files into nice graphs. See \fBwrite_lat_log\fR for behavior of
+given filename. For this option, the postfix is `_bw.x.log', where `x'
+is the index of the job (1..N, where N is the number of jobs). If
+\fBper_job_logs\fR is false, then the filename will not include the job
+index. See \fBLOG FILE FORMATS\fR section.
  .TP
  .BI write_lat_log \fR=\fPstr
-Same as \fBwrite_bw_log\fR, but writes I/O completion latencies.  If no
-filename is given with this option, the default filename of
-"jobname_type.x.log" is used, where x is the index of the job (1..N, where
-N is the number of jobs). Even if the filename is given, fio will still
-append the type of log. If \fBper_job_logs\fR is false, then the filename will
-not include the job index. See the \fBLOG FILE FORMATS\fR section.
+Same as \fBwrite_bw_log\fR, except that this option stores I/O
+submission, completion, and total latencies instead. If no filename is given
+with this option, the default filename of `jobname_type.log' is
+used. Even if the filename is given, fio will still append the type of
+log. So if one specifies:
+.RS
+.RS
+.P
+write_lat_log=foo
+.RE
+.P
+The actual log names will be `foo_slat.x.log', `foo_clat.x.log',
+and `foo_lat.x.log', where `x' is the index of the job (1..N, where N
+is the number of jobs). This helps \fBfio_generate_plots\fR find the
+logs automatically. If \fBper_job_logs\fR is false, then the filename
+will not include the job index. See \fBLOG FILE FORMATS\fR section.
+.RE
  .TP
  .BI write_hist_log \fR=\fPstr
-Same as \fBwrite_lat_log\fR, but writes I/O completion latency histograms. If
-no filename is given with this option, the default filename of
-"jobname_clat_hist.x.log" is used, where x is the index of the job (1..N, where
-N is the number of jobs). Even if the filename is given, fio will still append
-the type of log. If \fBper_job_logs\fR is false, then the filename will not
-include the job index. See the \fBLOG FILE FORMATS\fR section.
+Same as \fBwrite_lat_log\fR, but writes I/O completion latency
+histograms. If no filename is given with this option, the default filename
+of `jobname_clat_hist.x.log' is used, where `x' is the index of the
+job (1..N, where N is the number of jobs). Even if the filename is given,
+fio will still append the type of log. If \fBper_job_logs\fR is false,
+then the filename will not include the job index. See \fBLOG FILE FORMATS\fR section.
  .TP
  .BI write_iops_log \fR=\fPstr
-Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this
-option, the default filename of "jobname_type.x.log" is used, where x is the
-index of the job (1..N, where N is the number of jobs). Even if the filename
-is given, fio will still append the type of log. If \fBper_job_logs\fR is false,
-then the filename will not include the job index. See the \fBLOG FILE FORMATS\fR
-section.
+Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given
+with this option, the default filename of `jobname_type.x.log' is
+used, where `x' is the index of the job (1..N, where N is the number of
+jobs). Even if the filename is given, fio will still append the type of
+log. If \fBper_job_logs\fR is false, then the filename will not include
+the job index. See \fBLOG FILE FORMATS\fR section.
  .TP
  .BI log_avg_msec \fR=\fPint
  By default, fio will log an entry in the iops, latency, or bw log for every
-IO that completes. When writing to the disk log, that can quickly grow to a
+I/O that completes. When writing to the disk log, that can quickly grow to a
  very large size. Setting this option makes fio average each log entry
  over the specified period of time, reducing the resolution of the log. See
-\fBlog_max_value\fR as well.  Defaults to 0, logging all entries.
-.TP
-.BI log_max_value \fR=\fPbool
-If \fBlog_avg_msec\fR is set, fio logs the average over that window. If you
-instead want to log the maximum value, set this option to 1.  Defaults to
-0, meaning that averaged values are logged.
+\fBlog_max_value\fR as well. Defaults to 0, logging all entries.
+Also see \fBLOG FILE FORMATS\fR section.
  .TP
  .BI log_hist_msec \fR=\fPint
-Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms.
-Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR
-is innacurate. Setting this option makes fio log histogram entries over the
-specified period of time, reducing log sizes for high IOPS devices while
-retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults
-to 0, meaning histogram logging is disabled.
+Same as \fBlog_avg_msec\fR, but logs entries for completion latency
+histograms. Computing latency percentiles from averages of intervals using
+\fBlog_avg_msec\fR is inaccurate. Setting this option makes fio log
+histogram entries over the specified period of time, reducing log sizes for
+high IOPS devices while retaining percentile accuracy. See
+\fBlog_hist_coarseness\fR as well. Defaults to 0, meaning histogram
+logging is disabled.
  .TP
  .BI log_hist_coarseness \fR=\fPint
-Integer ranging from 0 to 6, defining the coarseness of the resolution of the
-histogram logs enabled with \fBlog_hist_msec\fR. For each increment in
-coarseness, fio outputs half as many bins. Defaults to 0, for which histogram
-logs contain 1216 latency bins. See the \fBLOG FILE FORMATS\fR section.
+Integer ranging from 0 to 6, defining the coarseness of the resolution of
+the histogram logs enabled with \fBlog_hist_msec\fR. For each increment
+in coarseness, fio outputs half as many bins. Defaults to 0, for which
+histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section.
+.TP
+.BI log_max_value \fR=\fPbool
+If \fBlog_avg_msec\fR is set, fio logs the average over that window. If
+you instead want to log the maximum value, set this option to 1. Defaults to
+0, meaning that averaged values are logged.
  .TP
  .BI log_offset \fR=\fPbool
-If this is set, the iolog options will include the byte offset for the IO
-entry as well as the other data values.
+If this is set, the iolog options will include the byte offset for the I/O
+entry as well as the other data values. Defaults to 0, meaning that
+offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
  .TP
  .BI log_compression \fR=\fPint
-If this is set, fio will compress the IO logs as it goes, to keep the memory
-footprint lower. When a log reaches the specified size, that chunk is removed
-and compressed in the background. Given that IO logs are fairly highly
-compressible, this yields a nice memory savings for longer runs. The downside
-is that the compression will consume some background CPU cycles, so it may
-impact the run. This, however, is also true if the logging ends up consuming
-most of the system memory. So pick your poison. The IO logs are saved
-normally at the end of a run, by decompressing the chunks and storing them
-in the specified log file. This feature depends on the availability of zlib.
+If this is set, fio will compress the I/O logs as it goes, to keep the
+memory footprint lower. When a log reaches the specified size, that chunk is
+removed and compressed in the background. Given that I/O logs are fairly
+highly compressible, this yields a nice memory savings for longer runs. The
+downside is that the compression will consume some background CPU cycles, so
+it may impact the run. This, however, is also true if the logging ends up
+consuming most of the system memory. So pick your poison. The I/O logs are
+saved normally at the end of a run, by decompressing the chunks and storing
+them in the specified log file. This feature depends on the availability of
+zlib.
  .TP
  .BI log_compression_cpus \fR=\fPstr
-Define the set of CPUs that are allowed to handle online log compression
-for the IO jobs. This can provide better isolation between performance
+Define the set of CPUs that are allowed to handle online log compression for
+the I/O jobs. This can provide better isolation between performance
  sensitive jobs, and background compression work.
  .TP
  .BI log_store_compressed \fR=\fPbool
  If set, fio will store the log files in a compressed format. They can be
-decompressed with fio, using the \fB\-\-inflate-log\fR command line parameter.
-The files will be stored with a \fB\.fz\fR suffix.
+decompressed with fio, using the \fB\-\-inflate\-log\fR command line
+parameter. The files will be stored with a `.fz' suffix.
  .TP
  .BI log_unix_epoch \fR=\fPbool
  If set, fio will log Unix timestamps to the log files produced by enabling
-\fBwrite_type_log\fR for each log type, instead of the default zero-based
+write_type_log for each log type, instead of the default zero\-based
  timestamps.
  .TP
  .BI block_error_percentiles \fR=\fPbool
-If set, record errors in trim block-sized units from writes and trims and output
-a histogram of how many trims it took to get to errors, and what kind of error
-was encountered.
-.TP
-.BI disable_lat \fR=\fPbool
-Disable measurements of total latency numbers. Useful only for cutting
-back the number of calls to \fBgettimeofday\fR\|(2), as that does impact performance at
-really high IOPS rates.  Note that to really get rid of a large amount of these
-calls, this option must be used with disable_slat and disable_bw as well.
-.TP
-.BI disable_clat \fR=\fPbool
-Disable measurements of completion latency numbers. See \fBdisable_lat\fR.
-.TP
-.BI disable_slat \fR=\fPbool
-Disable measurements of submission latency numbers. See \fBdisable_lat\fR.
-.TP
-.BI disable_bw_measurement \fR=\fPbool
-Disable measurements of throughput/bandwidth numbers. See \fBdisable_lat\fR.
-.TP
-.BI lockmem \fR=\fPint
-Pin the specified amount of memory with \fBmlock\fR\|(2).  Can be used to
-simulate a smaller amount of memory. The amount specified is per worker.
-.TP
-.BI exec_prerun \fR=\fPstr
-Before running the job, execute the specified command with \fBsystem\fR\|(3).
-.RS
-Output is redirected in a file called \fBjobname.prerun.txt\fR
-.RE
+If set, record errors in trim block\-sized units from writes and trims and
+output a histogram of how many trims it took to get to errors, and what kind
+of error was encountered.
  .TP
-.BI exec_postrun \fR=\fPstr
-Same as \fBexec_prerun\fR, but the command is executed after the job completes.
-.RS
-Output is redirected in a file called \fBjobname.postrun.txt\fR
-.RE
+.BI bwavgtime \fR=\fPint
+Average the calculated bandwidth over the given time. Value is specified in
+milliseconds. If the job also does bandwidth logging through
+\fBwrite_bw_log\fR, then the minimum of this option and
+\fBlog_avg_msec\fR will be used. Default: 500ms.
  .TP
-.BI ioscheduler \fR=\fPstr
-Attempt to switch the device hosting the file to the specified I/O scheduler.
+.BI iopsavgtime \fR=\fPint
+Average the calculated IOPS over the given time. Value is specified in
+milliseconds. If the job also does IOPS logging through
+\fBwrite_iops_log\fR, then the minimum of this option and
+\fBlog_avg_msec\fR will be used. Default: 500ms.
  .TP
  .BI disk_util \fR=\fPbool
-Generate disk utilization statistics if the platform supports it. Default: true.
-.TP
-.BI clocksource \fR=\fPstr
-Use the given clocksource as the base of timing. The supported options are:
-.RS
+Generate disk utilization statistics, if the platform supports it.
+Default: true.
  .TP
-.B gettimeofday
-\fBgettimeofday\fR\|(2)
+.BI disable_lat \fR=\fPbool
+Disable measurements of total latency numbers. Useful only for cutting back
+the number of calls to \fBgettimeofday\fR\|(2), as that does impact
+performance at really high IOPS rates. Note that to really get rid of a
+large amount of these calls, this option must be used with
+\fBdisable_slat\fR and \fBdisable_bw_measurement\fR as well.
  .TP
-.B clock_gettime
-\fBclock_gettime\fR\|(2)
+.BI disable_clat \fR=\fPbool
+Disable measurements of completion latency numbers. See
+\fBdisable_lat\fR.
  .TP
-.B cpu
-Internal CPU clock source
+.BI disable_slat \fR=\fPbool
+Disable measurements of submission latency numbers. See
+\fBdisable_lat\fR.
  .TP
-.RE
-.P
-\fBcpu\fR is the preferred clocksource if it is reliable, as it is very fast
-(and fio is heavy on time calls). Fio will automatically use this clocksource
-if it's supported and considered reliable on the system it is running on,
-unless another clocksource is specifically set. For x86/x86-64 CPUs, this
-means supporting TSC Invariant.
+.BI disable_bw_measurement \fR=\fPbool "\fR,\fP disable_bw" \fR=\fPbool
+Disable measurements of throughput/bandwidth numbers. See
+\fBdisable_lat\fR.
  .TP
-.BI gtod_reduce \fR=\fPbool
-Enable all of the \fBgettimeofday\fR\|(2) reducing options (disable_clat, disable_slat,
-disable_bw) plus reduce precision of the timeout somewhat to really shrink the
-\fBgettimeofday\fR\|(2) call count. With this option enabled, we only do about 0.4% of
-the gtod() calls we would have done if all time keeping was enabled.
+.BI clat_percentiles \fR=\fPbool
+Enable the reporting of percentiles of completion latencies. This option is
+mutually exclusive with \fBlat_percentiles\fR.
  .TP
-.BI gtod_cpu \fR=\fPint
-Sometimes it's cheaper to dedicate a single thread of execution to just getting
-the current time. Fio (and databases, for instance) are very intensive on
-\fBgettimeofday\fR\|(2) calls. With this option, you can set one CPU aside for doing
-nothing but logging current time to a shared memory location. Then the other
-threads/processes that run IO workloads need only copy that segment, instead of
-entering the kernel with a \fBgettimeofday\fR\|(2) call. The CPU set aside for doing
-these time calls will be excluded from other uses. Fio will manually clear it
-from the CPU mask of other jobs.
+.BI lat_percentiles \fR=\fPbool
+Enable the reporting of percentiles of I/O latencies. This is similar to
+\fBclat_percentiles\fR, except that this includes the submission latency.
+This option is mutually exclusive with \fBclat_percentiles\fR.
  .TP
-.BI ignore_error \fR=\fPstr
-Sometimes you want to ignore some errors during test in that case you can specify
-error list for each error type.
-.br
-ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST
-.br
-errors for given error type is separated with ':'.
-Error may be symbol ('ENOSPC', 'ENOMEM') or an integer.
-.br
-Example: ignore_error=EAGAIN,ENOSPC:122 .
-.br
-This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE.
+.BI percentile_list \fR=\fPfloat_list
+Overwrite the default list of percentiles for completion latencies and the
+block error histogram. Each number is a floating point number in the range
+(0,100], and the maximum length of the list is 20. Use ':' to separate the
+numbers, and list the numbers in ascending order. For example,
+`\-\-percentile_list=99.5:99.9' will cause fio to report the values of
+completion latency below which 99.5% and 99.9% of the observed latencies
+fell, respectively.
+.SS "Error handling"
  .TP
-.BI error_dump \fR=\fPbool
-If set dump every error even if it is non fatal, true by default. If disabled
-only fatal error will be dumped
+.BI exitall_on_error
+When one job finishes in error, terminate the rest. The default is to wait
+for each job to finish.
  .TP
-.BI profile \fR=\fPstr
-Select a specific builtin performance test.
+.BI continue_on_error \fR=\fPstr
+Normally fio will exit the job on the first observed failure. If this option
+is set, fio will continue the job when there is a 'non\-fatal error' (EIO or
+EILSEQ) until the runtime is exceeded or the I/O size specified is
+completed. If this option is used, there are two more stats that are
+appended, the total error count and the first error. The error field given
+in the stats is the first error that was hit during the run.
+The allowed values are:
+.RS
+.RS
  .TP
-.BI cgroup \fR=\fPstr
-Add job to this control group. If it doesn't exist, it will be created.
-The system must have a mounted cgroup blkio mount point for this to work. If
-your system doesn't have it mounted, you can do so with:
-
-# mount \-t cgroup \-o blkio none /cgroup
+.B none
+Exit on any I/O or verify errors.
  .TP
-.BI cgroup_weight \fR=\fPint
-Set the weight of the cgroup to this value. See the documentation that comes
-with the kernel, allowed values are in the range of 100..1000.
+.B read
+Continue on read errors, exit on all others.
  .TP
-.BI cgroup_nodelete \fR=\fPbool
-Normally fio will delete the cgroups it has created after the job completion.
-To override this behavior and to leave cgroups around after the job completion,
-set cgroup_nodelete=1. This can be useful if one wants to inspect various
-cgroup files after job completion. Default: false
+.B write
+Continue on write errors, exit on all others.
  .TP
-.BI uid \fR=\fPint
-Instead of running as the invoking user, set the user ID to this value before
-the thread/process does any work.
+.B io
+Continue on any I/O error, exit on all others.
  .TP
-.BI gid \fR=\fPint
-Set group ID, see \fBuid\fR.
+.B verify
+Continue on verify errors, exit on all others.
  .TP
-.BI unit_base \fR=\fPint
-Base unit for reporting.  Allowed values are:
-.RS
+.B all
+Continue on all errors.
  .TP
  .B 0
-Use auto-detection (default).
-.TP
-.B 8
-Byte based.
+Backward\-compatible alias for 'none'.
  .TP
  .B 1
-Bit based.
+Backward\-compatible alias for 'all'.
+.RE
  .RE
-.P
-.TP
-.BI flow_id \fR=\fPint
-The ID of the flow. If not specified, it defaults to being a global flow. See
-\fBflow\fR.
-.TP
-.BI flow \fR=\fPint
-Weight in token-based flow control. If this value is used, then there is a
-\fBflow counter\fR which is used to regulate the proportion of activity between
-two or more jobs. fio attempts to keep this flow counter near zero. The
-\fBflow\fR parameter stands for how much should be added or subtracted to the
-flow counter on each iteration of the main I/O loop. That is, if one job has
-\fBflow=8\fR and another job has \fBflow=-1\fR, then there will be a roughly
-1:8 ratio in how much one runs vs the other.
-.TP
-.BI flow_watermark \fR=\fPint
-The maximum value that the absolute value of the flow counter is allowed to
-reach before the job must wait for a lower value of the counter.
-.TP
-.BI flow_sleep \fR=\fPint
-The period of time, in microseconds, to wait after the flow watermark has been
-exceeded before retrying operations
-.TP
-.BI clat_percentiles \fR=\fPbool
-Enable the reporting of percentiles of completion latencies.
-.TP
-.BI percentile_list \fR=\fPfloat_list
-Overwrite the default list of percentiles for completion latencies and the
-block error histogram. Each number is a floating number in the range (0,100],
-and the maximum length of the list is 20. Use ':' to separate the
-numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to
-report the values of completion latency below which 99.5% and 99.9% of
-the observed latencies fell, respectively.
-.SS "Ioengine Parameters List"
-Some parameters are only valid when a specific ioengine is in use. These are
-used identically to normal parameters, with the caveat that when used on the
-command line, they must come after the ioengine.
-.TP
-.BI (cpuio)cpuload \fR=\fPint
-Attempt to use the specified percentage of CPU cycles.
-.TP
-.BI (cpuio)cpuchunks \fR=\fPint
-Split the load into cycles of the given time. In microseconds.
-.TP
-.BI (cpuio)exit_on_io_done \fR=\fPbool
-Detect when IO threads are done, then exit.
-.TP
-.BI (libaio)userspace_reap
-Normally, with the libaio engine in use, fio will use
-the io_getevents system call to reap newly returned events.
-With this flag turned on, the AIO ring will be read directly
-from user-space to reap events. The reaping mode is only
-enabled when polling for a minimum of 0 events (eg when
-iodepth_batch_complete=0).
-.TP
-.BI (pvsync2)hipri
-Set RWF_HIPRI on IO, indicating to the kernel that it's of
-higher priority than normal.
-.TP
-.BI (net,netsplice)hostname \fR=\fPstr
-The host name or IP address to use for TCP or UDP based IO.
-If the job is a TCP listener or UDP reader, the hostname is not
-used and must be omitted unless it is a valid UDP multicast address.
-.TP
-.BI (net,netsplice)port \fR=\fPint
-The TCP or UDP port to bind to or connect to. If this is used with
-\fBnumjobs\fR to spawn multiple instances of the same job type, then
-this will be the starting port number since fio will use a range of ports.
-.TP
-.BI (net,netsplice)interface \fR=\fPstr
-The IP address of the network interface used to send or receive UDP multicast
-packets.
-.TP
-.BI (net,netsplice)ttl \fR=\fPint
-Time-to-live value for outgoing UDP multicast packets. Default: 1
-.TP
-.BI (net,netsplice)nodelay \fR=\fPbool
-Set TCP_NODELAY on TCP connections.
  .TP
-.BI (net,netsplice)protocol \fR=\fPstr "\fR,\fP proto" \fR=\fPstr
-The network protocol to use. Accepted values are:
+.BI ignore_error \fR=\fPstr
+Sometimes you want to ignore some errors during a test. In that case you can
+specify an error list for each error type, instead of only being able to
+ignore the default 'non\-fatal error' using \fBcontinue_on_error\fR. The
+format is `ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST', where
+the errors for a given error type are separated with ':'. An error may be a
+symbol ('ENOSPC', 'ENOMEM') or an integer. Example:
  .RS
  .RS
+.P
+ignore_error=EAGAIN,ENOSPC:122
+.RE
+.P
+This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
+WRITE. This option works by overriding \fBcontinue_on_error\fR with
+the list of errors for each error type if any.
+.RE
  .TP
-.B tcp
-Transmission control protocol
-.TP
-.B tcpv6
-Transmission control protocol V6
+.BI error_dump \fR=\fPbool
+If set, dump every error even if it is non\-fatal; true by default. If
+disabled, only fatal errors will be dumped.
+.SS "Running predefined workloads"
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
  .TP
-.B udp
-User datagram protocol
+.BI profile \fR=\fPstr
+The predefined workload to run. Current profiles are:
+.RS
+.RS
  .TP
-.B udpv6
-User datagram protocol V6
+.B tiobench
+Threaded I/O bench (tiotest/tiobench) like workload.
  .TP
-.B unix
-UNIX domain socket
+.B act
+Aerospike Certification Tool (ACT) like workload.
+.RE
  .RE
  .P
-When the protocol is TCP or UDP, the port must also be given,
-as well as the hostname if the job is a TCP listener or UDP
-reader. For unix sockets, the normal filename option should be
-used and the port is invalid.
+To view a profile's additional options use \fB\-\-cmdhelp\fR after specifying
+the profile. For example:
+.RS
+.TP
+$ fio \-\-profile=act \-\-cmdhelp
  .RE
+.SS "Act profile options"
  .TP
-.BI (net,netsplice)listen
-For TCP network connections, tell fio to listen for incoming
-connections rather than initiating an outgoing connection. The
-hostname must be omitted if this option is used.
+.BI device\-names \fR=\fPstr
+Devices to use.
  .TP
-.BI (net, pingpong) \fR=\fPbool
-Normally a network writer will just continue writing data, and a network reader
-will just consume packets. If pingpong=1 is set, a writer will send its normal
-payload to the reader, then wait for the reader to send the same payload back.
-This allows fio to measure network latencies. The submission and completion
-latencies then measure local time spent sending or receiving, and the
-completion latency measures how long it took for the other end to receive and
-send back. For UDP multicast traffic pingpong=1 should only be set for a single
-reader when multiple readers are listening to the same address.
+.BI load \fR=\fPint
+ACT load multiplier. Default: 1.
  .TP
-.BI (net, window_size) \fR=\fPint
-Set the desired socket buffer size for the connection.
+.BI test\-duration \fR=\fPtime
+How long the entire test takes to run. When the unit is omitted, the value
+is given in seconds. Default: 24h.
  .TP
-.BI (net, mss) \fR=\fPint
-Set the TCP maximum segment size (TCP_MAXSEG).
+.BI threads\-per\-queue \fR=\fPint
+Number of read I/O threads per device. Default: 8.
  .TP
-.BI (e4defrag,donorname) \fR=\fPstr
-File will be used as a block donor (swap extents between files)
+.BI read\-req\-num\-512\-blocks \fR=\fPint
+Number of 512B blocks to read at a time. Default: 3.
  .TP
-.BI (e4defrag,inplace) \fR=\fPint
-Configure donor file block allocation strategy
-.RS
-.BI 0(default) :
-Preallocate donor's file on init
+.BI large\-block\-op\-kbytes \fR=\fPint
+Size of large block ops in KiB (writes). Default: 131072.
  .TP
-.BI 1:
-allocate space immediately inside defragment event, and free right after event
-.RE
+.BI prep
+Set to run ACT prep phase.
+.SS "Tiobench profile options"
  .TP
-.BI (rbd)clustername \fR=\fPstr
-Specifies the name of the ceph cluster.
+.BI size \fR=\fPstr
+Size in MiB.
  .TP
-.BI (rbd)rbdname \fR=\fPstr
-Specifies the name of the RBD.
+.BI block \fR=\fPint
+Block size in bytes. Default: 4096.
  .TP
-.BI (rbd)pool \fR=\fPstr
-Specifies the name of the Ceph pool containing the RBD.
+.BI numruns \fR=\fPint
+Number of runs.
  .TP
-.BI (rbd)clientname \fR=\fPstr
-Specifies the username (without the 'client.' prefix) used to access the Ceph
-cluster. If the clustername is specified, the clientname shall be the full
-type.id string. If no type. prefix is given, fio will add 'client.' by default.
+.BI dir \fR=\fPstr
+Test directory.
  .TP
-.BI (mtd)skipbad \fR=\fPbool
-Skip operations against known bad blocks.
+.BI threads \fR=\fPint
+Number of threads.
  .SH OUTPUT
-While running, \fBfio\fR will display the status of the created jobs.  For
-example:
-.RS
-.P
-Jobs: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
-.RE
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be:
  .P
-The characters in the first set of brackets denote the current status of each
-threads.  The possible values are:
+.nf
+               Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+.fi
  .P
-.PD 0
+The characters inside the first set of square brackets denote the current status of
+each thread. The first character is the first job defined in the job file, and so
+forth. The possible values (in typical life cycle order) are:
  .RS
  .TP
+.PD 0
  .B P
-Setup but not started.
+Thread setup, but not started.
  .TP
  .B C
  Thread created.
  .TP
  .B I
-Initialized, waiting.
+Thread initialized, waiting or generating necessary data.
+.TP
+.B P
+Thread running pre\-reading file(s).
+.TP
+.B /
+Thread is in ramp period.
  .TP
  .B R
  Running, doing sequential reads.
@@ -2031,570 +2734,759 @@ Running, doing mixed sequential reads/writes.
  .B m
  Running, doing mixed random reads/writes.
  .TP
+.B D
+Running, doing sequential trims.
+.TP
+.B d
+Running, doing random trims.
+.TP
  .B F
  Running, currently waiting for \fBfsync\fR\|(2).
  .TP
  .B V
-Running, verifying written data.
+Running, doing verification of written data.
+.TP
+.B f
+Thread finishing.
  .TP
  .B E
-Exited, not reaped by main thread.
+Thread exited, not reaped by main thread yet.
  .TP
  .B \-
-Exited, thread reaped.
-.RE
+Thread reaped.
+.TP
+.B X
+Thread reaped, exited with an error.
+.TP
+.B K
+Thread reaped, exited due to signal.
  .PD
+.RE
+.P
+Fio will condense the thread string so as not to take up more space on the command
+line than needed. For instance, if you have 10 readers and 10 writers running,
+the output would look like this:
+.P
+.nf
+               Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+.fi
+.P
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what. In the example above this means that jobs 1\-\-10
+are readers and 11\-\-20 are writers.
  .P
-The second set of brackets shows the estimated completion percentage of
-the current group.  The third set shows the read and write I/O rate,
-respectively. Finally, the estimated run time of the job is displayed.
+The other values are fairly self\-explanatory \-\- number of threads currently
+running and doing I/O, the number of currently open files (f=), the estimated
+completion percentage, the rate of I/O since last check (read speed listed first,
+then write speed and optionally trim speed) in terms of bandwidth and IOPS,
+and time to completion for the current running group. It's impossible to estimate
+runtime of the following groups (if any).
  .P
-When \fBfio\fR completes (or is interrupted by Ctrl-C), it will show data
-for each thread, each group of threads, and each disk, in that order.
+When fio is done (or interrupted by Ctrl\-C), it will show the data for
+each thread, group of threads, and disks in that order. For each overall thread (or
+group) the output looks like:
  .P
-Per-thread statistics first show the threads client number, group-id, and
-error code.  The remaining figures are as follows:
+.nf
+               Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
+                 write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
+                   slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
+                   clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
+                    lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
+                   clat percentiles (usec):
+                    |  1.00th=[  302],  5.00th=[  326], 10.00th=[  343], 20.00th=[  363],
+                    | 30.00th=[  392], 40.00th=[  404], 50.00th=[  416], 60.00th=[  445],
+                    | 70.00th=[  816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
+                    | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
+                    | 99.99th=[78119]
+                  bw (  KiB/s): min=  532, max=  686, per=0.10%, avg=622.87, stdev=24.82, samples=  100
+                  iops        : min=   76, max=   98, avg=88.98, stdev= 3.54, samples=  100
+                 lat (usec)   : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
+                 lat (msec)   : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
+                 lat (msec)   : 100=0.65%
+                 cpu          : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
+                 IO depths    : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
+                    submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+                    complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+                    issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
+                    latency   : target=0, window=0, percentile=100.00%, depth=8
+.fi
+.P
+The job name (or first job's name when using \fBgroup_reporting\fR) is printed,
+along with the group id, count of jobs being aggregated, last error id seen (which
+is 0 when there are no errors), pid/tid of that thread and the time the job/group
+completed. Below are the I/O statistics for each data direction performed (showing
+writes in the example above). In the order listed, they denote:
  .RS
  .TP
-.B io
-Number of megabytes of I/O performed.
-.TP
-.B bw
-Average data rate (bandwidth).
-.TP
-.B runt
-Threads run time.
+.B read/write/trim
+The string before the colon shows the I/O direction the statistics
+are for. \fIIOPS\fR is the average I/Os performed per second. \fIBW\fR
+is the average bandwidth rate shown as: value in power of 2 format
+(value in power of 10 format). The last two values show: (total
+I/O performed in power of 2 format / \fIruntime\fR of that thread).
  .TP
  .B slat
-Submission latency minimum, maximum, average and standard deviation. This is
-the time it took to submit the I/O.
+Submission latency (\fImin\fR being the minimum, \fImax\fR being the
+maximum, \fIavg\fR being the average, \fIstdev\fR being the standard
+deviation). This is the time it took to submit the I/O. For
+sync I/O this row is not displayed as the slat is really the
+completion latency (since queue/complete is one operation there).
+This value can be in nanoseconds, microseconds or milliseconds \-\-\-
+fio will choose the most appropriate base and print that (in the
+example above nanoseconds was the best scale). Note: in \fB\-\-minimal\fR mode
+latencies are always expressed in microseconds.
  .TP
  .B clat
-Completion latency minimum, maximum, average and standard deviation.  This
-is the time between submission and completion.
+Completion latency. Same names as slat, this denotes the time from
+submission to completion of the I/O pieces. For sync I/O, clat will
+usually be equal (or very close) to 0, as the time from submit to
+complete is basically just CPU time (I/O has already been done, see slat
+explanation).
+.TP
+.B lat
+Total latency. Same names as slat and clat, this denotes the time from
+when fio created the I/O unit to completion of the I/O operation.
  .TP
  .B bw
-Bandwidth minimum, maximum, percentage of aggregate bandwidth received, average
-and standard deviation.
+Bandwidth statistics based on samples. Same names as the slat/clat/lat stats,
+but also includes the number of samples taken (\fIsamples\fR) and an
+approximate percentage of total aggregate bandwidth this thread
+received in its group (\fIper\fR). This last value is only really
+useful if the threads in this group are on the same disk, since they
+are then competing for disk access.
  .TP
-.B cpu
-CPU usage statistics. Includes user and system time, number of context switches
-this thread went through and number of major and minor page faults. The CPU
-utilization numbers are averages for the jobs in that reporting group, while
-the context and fault counters are summed.
+.B iops
+IOPS statistics based on samples. Same names as \fBbw\fR.
  .TP
-.B IO depths
-Distribution of I/O depths.  Each depth includes everything less than (or equal)
-to it, but greater than the previous depth.
+.B lat (nsec/usec/msec)
+The distribution of I/O completion latencies. This is the time from when
+I/O leaves fio to when it gets completed. Unlike the separate
+read/write/trim sections above, the data here and in the remaining
+sections apply to all I/Os for the reporting group. 250=0.04% means that
+0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+of the I/Os required 250 to 499us for completion.
  .TP
-.B IO issued
-Number of read/write requests issued, and number of short read/write requests.
+.B cpu
+CPU usage. Reports user and system time, along with the number of
+context switches this thread went through, and finally the number of
+major and minor page faults. The CPU utilization
+numbers are averages for the jobs in that reporting group, while the
+context and fault counters are summed.
  .TP
-.B IO latencies
-Distribution of I/O completion latencies.  The numbers follow the same pattern
-as \fBIO depths\fR.
+.B IO depths
+The distribution of I/O depths over the job lifetime. The numbers are
+divided into powers of 2 and each entry covers depths from that value
+up to those that are lower than the next entry \-\- e.g., 16= covers
+depths from 16 to 31. Note that the range covered by a depth
+distribution entry can be different to the range covered by the
+equivalent \fBsubmit\fR/\fBcomplete\fR distribution entry.
+.TP
+.B IO submit
+How many pieces of I/O were submitted in a single submit call. Each
+entry denotes that amount and below, until the previous entry \-\- e.g.,
+16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
+call. Note that the range covered by a \fBsubmit\fR distribution entry can
+be different to the range covered by the equivalent depth distribution
+entry.
+.TP
+.B IO complete
+Like the above \fBsubmit\fR number, but for completions instead.
+.TP
+.B IO issued rwt
+The number of \fBread/write/trim\fR requests issued, and how many of them were
+short or dropped.
+.TP
+.B IO latency
+These values are for \fBlatency_target\fR and related options. When
+these options are engaged, this section describes the I/O depth required
+to meet the specified latency target.
  .RE
  .P
-The group statistics show:
-.PD 0
+After each client has been listed, the group statistics are printed. They
+will look like this:
+.P
+.nf
+               Run status group 0 (all jobs):
+                  READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s\-10.8MiB/s (10.9MB/s\-11.3MB/s), io=64.0MiB (67.1MB), run=2973\-3069msec
+                 WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s\-621KiB/s (630kB/s\-636kB/s), io=64.0MiB (67.1MB), run=52747\-53223msec
+.fi
+.P
+For each data direction it prints:
  .RS
  .TP
-.B io
-Number of megabytes I/O performed.
-.TP
-.B aggrb
-Aggregate bandwidth of threads in the group.
-.TP
-.B minb
-Minimum average bandwidth a thread saw.
-.TP
-.B maxb
-Maximum average bandwidth a thread saw.
+.B bw
+Aggregate bandwidth of threads in this group followed by the
+minimum and maximum bandwidth of all the threads in this group.
+Values outside of brackets are power\-of\-2 format and those
+within are the equivalent value in a power\-of\-10 format.
  .TP
-.B mint
-Shortest runtime of threads in the group.
+.B io
+Aggregate I/O performed of all threads in this group. The
+format is the same as \fBbw\fR.
  .TP
-.B maxt
-Longest runtime of threads in the group.
+.B run
+The shortest and longest runtimes of the threads in this group.
  .RE
-.PD
  .P
-Finally, disk statistics are printed with reads first:
-.PD 0
+And finally, the disk statistics are printed. This is Linux specific.
+They will look like this:
+.P
+.nf
+                 Disk stats (read/write):
+                   sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+.fi
+.P
+Each value is printed for both reads and writes, with reads first. The
+numbers denote:
  .RS
  .TP
  .B ios
  Number of I/Os performed by all groups.
  .TP
  .B merge
-Number of merges in the I/O scheduler.
+Number of merges performed by the I/O scheduler.
  .TP
  .B ticks
  Number of ticks we kept the disk busy.
  .TP
-.B io_queue
+.B in_queue
  Total time spent in the disk queue.
  .TP
  .B util
-Disk utilization.
+The disk utilization. A value of 100% means we kept the disk
+busy constantly, 50% would be a disk idling half of the time.
  .RE
-.PD
  .P
-It is also possible to get fio to dump the current output while it is
-running, without terminating the job. To do that, send fio the \fBUSR1\fR
-signal.
+It is also possible to get fio to dump the current output while it is running,
+without terminating the job. To do that, send fio the USR1 signal. You can
+also get regularly timed dumps by using the \fB\-\-status\-interval\fR
+parameter, or by creating a file in `/tmp' named
+`fio\-dump\-status'. If fio sees this file, it will unlink it and dump the
+current output status.
  .SH TERSE OUTPUT
-If the \fB\-\-minimal\fR / \fB\-\-append-terse\fR options are given, the
-results will be printed/appended in a semicolon-delimited format suitable for
-scripted use.
-A job description (if provided) follows on a new line.  Note that the first
-number in the line is the version number. If the output has to be changed
-for some reason, this number will be incremented by 1 to signify that
-change.  The fields are:
+For scripted usage where you typically want to generate tables or graphs of the
+results, fio can output the results in a semicolon separated format. The format
+is one long line of values, such as:
  .P
-.RS
-.B terse version, fio version, jobname, groupid, error
+.nf
+               2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
+               A description of this job goes here.
+.fi
  .P
-Read status:
-.RS
-.B Total I/O \fR(KiB)\fP, bandwidth \fR(KiB/s)\fP, IOPS, runtime \fR(ms)\fP
+The job description (if provided) follows on a second line.
  .P
-Submission latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Completion latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Completion latency percentiles (20 fields):
-.RS
-.B Xth percentile=usec
-.RE
-Total latency:
-.RS
-.B min, max, mean, standard deviation
-.RE
-Bandwidth:
-.RS
-.B min, max, aggregate percentage of total, mean, standard deviation
-.RE
-.RE
+To enable terse output, use the \fB\-\-minimal\fR or
+`\-\-output\-format=terse' command line options. The
+first value is the version of the terse output format. If the output has to be
+changed for some reason, this number will be incremented by 1 to signify that
+change.
  .P
-Write status:
+Split up, the format is as follows (comments in brackets denote when a
+field was introduced or whether it's specific to some terse version):
+.P
+.nf
+                       terse version, fio version [v3], jobname, groupid, error
+.fi
  .RS
-.B Total I/O \fR(KiB)\fP, bandwidth \fR(KiB/s)\fP, IOPS, runtime \fR(ms)\fP
  .P
-Submission latency:
-.RS
-.B min, max, mean, standard deviation
+.B
+READ status:
  .RE
-Completion latency:
+.P
+.nf
+                       Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+                       Submission latency: min, max, mean, stdev (usec)
+                       Completion latency: min, max, mean, stdev (usec)
+                       Completion latency percentiles: 20 fields (see below)
+                       Total latency: min, max, mean, stdev (usec)
+                       Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+                       IOPS [v5]: min, max, mean, stdev, number of samples
+.fi
  .RS
-.B min, max, mean, standard deviation
+.P
+.B
+WRITE status:
  .RE
-Completion latency percentiles (20 fields):
+.P
+.nf
+                       Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+                       Submission latency: min, max, mean, stdev (usec)
+                       Completion latency: min, max, mean, stdev (usec)
+                       Completion latency percentiles: 20 fields (see below)
+                       Total latency: min, max, mean, stdev (usec)
+                       Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+                       IOPS [v5]: min, max, mean, stdev, number of samples
+.fi
  .RS
-.B Xth percentile=usec
+.P
+.B
+TRIM status [all but version 3]:
  .RE
-Total latency:
+.P
+.nf
+                       Fields are similar to \fBREAD/WRITE\fR status.
+.fi
  .RS
-.B min, max, mean, standard deviation
+.P
+.B
+CPU usage:
  .RE
-Bandwidth:
+.P
+.nf
+                       user, system, context switches, major faults, minor faults
+.fi
  .RS
-.B min, max, aggregate percentage of total, mean, standard deviation
-.RE
+.P
+.B
+I/O depths:
  .RE
  .P
-CPU usage:
+.nf
+                       <=1, 2, 4, 8, 16, 32, >=64
+.fi
  .RS
-.B user, system, context switches, major page faults, minor page faults
+.P
+.B
+I/O latencies microseconds:
  .RE
  .P
-IO depth distribution:
+.nf
+                       <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+.fi
  .RS
-.B <=1, 2, 4, 8, 16, 32, >=64
+.P
+.B
+I/O latencies milliseconds:
  .RE
  .P
-IO latency distribution:
-.RS
-Microseconds:
+.nf
+                       <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
+.fi
  .RS
-.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+.P
+.B
+Disk utilization [v3]:
  .RE
-Milliseconds:
+.P
+.nf
+                       disk name, read ios, write ios, read merges, write merges, read ticks, write ticks, time spent in queue, disk utilization percentage
+.fi
  .RS
-.B <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
-.RE
+.P
+.B
+Additional Info (dependent on continue_on_error, default off):
  .RE
  .P
-Disk utilization (1 for each disk used):
+.nf
+                       total # errors, first error code
+.fi
  .RS
-.B name, read ios, write ios, read merges, write merges, read ticks, write ticks, read in-queue time, write in-queue time, disk utilization percentage
+.P
+.B
+Additional Info (dependent on description being set):
  .RE
  .P
-Error Info (dependent on continue_on_error, default off):
+.nf
+                       Text description
+.fi
+.P
+Completion latency percentiles can be a grouping of up to 20 sets, so for the
+terse output fio writes all of them. Each field will look like this:
+.P
+.nf
+               1.00%=6112
+.fi
+.P
+which is the Xth percentile, and the `usec' latency associated with it.
+.P
+For \fBDisk utilization\fR, all disks used by fio are shown. So for each disk there
+will be a disk utilization section.
+.P
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons:
+.P
+.nf
+               terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+.fi
+.SH JSON OUTPUT
+The \fBjson\fR output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+\fBnormal\fR output. The \fBruntime\fR value is reported in msec and the \fBbw\fR value is
+reported in 1024 bytes per second units.
+.fi
+.SH JSON+ OUTPUT
+The \fBjson+\fR output format is identical to the \fBjson\fR output format except that it
+adds a full dump of the completion latency bins. Each \fBbins\fR object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider:
  .RS
-.B total # errors, first error code
-.RE
  .P
-.B text description (if provided in config - appears on newline)
+"bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
  .RE
+.P
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+.P
+Also included with fio is a Python script \fBfio_jsonplus_clat2csv\fR that takes
+json+ output and generates CSV\-formatted latency data suitable for plotting.
+.P
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to `stat.h' in the fio source.
  .SH TRACE FILE FORMAT
-There are two trace file format that you can encounter. The older (v1) format
-is unsupported since version 1.20-rc3 (March 2008). It will still be described
+There are two trace file formats that you can encounter. The older (v1) format is
+unsupported since version 1.20\-rc3 (March 2008). It will still be described
  below in case that you get an old trace and want to understand it.
-
-In any case the trace is a simple text file with a single action per line.
-
  .P
+In any case the trace is a simple text file with a single action per line.
+.TP
  .B Trace file format v1
+Each line represents a single I/O action in the following format:
  .RS
-Each line represents a single io action in the following format:
-
+.RS
+.P
  rw, offset, length
-
-where rw=0/1 for read/write, and the offset and length entries being in bytes.
-
-This format is not supported in Fio versions => 1.20-rc3.
-
  .RE
  .P
+where `rw=0/1' for read/write, and the `offset' and `length' entries are in bytes.
+.P
+This format is not supported in fio versions >= 1.20\-rc3.
+.RE
+.TP
  .B Trace file format v2
+The second version of the trace file format was added in fio version 1.17. It
+allows one to access more than one file per trace and has a bigger set of possible
+file actions.
  .RS
-The second version of the trace file format was added in Fio version 1.17.
-It allows one to access more then one file per trace and has a bigger set of
-possible file actions.
-
+.P
  The first line of the trace file has to be:
-
-\fBfio version 2 iolog\fR
-
+.RS
+.P
+"fio version 2 iolog"
+.RE
+.P
  Following this can be lines in two different formats, which are described below.
+.P
+.B
  The file management format:
-
-\fBfilename action\fR
-
-The filename is given as an absolute path. The action can be one of these:
-
+.RS
+filename action
  .P
-.PD 0
+The `filename' is given as an absolute path. The `action' can be one of these:
  .RS
  .TP
  .B add
-Add the given filename to the trace
+Add the given `filename' to the trace.
  .TP
  .B open
-Open the file with the given filename. The filename has to have been previously
-added with the \fBadd\fR action.
+Open the file with the given `filename'. The `filename' has to have
+been added with the \fBadd\fR action before.
  .TP
  .B close
-Close the file with the given filename. The file must have previously been
-opened.
+Close the file with the given `filename'. The file has to have been
+\fBopen\fRed before.
+.RE
  .RE
-.PD
  .P
-
-The file io action format:
-
-\fBfilename action offset length\fR
-
-The filename is given as an absolute path, and has to have been added and opened
-before it can be used with this format. The offset and length are given in
-bytes. The action can be one of these:
-
+.B
+The file I/O action format:
+.RS
+filename action offset length
  .P
-.PD 0
+The `filename' is given as an absolute path, and has to have been \fBadd\fRed and
+\fBopen\fRed before it can be used with this format. The `offset' and `length' are
+given in bytes. The `action' can be one of these:
  .RS
  .TP
  .B wait
-Wait for 'offset' microseconds. Everything below 100 is discarded.  The time is
-relative to the previous wait statement.
+Wait for `offset' microseconds. Everything below 100 is discarded.
+The time is relative to the previous `wait' statement.
  .TP
  .B read
-Read \fBlength\fR bytes beginning from \fBoffset\fR
+Read `length' bytes beginning from `offset'.
  .TP
  .B write
-Write \fBlength\fR bytes beginning from \fBoffset\fR
+Write `length' bytes beginning from `offset'.
  .TP
  .B sync
-fsync() the file
+\fBfsync\fR\|(2) the file.
  .TP
  .B datasync
-fdatasync() the file
+\fBfdatasync\fR\|(2) the file.
  .TP
  .B trim
-trim the given file from the given \fBoffset\fR for \fBlength\fR bytes
+Trim the given file from the given `offset' for `length' bytes.
+.RE
  .RE
-.PD
-.P
-
  .SH CPU IDLENESS PROFILING
-In some cases, we want to understand CPU overhead in a test. For example,
-we test patches for the specific goodness of whether they reduce CPU usage.
-fio implements a balloon approach to create a thread per CPU that runs at
-idle priority, meaning that it only runs when nobody else needs the cpu.
-By measuring the amount of work completed by the thread, idleness of each
-CPU can be derived accordingly.
-
-An unit work is defined as touching a full page of unsigned characters. Mean
-and standard deviation of time to complete an unit work is reported in "unit
-work" section. Options can be chosen to report detailed percpu idleness or
-overall system idleness by aggregating percpu stats.
-
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches for the specific goodness of whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+.P
+A unit of work is defined as touching a full page of unsigned characters. The mean and
+standard deviation of the time to complete a unit of work are reported in the "unit work"
+section. Options can be chosen to report detailed percpu idleness or overall
+system idleness by aggregating percpu stats.
  .SH VERIFICATION AND TRIGGERS
-Fio is usually run in one of two ways, when data verification is done. The
-first is a normal write job of some sort with verify enabled. When the
-write phase has completed, fio switches to reads and verifies everything
-it wrote. The second model is running just the write phase, and then later
-on running the same job (but with reads instead of writes) to repeat the
-same IO patterns and verify the contents. Both of these methods depend
-on the write phase being completed, as fio otherwise has no idea how much
-data was written.
-
-With verification triggers, fio supports dumping the current write state
-to local files. Then a subsequent read verify workload can load this state
-and know exactly where to stop. This is useful for testing cases where
-power is cut to a server in a managed fashion, for instance.
-
+Fio is usually run in one of two ways, when data verification is done. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
+.P
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
+.P
  A verification trigger consists of two things:
-
  .RS
-Storing the write state of each job
-.LP
-Executing a trigger command
+.P
+1) Storing the write state of each job.
+.P
+2) Executing a trigger command.
  .RE
-
-The write state is relatively small, on the order of hundreds of bytes
-to single kilobytes. It contains information on the number of completions
-done, the last X completions, etc.
-
-A trigger is invoked either through creation (\fBtouch\fR) of a specified
-file in the system, or through a timeout setting. If fio is run with
-\fB\-\-trigger\-file=/tmp/trigger-file\fR, then it will continually check for
-the existence of /tmp/trigger-file. When it sees this file, it will
-fire off the trigger (thus saving state, and executing the trigger
+.P
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+.P
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+`\-\-trigger\-file=/tmp/trigger\-file', then it will continually
+check for the existence of `/tmp/trigger\-file'. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
  command).
-
-For client/server runs, there's both a local and remote trigger. If
-fio is running as a server backend, it will send the job states back
-to the client for safe storage, then execute the remote trigger, if
-specified. If a local trigger is specified, the server will still send
-back the write state, but the client will then execute the trigger.
-
+.P
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
  .RE
  .P
  .B Verification trigger example
  .RS
-
-Lets say we want to run a powercut test on the remote machine 'server'.
-Our write workload is in write-test.fio. We want to cut power to 'server'
-at some point during the run, and we'll run this test from the safety
-or our local machine, 'localbox'. On the server, we'll start the fio
-backend normally:
-
-server# \fBfio \-\-server\fR
-
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in `write\-test.fio'. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally:
+.RS
+.P
+server# fio \-\-server
+.RE
+.P
  and on the client, we'll fire off the workload:
-
-localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger-remote="bash \-c "echo b > /proc/sysrq-triger""\fR
-
-We set \fB/tmp/my-trigger\fR as the trigger file, and we tell fio to execute
-
-\fBecho b > /proc/sysrq-trigger\fR
-
-on the server once it has received the trigger and sent us the write
-state. This will work, but it's not \fIreally\fR cutting power to the server,
-it's merely abruptly rebooting it. If we have a remote way of cutting
-power to the server through IPMI or similar, we could do that through
-a local trigger command instead. Lets assume we have a script that does
-IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
-then have run fio with a local trigger instead:
-
-localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi-reboot server"\fR
-
-For this case, fio would wait for the server to send us the write state,
-then execute 'ipmi-reboot server' when that happened.
-
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq\-trigger""
+.RE
+.P
+We set `/tmp/my\-trigger' as the trigger file, and we tell fio to execute:
+.RS
+.P
+echo b > /proc/sysrq\-trigger
+.RE
+.P
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not really cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given hostname,
+ipmi\-reboot. On localbox, we could then have run fio with a local trigger
+instead:
+.RS
+.P
+localbox$ fio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi\-reboot server"
+.RE
+.P
+For this case, fio would wait for the server to send us the write state, then
+execute `ipmi\-reboot server' when that happened.
  .RE
  .P
  .B Loading verify state
  .RS
-To load store write state, read verification job file must contain
-the verify_state_load option. If that is set, fio will load the previously
+To load stored write state, a read verification job file must contain the
+\fBverify_state_load\fR option. If that is set, fio will load the previously
  stored state. For a local fio run this is done by loading the files directly,
-and on a client/server run, the server backend will ask the client to send
-the files over and load them from there.
-
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
  .RE
-
  .SH LOG FILE FORMATS
-
  Fio supports a variety of log file formats, for logging latencies, bandwidth,
  and IOPS. The logs share a common format, which looks like this:
-
-.B time (msec), value, data direction, offset
-
-Time for the log entry is always in milliseconds. The value logged depends
-on the type of log, it will be one of the following:
-
+.RS
  .P
-.PD 0
+time (msec), value, data direction, block size (bytes), offset (bytes)
+.RE
+.P
+`Time' for the log entry is always in milliseconds. The `value' logged depends
+on the type of log, it will be one of the following:
+.RS
  .TP
  .B Latency log
-Value is in latency in usecs
+Value is latency in nsecs
  .TP
  .B Bandwidth log
  Value is in KiB/sec
  .TP
  .B IOPS log
-Value is in IOPS
-.PD
-.P
-
-Data direction is one of the following:
-
+Value is IOPS
+.RE
  .P
-.PD 0
+`Data direction' is one of the following:
+.RS
  .TP
  .B 0
-IO is a READ
+I/O is a READ
  .TP
  .B 1
-IO is a WRITE
+I/O is a WRITE
  .TP
  .B 2
-IO is a TRIM
-.PD
-.P
-
-The \fIoffset\fR is the offset, in bytes, from the start of the file, for that
-particular IO. The logging of the offset can be toggled with \fBlog_offset\fR.
-
-If windowed logging is enabled through \fBlog_avg_msec\fR, then fio doesn't log
-individual IOs. Instead of logs the average values over the specified
-period of time. Since \fIdata direction\fR and \fIoffset\fR are per-IO values,
-they aren't applicable if windowed logging is enabled. If windowed logging
-is enabled and \fBlog_max_value\fR is set, then fio logs maximum values in
-that window instead of averages.
-
-For histogram logging the logs look like this:
-
-.B time (msec), data direction, block-size, bin 0, bin 1, ..., bin 1215
-
-Where 'bin i' gives the frequency of IO requests with a latency falling in
-the i-th bin. See \fBlog_hist_coarseness\fR for logging fewer bins.
-
+I/O is a TRIM
  .RE
-
+.P
+The entry's `block size' is always in bytes. The `offset' is the offset, in bytes,
+from the start of the file, for that particular I/O. The logging of the offset can be
+toggled with \fBlog_offset\fR.
+.P
+Fio defaults to logging every individual I/O. When IOPS are logged for individual
+I/Os the `value' entry will always be 1. If windowed logging is enabled through
+\fBlog_avg_msec\fR, fio logs the average values over the specified period of time.
+If windowed logging is enabled and \fBlog_max_value\fR is set, then fio logs
+maximum values in that window instead of averages. Since `data direction', `block size'
+and `offset' are per\-I/O values, if windowed logging is enabled they
+aren't applicable and will be 0.
  .SH CLIENT / SERVER
-Normally you would run fio as a stand-alone application on the machine
-where the IO workload should be generated. However, it is also possible to
-run the frontend and backend of fio separately. This makes it possible to
-have a fio server running on the machine(s) where the IO workload should
-be running, while controlling it from another machine.
-
-To start the server, you would do:
-
-\fBfio \-\-server=args\fR
-
-on that machine, where args defines what fio listens to. The arguments
-are of the form 'type:hostname or IP:port'. 'type' is either 'ip' (or ip4)
-for TCP/IP v4, 'ip6' for TCP/IP v6, or 'sock' for a local unix domain
-socket. 'hostname' is either a hostname or IP address, and 'port' is the port to
-listen to (only valid for TCP/IP, not a local socket). Some examples:
-
+Normally fio is invoked as a stand\-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
+.P
+Start the server on the machine which has access to the storage DUT:
+.RS
+.P
+$ fio \-\-server=args
+.RE
+.P
+where `args' defines what fio listens to. The arguments are of the form
+`type:hostname or IP,port'. `type' is either `ip' (or ip4) for TCP/IP
+v4, `ip6' for TCP/IP v6, or `sock' for a local unix domain socket.
+`hostname' is either a hostname or IP address, and `port' is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+.RS
+.TP
  1) \fBfio \-\-server\fR
-
-   Start a fio server, listening on all interfaces on the default port (8765).
-
+Start a fio server, listening on all interfaces on the default port (8765).
+.TP
  2) \fBfio \-\-server=ip:hostname,4444\fR
-
-   Start a fio server, listening on IP belonging to hostname and on port 4444.
-
+Start a fio server, listening on IP belonging to hostname and on port 4444.
+.TP
  3) \fBfio \-\-server=ip6:::1,4444\fR
-
-   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
-
+Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+.TP
  4) \fBfio \-\-server=,4444\fR
-
-   Start a fio server, listening on all interfaces on port 4444.
-
+Start a fio server, listening on all interfaces on port 4444.
+.TP
  5) \fBfio \-\-server=1.2.3.4\fR
-
-   Start a fio server, listening on IP 1.2.3.4 on the default port.
-
+Start a fio server, listening on IP 1.2.3.4 on the default port.
+.TP
  6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
-
-   Start a fio server, listening on the local socket /tmp/fio.sock.
-
-When a server is running, you can connect to it from a client. The client
-is run with:
-
-\fBfio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>\fR
-
-where \-\-local-args are arguments that are local to the client where it is
-running, 'server' is the connect string, and \-\-remote-args and <job file(s)>
-are sent to the server. The 'server' string follows the same format as it
-does on the server side, to allow IP/hostname/socket and port strings.
-You can connect to multiple clients as well, to do that you could run:
-
-\fBfio \-\-client=server2 \-\-client=server2 <job file(s)>\fR
-
-If the job file is located on the fio server, then you can tell the server
-to load a local file as well. This is done by using \-\-remote-config:
-
-\fBfio \-\-client=server \-\-remote-config /path/to/file.fio\fR
-
-Then fio will open this local (to the server) job file instead
-of being passed one from the client.
-
+Start a fio server, listening on the local socket `/tmp/fio.sock'.
+.RE
+.P
+Once a server is running, a "client" can connect to the fio server with:
+.RS
+.P
+$ fio <local\-args> \-\-client=<server> <remote\-args> <job file(s)>
+.RE
+.P
+where `local\-args' are arguments for the client where it is running, `server'
+is the connect string, and `remote\-args' and `job file(s)' are sent to the
+server. The `server' string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+.P
+Fio can connect to multiple servers this way:
+.RS
+.P
+$ fio \-\-client=<server1> <job file(s)> \-\-client=<server2> <job file(s)>
+.RE
+.P
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using \fB\-\-remote\-config\fR:
+.RS
+.P
+$ fio \-\-client=server \-\-remote\-config /path/to/file.fio
+.RE
+.P
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+.P
  If you have many servers (example: 100 VMs/containers), you can input a pathname
-of a file containing host IPs/names as the parameter value for the \-\-client option.
-For example, here is an example "host.list" file containing 2 hostnames:
-
+of a file containing host IPs/names as the parameter value for the
+\fB\-\-client\fR option. For example, here is a `host.list'
+file containing 2 hostnames:
+.RS
+.P
+.PD 0
  host1.your.dns.domain
-.br
+.P
  host2.your.dns.domain
-
+.PD
+.RE
+.P
  The fio command would then be:
-
-\fBfio \-\-client=host.list <job file>\fR
-
-In this mode, you cannot input server-specific parameters or job files, and all
+.RS
+.P
+$ fio \-\-client=host.list <job file(s)>
+.RE
+.P
+In this mode, you cannot input server\-specific parameters or job files \-\- all
  servers receive the same job file.
-
-In order to enable fio \-\-client runs utilizing a shared filesystem from multiple hosts,
-fio \-\-client now prepends the IP address of the server to the filename. For example,
-if fio is using directory /mnt/nfs/fio and is writing filename fileio.tmp,
-with a \-\-client hostfile
-containing two hostnames h1 and h2 with IP addresses 192.168.10.120 and 192.168.10.121, then
-fio will create two files:
-
+.P
+In order to let `fio \-\-client' runs use a shared filesystem from multiple
+hosts, `fio \-\-client' now prepends the IP address of the server to the
+filename. For example, if fio is using the directory `/mnt/nfs/fio' and is
+writing filename `fileio.tmp', with a \fB\-\-client\fR `hostfile'
+containing two hostnames `h1' and `h2' with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files:
+.RS
+.P
+.PD 0
  /mnt/nfs/fio/192.168.10.120.fileio.tmp
-.br
+.P
  /mnt/nfs/fio/192.168.10.121.fileio.tmp
-
+.PD
+.RE
  .SH AUTHORS
-
  .B fio
  was written by Jens Axboe <jens.axboe@oracle.com>,
  now Jens Axboe <axboe@fb.com>.
  .br
  This man page was written by Aaron Carroll <aaronc@cse.unsw.edu.au> based
  on documentation by Jens Axboe.
+.br
+This man page was rewritten by Tomohiro Kusumi <tkusumi@tuxera.com> based
+on documentation by Jens Axboe.
  .SH "REPORTING BUGS"
  Report bugs to the \fBfio\fR mailing list <fio@vger.kernel.org>.
-See \fBREADME\fR.
+.br
+See \fBREPORTING\-BUGS\fR.
+.P
+\fBREPORTING\-BUGS\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/REPORTING\-BUGS\fR
  .SH "SEE ALSO"
  For further documentation see \fBHOWTO\fR and \fBREADME\fR.
  .br
-Sample jobfiles are available in the \fBexamples\fR directory.
-.br
-These are typically located under /usr/share/doc/fio.
-
-\fBHOWTO\fR:  http://git.kernel.dk/?p=fio.git;a=blob_plain;f=HOWTO
+Sample jobfiles are available in the `examples/' directory.
  .br
-\fBREADME\fR: http://git.kernel.dk/?p=fio.git;a=blob_plain;f=README
+These are typically located under `/usr/share/doc/fio'.
+.P
+\fBHOWTO\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/HOWTO\fR
  .br
+\fBREADME\fR: \fIhttp://git.kernel.dk/cgit/fio/plain/README\fR
diff --git a/fio.h b/fio.h

index e11a03902676285635ae5d2e2dc2b60ed83eb401..8814d84eed133127b4213db17a073e5b855a8d67 100644 (file)
--- a/fio.h
+++ b/fio.h
@@ -87,10 +87,13 @@ enum {
         TD_F_CHILD              = 1U << 12,
         TD_F_NO_PROGRESS        = 1U << 13,
         TD_F_REGROW_LOGS        = 1U << 14,
+       TD_F_MMAP_KEEP          = 1U << 15,
  };
  
  enum {
         FIO_RAND_BS_OFF         = 0,
+       FIO_RAND_BS1_OFF,
+       FIO_RAND_BS2_OFF,
         FIO_RAND_VER_OFF,
         FIO_RAND_MIX_OFF,
         FIO_RAND_FILE_OFF,
@@ -149,7 +152,7 @@ struct thread_data {
         unsigned int thread_number;
         unsigned int subjob_number;
         unsigned int groupid;
-       struct thread_stat ts;
+       struct thread_stat ts __attribute__ ((aligned(8)));
  
         int client_type;
  
@@ -165,10 +168,10 @@ struct thread_data {
         struct thread_data *parent;
  
         uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
-       struct timeval bw_sample_time;
+       struct timespec bw_sample_time;
  
         uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
-       struct timeval iops_sample_time;
+       struct timespec iops_sample_time;
  
         volatile int update_rusage;
         struct fio_mutex *rusage_sem;
@@ -214,7 +217,7 @@ struct thread_data {
  
         unsigned long rand_seeds[FIO_RAND_NR_OFFS];
  
-       struct frand_state bsrange_state;
+       struct frand_state bsrange_state[DDIR_RWDIR_CNT];
         struct frand_state verify_state;
         struct frand_state trim_state;
         struct frand_state delay_state;
@@ -287,7 +290,7 @@ struct thread_data {
         unsigned long rate_bytes[DDIR_RWDIR_CNT];
         unsigned long rate_blocks[DDIR_RWDIR_CNT];
         unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT];
-       struct timeval lastrate[DDIR_RWDIR_CNT];
+       struct timespec lastrate[DDIR_RWDIR_CNT];
         int64_t last_usec[DDIR_RWDIR_CNT];
         struct frand_state poisson_state[DDIR_RWDIR_CNT];
  
@@ -323,21 +326,21 @@ struct thread_data {
          */
         struct frand_state random_state;
  
-       struct timeval start;   /* start of this loop */
-       struct timeval epoch;   /* time job was started */
+       struct timespec start;  /* start of this loop */
+       struct timespec epoch;  /* time job was started */
         unsigned long long unix_epoch; /* Time job was started, unix epoch based. */
-       struct timeval last_issue;
+       struct timespec last_issue;
         long time_offset;
-       struct timeval tv_cache;
-       struct timeval terminate_time;
-       unsigned int tv_cache_nr;
-       unsigned int tv_cache_mask;
-       unsigned int ramp_time_over;
+       struct timespec ts_cache;
+       struct timespec terminate_time;
+       unsigned int ts_cache_nr;
+       unsigned int ts_cache_mask;
+       bool ramp_time_over;
  
         /*
          * Time since last latency_window was started
          */
-       struct timeval latency_ts;
+       struct timespec latency_ts;
         unsigned int latency_qd;
         unsigned int latency_qd_high;
         unsigned int latency_qd_low;
@@ -640,17 +643,9 @@ extern void free_threads_shm(void);
   */
  extern void reset_all_stats(struct thread_data *);
  
-/*
- * blktrace support
- */
-#ifdef FIO_HAVE_BLKTRACE
-extern int is_blktrace(const char *, int *);
-extern int load_blktrace(struct thread_data *, const char *, int);
-#endif
-
  extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
                    enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
-                  struct timeval *comp_time);
+                  struct timespec *comp_time);
  
  /*
   * Latency target helpers
diff --git a/fio_time.h b/fio_time.h

index b49cc828713e4c67d53e362372760cd510510353..f4eac793f4fb3311dbd77069fc5a392477f33fef 100644 (file)
--- a/fio_time.h
+++ b/fio_time.h
@@ -4,22 +4,24 @@
  #include "lib/types.h"
  
  struct thread_data;
-extern uint64_t utime_since(const struct timeval *,const  struct timeval *);
-extern uint64_t utime_since_now(const struct timeval *);
-extern uint64_t mtime_since(const struct timeval *, const struct timeval *);
-extern uint64_t mtime_since_now(const struct timeval *);
-extern uint64_t time_since_now(const struct timeval *);
+extern uint64_t ntime_since(const struct timespec *, const struct timespec *);
+extern uint64_t utime_since(const struct timespec *, const struct timespec *);
+extern uint64_t utime_since_now(const struct timespec *);
+extern uint64_t mtime_since(const struct timespec *, const struct timespec *);
+extern uint64_t mtime_since_now(const struct timespec *);
+extern uint64_t mtime_since_tv(const struct timeval *, const struct timeval *);
+extern uint64_t time_since_now(const struct timespec *);
  extern uint64_t time_since_genesis(void);
  extern uint64_t mtime_since_genesis(void);
  extern uint64_t utime_since_genesis(void);
  extern uint64_t usec_spin(unsigned int);
  extern uint64_t usec_sleep(struct thread_data *, unsigned long);
-extern void fill_start_time(struct timeval *);
+extern void fill_start_time(struct timespec *);
  extern void set_genesis_time(void);
  extern bool ramp_time_over(struct thread_data *);
  extern bool in_ramp_time(struct thread_data *);
  extern void fio_time_init(void);
-extern void timeval_add_msec(struct timeval *, unsigned int);
+extern void timespec_add_msec(struct timespec *, unsigned int);
  extern void set_epoch_time(struct thread_data *, int);
  
  #endif
diff --git a/flist.h b/flist.h

index b4fe6e65f10f2c1dfb02943a0680c7c35e1cc0b6..2ca3d7771232b6d3031d3c73cff118ef2c5dbd50 100644 (file)
--- a/flist.h
+++ b/flist.h
@@ -2,13 +2,7 @@
  #define _LINUX_FLIST_H
  
  #include <stdlib.h>
-
-#undef offsetof
-#ifdef __compiler_offsetof
-#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
-#else
-#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
+#include <stddef.h>
  
  #define container_of(ptr, type, member) ({                     \
         const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
diff --git a/gclient.c b/gclient.c

index 928a1b7641da2976c64c9709aaa7314030410a4f..43c8a0891818ac0d28d36d770e507960eb29bd4a 100644 (file)
--- a/gclient.c
+++ b/gclient.c
@@ -930,8 +930,10 @@ static gint on_config_lat_drawing_area(GtkWidget *w, GdkEventConfigure *event,
  static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox,
                                       struct thread_stat *ts)
  {
-       double io_u_lat[FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
-       const char *ranges[] = { "2us", "4us", "10us", "20us", "50us", "100us",
+       double io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
+       const char *ranges[] = { "2ns", "4ns", "10ns", "20ns", "50ns", "100ns",
+                                "250ns", "500ns", "750ns", "1000ns", "2us",
+                                "4us", "10us", "20us", "50us", "100us",
                                  "250us", "500us", "750us", "1ms", "2ms",
                                  "4ms", "10ms", "20ms", "50ms", "100ms",
                                  "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" };
@@ -940,8 +942,9 @@ static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox,
         GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area;
         struct gui_entry *ge = gc->ge;
  
-       stat_calc_lat_u(ts, io_u_lat);
-       stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_U_NR]);
+       stat_calc_lat_n(ts, io_u_lat);
+       stat_calc_lat_u(ts, &io_u_lat[FIO_IO_U_LAT_N_NR]);
+       stat_calc_lat_m(ts, &io_u_lat[FIO_IO_U_LAT_N_NR + FIO_IO_U_LAT_U_NR]);
  
         /*
          * Found out which first bucket has entries, and which last bucket
@@ -983,16 +986,18 @@ static void gfio_show_latency_buckets(struct gfio_client *gc, GtkWidget *vbox,
         gtk_box_pack_start(GTK_BOX(hbox), tree_view, TRUE, TRUE, 3);
  }
  
-static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long min,
-                         unsigned long max, double mean, double dev)
+static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long long min,
+                         unsigned long long max, double mean, double dev)
  {
-       const char *base = "(usec)";
+       const char *base = "(nsec)";
         GtkWidget *hbox, *label, *frame;
         char *minp, *maxp;
         char tmp[64];
  
-       if (usec_to_msec(&min, &max, &mean, &dev))
+       if (nsec_to_msec(&min, &max, &mean, &dev))
                 base = "(msec)";
+       else if (nsec_to_usec(&min, &max, &mean, &dev))
+               base = "(usec)";
  
         minp = num2str(min, 6, 1, 0, N2S_NONE);
         maxp = num2str(max, 6, 1, 0, N2S_NONE);
@@ -1019,7 +1024,7 @@ static void gfio_show_lat(GtkWidget *vbox, const char *name, unsigned long min,
         free(maxp);
  }
  
-static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals,
+static GtkWidget *gfio_output_clat_percentiles(unsigned long long *ovals,
                                                fio_fp64_t *plist,
                                                unsigned int len,
                                                const char *base,
@@ -1030,10 +1035,10 @@ static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals,
         GtkTreeSelection *selection;
         GtkListStore *model;
         GtkTreeIter iter;
-       int i;
+       int i, j;
  
         for (i = 0; i < len; i++)
-               types[i] = G_TYPE_INT;
+               types[i] = G_TYPE_ULONG;
  
         model = gtk_list_store_newv(len, types);
  
@@ -1056,15 +1061,15 @@ static GtkWidget *gfio_output_clat_percentiles(unsigned int *ovals,
         gtk_list_store_append(model, &iter);
  
         for (i = 0; i < len; i++) {
-               if (scale)
+               for (j = 0; j < scale; j++)
                         ovals[i] = (ovals[i] + 999) / 1000;
-               gtk_list_store_set(model, &iter, i, ovals[i], -1);
+               gtk_list_store_set(model, &iter, i, (unsigned long) ovals[i], -1);
         }
  
         return tree_view;
  }
  
-static struct graph *setup_clat_graph(char *title, unsigned int *ovals,
+static struct graph *setup_clat_graph(char *title, unsigned long long *ovals,
                                       fio_fp64_t *plist,
                                       unsigned int len,
                                       double xdim, double ydim)
@@ -1096,7 +1101,8 @@ static void gfio_show_clat_percentiles(struct gfio_client *gc,
         unsigned int *io_u_plat = ts->io_u_plat[ddir];
         unsigned long nr = ts->clat_stat[ddir].samples;
         fio_fp64_t *plist = ts->percentile_list;
-       unsigned int *ovals, len, minv, maxv, scale_down;
+       unsigned int len, scale_down;
+       unsigned long long *ovals, minv, maxv;
         const char *base;
         GtkWidget *tree_view, *frame, *hbox, *drawing_area, *completion_vbox;
         struct gui_entry *ge = gc->ge;
@@ -1107,18 +1113,25 @@ static void gfio_show_clat_percentiles(struct gfio_client *gc,
                 goto out;
  
         /*
-        * We default to usecs, but if the value range is such that we
-        * should scale down to msecs, do that.
+        * We default to nsecs, but if the value range is such that we
+        * should scale down to usecs or msecs, do that.
          */
-       if (minv > 2000 && maxv > 99999) {
-               scale_down = 1;
+        if (minv > 2000000 && maxv > 99999999ULL) {
+                scale_down = 2;
                 base = "msec";
-       } else {
-               scale_down = 0;
+        } else if (minv > 2000 && maxv > 99999) {
+                scale_down = 1;
                 base = "usec";
-       }
+        } else {
+                scale_down = 0;
+               base = "nsec";
+        }
+
+       if (ts->clat_percentiles)
+               sprintf(tmp, "Completion percentiles (%s)", base);
+       else
+               sprintf(tmp, "Latency percentiles (%s)", base);
  
-       sprintf(tmp, "Completion percentiles (%s)", base);
         tree_view = gfio_output_clat_percentiles(ovals, plist, len, base, scale_down);
         ge->clat_graph = setup_clat_graph(tmp, ovals, plist, len, 700.0, 300.0);
  
@@ -1152,7 +1165,8 @@ static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
  {
         const char *ddir_label[3] = { "Read", "Write", "Trim" };
         GtkWidget *frame, *label, *box, *vbox, *main_vbox;
-       unsigned long min[3], max[3], runt;
+       unsigned long long min[3], max[3];
+       unsigned long runt;
         unsigned long long bw, iops;
         unsigned int flags = 0;
         double mean[3], dev[3];
diff --git a/gettime-thread.c b/gettime-thread.c

index 19541b474b75878999657475f88d46178ad7c831..cbb81dc83e9ff3c7f427e071161b92f016c1b9d1 100644 (file)
--- a/gettime-thread.c
+++ b/gettime-thread.c
@@ -6,30 +6,30 @@
  #include "fio.h"
  #include "smalloc.h"
  
-struct timeval *fio_tv = NULL;
+struct timespec *fio_ts = NULL;
  int fio_gtod_offload = 0;
  static pthread_t gtod_thread;
  static os_cpu_mask_t fio_gtod_cpumask;
  
  void fio_gtod_init(void)
  {
-       if (fio_tv)
+       if (fio_ts)
                 return;
  
-       fio_tv = smalloc(sizeof(struct timeval));
-       if (!fio_tv)
+       fio_ts = smalloc(sizeof(*fio_ts));
+       if (!fio_ts)
                 log_err("fio: smalloc pool exhausted\n");
  }
  
  static void fio_gtod_update(void)
  {
-       if (fio_tv) {
+       if (fio_ts) {
                 struct timeval __tv;
  
                 gettimeofday(&__tv, NULL);
-               fio_tv->tv_sec = __tv.tv_sec;
+               fio_ts->tv_sec = __tv.tv_sec;
                 write_barrier();
-               fio_tv->tv_usec = __tv.tv_usec;
+               fio_ts->tv_nsec = __tv.tv_usec * 1000;
                 write_barrier();
         }
  }
diff --git a/gettime.c b/gettime.c

index 628aad640810e4c01ef2892bd6c2158c0d5c84a2..3dcaaf680803fdcdb798de6009745f7821118d39 100644 (file)
--- a/gettime.c
+++ b/gettime.c
@@ -15,19 +15,22 @@
  
  #if defined(ARCH_HAVE_CPU_CLOCK)
  #ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC
-static unsigned long cycles_per_usec;
-static unsigned long inv_cycles_per_usec;
-static uint64_t max_cycles_for_mult;
+static unsigned long cycles_per_msec;
+static unsigned long long cycles_start;
+static unsigned long long clock_mult;
+static unsigned long long max_cycles_mask;
+static unsigned long long nsecs_for_max_cycles;
+static unsigned int clock_shift;
+static unsigned int max_cycles_shift;
+#define MAX_CLOCK_SEC 60*60
  #endif
  #ifdef ARCH_CPU_CLOCK_WRAPS
-static unsigned long long cycles_start, cycles_wrap;
+static unsigned int cycles_wrap;
  #endif
  #endif
-int tsc_reliable = 0;
+bool tsc_reliable = false;
  
  struct tv_valid {
-       uint64_t last_cycles;
-       int last_tv_valid;
         int warned;
  };
  #ifdef ARCH_HAVE_CPU_CLOCK
@@ -143,31 +146,31 @@ static int fill_clock_gettime(struct timespec *ts)
  }
  #endif
  
-static void __fio_gettime(struct timeval *tp)
+static void __fio_gettime(struct timespec *tp)
  {
         switch (fio_clock_source) {
  #ifdef CONFIG_GETTIMEOFDAY
-       case CS_GTOD:
-               gettimeofday(tp, NULL);
+       case CS_GTOD: {
+               struct timeval tv;
+               gettimeofday(&tv, NULL);
+
+               tp->tv_sec = tv.tv_sec;
+               tp->tv_nsec = tv.tv_usec * 1000;
                 break;
+               }
  #endif
  #ifdef CONFIG_CLOCK_GETTIME
         case CS_CGETTIME: {
-               struct timespec ts;
-
-               if (fill_clock_gettime(&ts) < 0) {
+               if (fill_clock_gettime(tp) < 0) {
                         log_err("fio: clock_gettime fails\n");
                         assert(0);
                 }
-
-               tp->tv_sec = ts.tv_sec;
-               tp->tv_usec = ts.tv_nsec / 1000;
                 break;
                 }
  #endif
  #ifdef ARCH_HAVE_CPU_CLOCK
         case CS_CPUCLOCK: {
-               uint64_t usecs, t;
+               uint64_t nsecs, t, multiples;
                 struct tv_valid *tv;
  
  #ifdef CONFIG_TLS_THREAD
@@ -184,21 +187,17 @@ static void __fio_gettime(struct timeval *tp)
                         log_err("fio: double CPU clock wrap\n");
                         tv->warned = 1;
                 }
-
-               t -= cycles_start;
  #endif
-               tv->last_cycles = t;
-               tv->last_tv_valid = 1;
  #ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC
-               usecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC;
+               nsecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC * 1000;
  #else
-               if (t < max_cycles_for_mult)
-                       usecs = (t * inv_cycles_per_usec) / 16777216UL;
-               else
-                       usecs = t / cycles_per_usec;
+               t -= cycles_start;
+               multiples = t >> max_cycles_shift;
+               nsecs = multiples * nsecs_for_max_cycles;
+               nsecs += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
  #endif
-               tp->tv_sec = usecs / 1000000;
-               tp->tv_usec = usecs % 1000000;
+               tp->tv_sec = nsecs / 1000000000ULL;
+               tp->tv_nsec = nsecs % 1000000000ULL;
                 break;
                 }
  #endif
@@ -209,9 +208,9 @@ static void __fio_gettime(struct timeval *tp)
  }
  
  #ifdef FIO_DEBUG_TIME
-void fio_gettime(struct timeval *tp, void *caller)
+void fio_gettime(struct timespec *tp, void *caller)
  #else
-void fio_gettime(struct timeval *tp, void fio_unused *caller)
+void fio_gettime(struct timespec *tp, void fio_unused *caller)
  #endif
  {
  #ifdef FIO_DEBUG_TIME
@@ -227,9 +226,9 @@ void fio_gettime(struct timeval *tp, void fio_unused *caller)
  }
  
  #if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC)
-static unsigned long get_cycles_per_usec(void)
+static unsigned long get_cycles_per_msec(void)
  {
-       struct timeval s, e;
+       struct timespec s, e;
         uint64_t c_s, c_e;
         enum fio_cs old_cs = fio_clock_source;
         uint64_t elapsed;
@@ -253,7 +252,7 @@ static unsigned long get_cycles_per_usec(void)
         } while (1);
  
         fio_clock_source = old_cs;
-       return (c_e - c_s) / elapsed;
+       return (c_e - c_s) * 1000 / elapsed;
  }
  
  #define NR_TIME_ITERS  50
@@ -262,12 +261,13 @@ static int calibrate_cpu_clock(void)
  {
         double delta, mean, S;
         uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS];
-       int i, samples;
+       int i, samples, sft = 0;
+       unsigned long long tmp, max_ticks, max_mult;
  
-       cycles[0] = get_cycles_per_usec();
+       cycles[0] = get_cycles_per_msec();
         S = delta = mean = 0.0;
         for (i = 0; i < NR_TIME_ITERS; i++) {
-               cycles[i] = get_cycles_per_usec();
+               cycles[i] = get_cycles_per_msec();
                 delta = cycles[i] - mean;
                 if (delta) {
                         mean += delta / (i + 1.0);
@@ -304,19 +304,67 @@ static int calibrate_cpu_clock(void)
                 dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]);
  
         avg /= samples;
+       cycles_per_msec = avg;
         dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg);
         dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f\n",
                         (unsigned long long) minc,
                         (unsigned long long) maxc, mean, S);
  
-       cycles_per_usec = avg;
-       inv_cycles_per_usec = 16777216UL / cycles_per_usec;
-       max_cycles_for_mult = ~0ULL / inv_cycles_per_usec;
-       dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec);
-#ifdef ARCH_CPU_CLOCK_WRAPS
+       max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL;
+       max_mult = ULLONG_MAX / max_ticks;
+       dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, "
+                       "max_mult=%llu\n", max_ticks,
+                       __builtin_clzll(max_ticks), max_mult);
+
+        /*
+         * Find the largest shift count that will produce
+         * a multiplier that does not exceed max_mult
+         */
+        tmp = max_mult * cycles_per_msec / 1000000;
+        while (tmp > 1) {
+                tmp >>= 1;
+                sft++;
+                dprint(FD_TIME, "tmp=%llu, sft=%u\n", tmp, sft);
+        }
+
+       clock_shift = sft;
+       clock_mult = (1ULL << sft) * 1000000 / cycles_per_msec;
+       dprint(FD_TIME, "clock_shift=%u, clock_mult=%llu\n", clock_shift,
+                                                       clock_mult);
+
+       /*
+        * Find the greatest power of 2 clock ticks that is less than the
+        * ticks in MAX_CLOCK_SEC
+        */
+       max_cycles_shift = max_cycles_mask = 0;
+       tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec;
+       dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp,
+                                                       max_cycles_shift);
+       while (tmp > 1) {
+               tmp >>= 1;
+               max_cycles_shift++;
+               dprint(FD_TIME, "tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+       }
+       /*
+        * if we use (1ULL << max_cycles_shift) * 1000 / cycles_per_msec
+        * here we will have a discontinuity every
+        * (1ULL << max_cycles_shift) cycles
+        */
+       nsecs_for_max_cycles = ((1ULL << max_cycles_shift) * clock_mult)
+                                       >> clock_shift;
+
+       /* Use a bitmask to calculate ticks % (1ULL << max_cycles_shift) */
+       for (tmp = 0; tmp < max_cycles_shift; tmp++)
+               max_cycles_mask |= 1ULL << tmp;
+
+       dprint(FD_TIME, "max_cycles_shift=%u, 2^max_cycles_shift=%llu, "
+                       "nsecs_for_max_cycles=%llu, "
+                       "max_cycles_mask=%016llx\n",
+                       max_cycles_shift, (1ULL << max_cycles_shift),
+                       nsecs_for_max_cycles, max_cycles_mask);
+
         cycles_start = get_cpu_clock();
         dprint(FD_TIME, "cycles_start=%llu\n", cycles_start);
-#endif
         return 0;
  }
  #else
@@ -365,7 +413,7 @@ void fio_clock_init(void)
         fio_clock_source_inited = fio_clock_source;
  
         if (calibrate_cpu_clock())
-               tsc_reliable = 0;
+               tsc_reliable = false;
  
         /*
          * If the arch sets tsc_reliable != 0, then it must be good enough
@@ -377,14 +425,35 @@ void fio_clock_init(void)
                         fio_clock_source = CS_CPUCLOCK;
         } else if (fio_clock_source == CS_CPUCLOCK)
                 log_info("fio: clocksource=cpu may not be reliable\n");
+       dprint(FD_TIME, "gettime: clocksource=%d\n", (int) fio_clock_source);
+}
+
+uint64_t ntime_since(const struct timespec *s, const struct timespec *e)
+{
+       int64_t sec, nsec;
+
+       sec = e->tv_sec - s->tv_sec;
+       nsec = e->tv_nsec - s->tv_nsec;
+       if (sec > 0 && nsec < 0) {
+              sec--;
+              nsec += 1000000000LL;
+       }
+
+       /*
+       * time warp bug on some kernels?
+       */
+       if (sec < 0 || (sec == 0 && nsec < 0))
+              return 0;
+
+       return nsec + (sec * 1000000000LL);
  }
  
-uint64_t utime_since(const struct timeval *s, const struct timeval *e)
+uint64_t utime_since(const struct timespec *s, const struct timespec *e)
  {
         int64_t sec, usec;
  
         sec = e->tv_sec - s->tv_sec;
-       usec = e->tv_usec - s->tv_usec;
+       usec = (e->tv_nsec - s->tv_nsec) / 1000;
         if (sec > 0 && usec < 0) {
                 sec--;
                 usec += 1000000;
@@ -399,9 +468,9 @@ uint64_t utime_since(const struct timeval *s, const struct timeval *e)
         return usec + (sec * 1000000);
  }
  
-uint64_t utime_since_now(const struct timeval *s)
+uint64_t utime_since_now(const struct timespec *s)
  {
-       struct timeval t;
+       struct timespec t;
  #ifdef FIO_DEBUG_TIME
         void *p = __builtin_return_address(0);
  
@@ -413,12 +482,12 @@ uint64_t utime_since_now(const struct timeval *s)
         return utime_since(s, &t);
  }
  
-uint64_t mtime_since(const struct timeval *s, const struct timeval *e)
+uint64_t mtime_since_tv(const struct timeval *s, const struct timeval *e)
  {
-       long sec, usec;
+       int64_t sec, usec;
  
         sec = e->tv_sec - s->tv_sec;
-       usec = e->tv_usec - s->tv_usec;
+       usec = (e->tv_usec - s->tv_usec);
         if (sec > 0 && usec < 0) {
                 sec--;
                 usec += 1000000;
@@ -432,9 +501,9 @@ uint64_t mtime_since(const struct timeval *s, const struct timeval *e)
         return sec + usec;
  }
  
-uint64_t mtime_since_now(const struct timeval *s)
+uint64_t mtime_since_now(const struct timespec *s)
  {
-       struct timeval t;
+       struct timespec t;
  #ifdef FIO_DEBUG_TIME
         void *p = __builtin_return_address(0);
  
@@ -446,7 +515,26 @@ uint64_t mtime_since_now(const struct timeval *s)
         return mtime_since(s, &t);
  }
  
-uint64_t time_since_now(const struct timeval *s)
+uint64_t mtime_since(const struct timespec *s, const struct timespec *e)
+{
+       int64_t sec, usec;
+
+       sec = e->tv_sec - s->tv_sec;
+       usec = (e->tv_nsec - s->tv_nsec) / 1000;
+       if (sec > 0 && usec < 0) {
+               sec--;
+               usec += 1000000;
+       }
+
+       if (sec < 0 || (sec == 0 && usec < 0))
+               return 0;
+
+       sec *= 1000;
+       usec /= 1000;
+       return sec + usec;
+}
+
+uint64_t time_since_now(const struct timespec *s)
  {
         return mtime_since_now(s) / 1000;
  }
@@ -455,7 +543,7 @@ uint64_t time_since_now(const struct timeval *s)
      defined(CONFIG_SFAA)
  
  #define CLOCK_ENTRIES_DEBUG    100000
-#define CLOCK_ENTRIES_TEST     10000
+#define CLOCK_ENTRIES_TEST     1000
  
  struct clock_entry {
         uint32_t seq;
diff --git a/gettime.h b/gettime.h

index 86d55bd9d6298b69de406a3d7209184ba786db0a..11e2a7b9c26c0216ee4b2124f6dff929cf50175d 100644 (file)
--- a/gettime.h
+++ b/gettime.h
@@ -13,27 +13,27 @@ enum fio_cs {
         CS_INVAL,
  };
  
-extern void fio_gettime(struct timeval *, void *);
+extern void fio_gettime(struct timespec *, void *);
  extern void fio_gtod_init(void);
  extern void fio_clock_init(void);
  extern int fio_start_gtod_thread(void);
  extern int fio_monotonic_clocktest(int debug);
  extern void fio_local_clock_init(int);
  
-extern struct timeval *fio_tv;
+extern struct timespec *fio_ts;
  
-static inline int fio_gettime_offload(struct timeval *tv)
+static inline int fio_gettime_offload(struct timespec *ts)
  {
         time_t last_sec;
  
-       if (!fio_tv)
+       if (!fio_ts)
                 return 0;
  
         do {
                 read_barrier();
-               last_sec = tv->tv_sec = fio_tv->tv_sec;
-               tv->tv_usec = fio_tv->tv_usec;
-       } while (fio_tv->tv_sec != last_sec);
+               last_sec = ts->tv_sec = fio_ts->tv_sec;
+               ts->tv_nsec = fio_ts->tv_nsec;
+       } while (fio_ts->tv_sec != last_sec);
  
         return 1;
  }
diff --git a/gfio.c b/gfio.c

index 7c92a5099f9f6418340283f45fa4fc85c2afb16d..7160c3a9f0ca7883b04265d334e66f82433c37d4 100644 (file)
--- a/gfio.c
+++ b/gfio.c
@@ -1243,7 +1243,7 @@ static void about_dialog(GtkWidget *w, gpointer data)
                 "website", "http://git.kernel.dk/cgit/fio/",
                 "authors", authors,
                 "version", fio_version_string,
-               "copyright", "© 2012 Jens Axboe <axboe@kernel.dk>",
+               "copyright", "© 2012-2017 Jens Axboe <axboe@kernel.dk>",
                 "logo-icon-name", "fio",
                 /* Must be last: */
                 "wrap-license", TRUE,
diff --git a/helper_thread.c b/helper_thread.c

index 47ec728cfa31d25f68742af81178bb95f3e5f928..9c6e0a2b52da53adc495c6a478c9e5509a25880c 100644 (file)
--- a/helper_thread.c
+++ b/helper_thread.c
@@ -71,45 +71,45 @@ static void *helper_thread_main(void *data)
  {
         struct helper_data *hd = data;
         unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC;
-       struct timeval tv, last_du, last_ss;
+       struct timeval tv;
+       struct timespec ts, last_du, last_ss;
         int ret = 0;
  
         sk_out_assign(hd->sk_out);
  
         gettimeofday(&tv, NULL);
-       memcpy(&last_du, &tv, sizeof(tv));
-       memcpy(&last_ss, &tv, sizeof(tv));
+       ts.tv_sec = tv.tv_sec;
+       ts.tv_nsec = tv.tv_usec * 1000;
+       memcpy(&last_du, &ts, sizeof(ts));
+       memcpy(&last_ss, &ts, sizeof(ts));
  
         fio_mutex_up(hd->startup_mutex);
  
         msec_to_next_event = DISK_UTIL_MSEC;
         while (!ret && !hd->exit) {
-               struct timespec ts;
-               struct timeval now;
                 uint64_t since_du, since_ss = 0;
  
-               timeval_add_msec(&tv, msec_to_next_event);
-               ts.tv_sec = tv.tv_sec;
-               ts.tv_nsec = tv.tv_usec * 1000;
+               timespec_add_msec(&ts, msec_to_next_event);
  
                 pthread_mutex_lock(&hd->lock);
                 pthread_cond_timedwait(&hd->cond, &hd->lock, &ts);
  
-               gettimeofday(&now, NULL);
+               gettimeofday(&tv, NULL);
+               ts.tv_sec = tv.tv_sec;
+               ts.tv_nsec = tv.tv_usec * 1000;
  
                 if (hd->reset) {
-                       memcpy(&tv, &now, sizeof(tv));
-                       memcpy(&last_du, &now, sizeof(last_du));
-                       memcpy(&last_ss, &now, sizeof(last_ss));
+                       memcpy(&last_du, &ts, sizeof(ts));
+                       memcpy(&last_ss, &ts, sizeof(ts));
                         hd->reset = 0;
                 }
  
                 pthread_mutex_unlock(&hd->lock);
  
-               since_du = mtime_since(&last_du, &now);
+               since_du = mtime_since(&last_du, &ts);
                 if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) {
                         ret = update_io_ticks();
-                       timeval_add_msec(&last_du, DISK_UTIL_MSEC);
+                       timespec_add_msec(&last_du, DISK_UTIL_MSEC);
                         msec_to_next_event = DISK_UTIL_MSEC;
                         if (since_du >= DISK_UTIL_MSEC)
                                 msec_to_next_event -= (since_du - DISK_UTIL_MSEC);
@@ -126,10 +126,10 @@ static void *helper_thread_main(void *data)
                         next_log = DISK_UTIL_MSEC;
  
                 if (steadystate_enabled) {
-                       since_ss = mtime_since(&last_ss, &now);
+                       since_ss = mtime_since(&last_ss, &ts);
                         if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) {
                                 steadystate_check();
-                               timeval_add_msec(&last_ss, since_ss);
+                               timespec_add_msec(&last_ss, since_ss);
                                 if (since_ss > STEADYSTATE_MSEC)
                                         next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC);
                                 else
diff --git a/idletime.c b/idletime.c

index 4c00d80df692590b6b8b62a57bd9af2fa528dcd2..90bc1d9eb09fb7a763dcf567e21062ad2320973d 100644 (file)
--- a/idletime.c
+++ b/idletime.c
@@ -11,7 +11,7 @@ static volatile struct idle_prof_common ipc;
  static double calibrate_unit(unsigned char *data)
  {
         unsigned long t, i, j, k;
-       struct timeval tps;
+       struct timespec tps;
         double tunit = 0.0;
  
         for (i = 0; i < CALIBRATE_RUNS; i++) {
@@ -183,7 +183,6 @@ static void calibration_stats(void)
  void fio_idle_prof_init(void)
  {
         int i, ret;
-       struct timeval tp;
         struct timespec ts;
         pthread_attr_t tattr;
         struct idle_prof_thread *ipt;
@@ -282,9 +281,8 @@ void fio_idle_prof_init(void)
                 pthread_mutex_lock(&ipt->init_lock);
                 while ((ipt->state != TD_EXITED) &&
                        (ipt->state!=TD_INITIALIZED)) {
-                       fio_gettime(&tp, NULL);
-                       ts.tv_sec = tp.tv_sec + 1;
-                       ts.tv_nsec = tp.tv_usec * 1000;
+                       fio_gettime(&ts, NULL);
+                       ts.tv_sec += 1;
                         pthread_cond_timedwait(&ipt->cond, &ipt->init_lock, &ts);
                 }
                 pthread_mutex_unlock(&ipt->init_lock);
@@ -325,7 +323,6 @@ void fio_idle_prof_stop(void)
  {
         int i;
         uint64_t runt;
-       struct timeval tp;
         struct timespec ts;
         struct idle_prof_thread *ipt;
  
@@ -343,9 +340,8 @@ void fio_idle_prof_stop(void)
                 pthread_mutex_lock(&ipt->start_lock);
                 while ((ipt->state != TD_EXITED) &&
                        (ipt->state!=TD_NOT_CREATED)) {
-                       fio_gettime(&tp, NULL);
-                       ts.tv_sec = tp.tv_sec + 1;
-                       ts.tv_nsec = tp.tv_usec * 1000;
+                       fio_gettime(&ts, NULL);
+                       ts.tv_sec += 1;
                         /* timed wait in case a signal is not received */
                         pthread_cond_timedwait(&ipt->cond, &ipt->start_lock, &ts);
                 }
diff --git a/idletime.h b/idletime.h

index 84c1fbbe79a8a293103ce19f1450c9f17847a743..b8376c2ce6098cd698d8933604fed7de2f35ff4d 100644 (file)
--- a/idletime.h
+++ b/idletime.h
@@ -26,8 +26,8 @@ struct idle_prof_thread {
         pthread_t thread;
         int cpu;
         int state;
-       struct timeval tps;
-       struct timeval tpe;
+       struct timespec tps;
+       struct timespec tpe;
         double cali_time; /* microseconds to finish a unit work */
         double loops;
         double idleness;
diff --git a/init.c b/init.c

index 52a5f0301d196fe11b378fce6e59e41800745126..e80aec300770036346cfcc72d56f53d3c9623c65 100644 (file)
--- a/init.c
+++ b/init.c
@@ -361,7 +361,7 @@ static int setup_thread_area(void)
  #endif
  
         memset(threads, 0, max_jobs * sizeof(struct thread_data));
-       fio_debug_jobp = (void *) threads + max_jobs * sizeof(struct thread_data);
+       fio_debug_jobp = (unsigned int *)(threads + max_jobs);
         *fio_debug_jobp = -1;
  
         flow_init();
@@ -698,6 +698,23 @@ static int fixup_options(struct thread_data *td)
         if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max)
                 o->iodepth_batch_complete_max = o->iodepth_batch_complete_min;
  
+       /*
+        * There's no need to check for in-flight overlapping IOs if the job
+        * isn't changing data or the maximum iodepth is guaranteed to be 1
+        */
+       if (o->serialize_overlap && !(td->flags & TD_F_READ_IOLOG) &&
+           (!(td_write(td) || td_trim(td)) || o->iodepth == 1))
+               o->serialize_overlap = 0;
+       /*
+        * Currently can't check for overlaps in offload mode
+        */
+       if (o->serialize_overlap && o->io_submit_mode == IO_MODE_OFFLOAD) {
+               log_err("fio: checking for in-flight overlaps when the "
+                       "io_submit_mode is offload is not supported\n");
+               o->serialize_overlap = 0;
+               ret = warnings_fatal;
+       }
+
         if (o->nr_files > td->files_index)
                 o->nr_files = td->files_index;
  
@@ -731,13 +748,30 @@ static int fixup_options(struct thread_data *td)
                 o->size = -1ULL;
  
         if (o->verify != VERIFY_NONE) {
-               if (td_write(td) && o->do_verify && o->numjobs > 1) {
-                       log_info("Multiple writers may overwrite blocks that "
-                               "belong to other jobs. This can cause "
+               if (td_write(td) && o->do_verify && o->numjobs > 1 &&
+                   (o->filename ||
+                    !(o->unique_filename &&
+                      strstr(o->filename_format, "$jobname") &&
+                      strstr(o->filename_format, "$jobnum") &&
+                      strstr(o->filename_format, "$filenum")))) {
+                       log_info("fio: multiple writers may overwrite blocks "
+                               "that belong to other jobs. This can cause "
                                 "verification failures.\n");
                         ret = warnings_fatal;
                 }
  
+               /*
+                * Warn if verification is requested but no verification of any
+                * kind can be started due to time constraints
+                */
+               if (td_write(td) && o->do_verify && o->timeout &&
+                   o->time_based && !td_read(td) && !o->verify_backlog) {
+                       log_info("fio: verification read phase will never "
+                                "start because write phase uses all of "
+                                "runtime\n");
+                       ret = warnings_fatal;
+               }
+
                 if (!fio_option_is_set(o, refill_buffers))
                         o->refill_buffers = 1;
  
@@ -781,6 +815,11 @@ static int fixup_options(struct thread_data *td)
                         o->unit_base = 8;
         }
  
+#ifndef FIO_HAVE_ANY_FALLOCATE
+       /* Platform doesn't support any fallocate so force it to none */
+       o->fallocate_mode = FIO_FALLOCATE_NONE;
+#endif
+
  #ifndef CONFIG_FDATASYNC
         if (o->fdatasync_blocks) {
                 log_info("fio: this platform does not support fdatasync()"
@@ -798,7 +837,7 @@ static int fixup_options(struct thread_data *td)
          * Windows doesn't support O_DIRECT or O_SYNC with the _open interface,
          * so fail if we're passed those flags
          */
-       if (td_ioengine_flagged(td, FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
+       if (td_ioengine_flagged(td, FIO_SYNCIO) && (o->odirect || o->sync_io)) {
                 log_err("fio: Windows does not support direct or non-buffered io with"
                                 " the synchronous ioengines. Use the 'windowsaio' ioengine"
                                 " with 'direct=1' and 'iodepth=1' instead.\n");
@@ -824,8 +863,8 @@ static int fixup_options(struct thread_data *td)
          * Using a non-uniform random distribution excludes usage of
          * a random map
          */
-       if (td->o.random_distribution != FIO_RAND_DIST_RANDOM)
-               td->o.norandommap = 1;
+       if (o->random_distribution != FIO_RAND_DIST_RANDOM)
+               o->norandommap = 1;
  
         /*
          * If size is set but less than the min block size, complain
@@ -839,16 +878,16 @@ static int fixup_options(struct thread_data *td)
         /*
          * O_ATOMIC implies O_DIRECT
          */
-       if (td->o.oatomic)
-               td->o.odirect = 1;
+       if (o->oatomic)
+               o->odirect = 1;
  
         /*
          * If randseed is set, that overrides randrepeat
          */
-       if (fio_option_is_set(&td->o, rand_seed))
-               td->o.rand_repeatable = 0;
+       if (fio_option_is_set(o, rand_seed))
+               o->rand_repeatable = 0;
  
-       if (td_ioengine_flagged(td, FIO_NOEXTEND) && td->o.file_append) {
+       if (td_ioengine_flagged(td, FIO_NOEXTEND) && o->file_append) {
                 log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name);
                 ret = 1;
         }
@@ -863,28 +902,28 @@ static int fixup_options(struct thread_data *td)
         if (!td->loops)
                 td->loops = 1;
  
-       if (td->o.block_error_hist && td->o.nr_files != 1) {
+       if (o->block_error_hist && o->nr_files != 1) {
                 log_err("fio: block error histogram only available "
                         "with a single file per job, but %d files "
-                       "provided\n", td->o.nr_files);
+                       "provided\n", o->nr_files);
                 ret = 1;
         }
  
-       return ret;
-}
-
-/* External engines are specified by "external:name.o") */
-static const char *get_engine_name(const char *str)
-{
-       char *p = strstr(str, ":");
-
-       if (!p)
-               return str;
+       if (fio_option_is_set(o, clat_percentiles) &&
+           !fio_option_is_set(o, lat_percentiles)) {
+               o->lat_percentiles = !o->clat_percentiles;
+       } else if (fio_option_is_set(o, lat_percentiles) &&
+                  !fio_option_is_set(o, clat_percentiles)) {
+               o->clat_percentiles = !o->lat_percentiles;
+       } else if (fio_option_is_set(o, lat_percentiles) &&
+                  fio_option_is_set(o, clat_percentiles) &&
+                  o->lat_percentiles && o->clat_percentiles) {
+               log_err("fio: lat_percentiles and clat_percentiles are "
+                       "mutually exclusive\n");
+               ret = 1;
+       }
  
-       p++;
-       strip_blank_front(&p);
-       strip_blank_end(p);
-       return p;
+       return ret;
  }
  
  static void init_rand_file_service(struct thread_data *td)
@@ -909,9 +948,9 @@ void td_fill_verify_state_seed(struct thread_data *td)
         bool use64;
  
         if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
-               use64 = 1;
+               use64 = true;
         else
-               use64 = 0;
+               use64 = false;
  
         init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF],
                 use64);
@@ -921,7 +960,22 @@ static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
  {
         int i;
  
-       init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF], use64);
+       /*
+        * trimwrite is special in that we need to generate the same
+        * offsets to get the "write after trim" effect. If we are
+        * using bssplit to set buffer length distributions, ensure that
+        * we seed the trim and write generators identically.
+        */
+       if (td_trimwrite(td)) {
+               init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64);
+               init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+               init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+       } else {
+               init_rand_seed(&td->bsrange_state[DDIR_READ], td->rand_seeds[FIO_RAND_BS_OFF], use64);
+               init_rand_seed(&td->bsrange_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_BS1_OFF], use64);
+               init_rand_seed(&td->bsrange_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_BS2_OFF], use64);
+       }
+
         td_fill_verify_state_seed(td);
         init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false);
  
@@ -967,9 +1021,9 @@ void td_fill_rand_seeds(struct thread_data *td)
         }
  
         if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
-               use64 = 1;
+               use64 = true;
         else
-               use64 = 0;
+               use64 = false;
  
         td_fill_rand_seeds_internal(td, use64);
  
@@ -983,22 +1037,27 @@ void td_fill_rand_seeds(struct thread_data *td)
   */
  int ioengine_load(struct thread_data *td)
  {
-       const char *engine;
-
-       /*
-        * Engine has already been loaded.
-        */
-       if (td->io_ops)
-               return 0;
         if (!td->o.ioengine) {
                 log_err("fio: internal fault, no IO engine specified\n");
                 return 1;
         }
  
-       engine = get_engine_name(td->o.ioengine);
-       td->io_ops = load_ioengine(td, engine);
+       if (td->io_ops) {
+               /* An engine is loaded, but the requested ioengine
+                * may have changed.
+                */
+               if (!strcmp(td->io_ops->name, td->o.ioengine)) {
+                       /* The right engine is already loaded */
+                       return 0;
+               }
+
+               /* Unload the old engine. */
+               free_ioengine(td);
+       }
+
+       td->io_ops = load_ioengine(td);
         if (!td->io_ops) {
-               log_err("fio: failed to load engine %s\n", engine);
+               log_err("fio: failed to load engine\n");
                 return 1;
         }
  
@@ -1080,8 +1139,12 @@ static int setup_random_seeds(struct thread_data *td)
         unsigned long seed;
         unsigned int i;
  
-       if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed))
-               return init_random_state(td, td->rand_seeds, sizeof(td->rand_seeds));
+       if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) {
+               int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds));
+               if (!ret)
+                       td_fill_rand_seeds(td);
+               return ret;
+       }
  
         seed = td->o.rand_seed;
         for (i = 0; i < 4; i++)
@@ -1352,6 +1415,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
         td->mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
  
         td->ts.clat_percentiles = o->clat_percentiles;
+       td->ts.lat_percentiles = o->lat_percentiles;
         td->ts.percentile_precision = o->percentile_precision;
         memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list));
  
@@ -1360,6 +1424,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
                 td->ts.slat_stat[i].min_val = ULONG_MAX;
                 td->ts.lat_stat[i].min_val = ULONG_MAX;
                 td->ts.bw_stat[i].min_val = ULONG_MAX;
+               td->ts.iops_stat[i].min_val = ULONG_MAX;
         }
         td->ddir_seq_nr = o->ddir_seq_nr;
  
@@ -1376,7 +1441,7 @@ static int add_job(struct thread_data *td, const char *jobname, int job_add_num,
         prev_group_jobs++;
  
         if (setup_random_seeds(td)) {
-               td_verror(td, errno, "init_random_state");
+               td_verror(td, errno, "setup_random_seeds");
                 goto err;
         }
  
@@ -2017,7 +2082,7 @@ static void usage(const char *name)
         printf("  --version\t\tPrint version info and exit\n");
         printf("  --help\t\tPrint this page\n");
         printf("  --cpuclock-test\tPerform test/validation of CPU clock\n");
-       printf("  --crctest=type\tTest speed of checksum functions\n");
+       printf("  --crctest=[type]\tTest speed of checksum functions\n");
         printf("  --cmdhelp=cmd\t\tPrint command help, \"all\" for all of"
                 " them\n");
         printf("  --enghelp=engine\tPrint ioengine help, or list"
@@ -2050,7 +2115,7 @@ static void usage(const char *name)
         printf("  --inflate-log=log\tInflate and output compressed log\n");
  #endif
         printf("  --trigger-file=file\tExecute trigger cmd when file exists\n");
-       printf("  --trigger-timeout=t\tExecute trigger af this time\n");
+       printf("  --trigger-timeout=t\tExecute trigger at this time\n");
         printf("  --trigger=cmd\t\tSet this command as local trigger\n");
         printf("  --trigger-remote=cmd\tSet this command as remote trigger\n");
         printf("  --aux-path=path\tUse this path for fio state generated files\n");
@@ -2399,8 +2464,7 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
                         break;
                 case 'V':
                         terse_version = atoi(optarg);
-                       if (!(terse_version == 2 || terse_version == 3 ||
-                            terse_version == 4)) {
+                       if (!(terse_version >= 2 && terse_version <= 5)) {
                                 log_err("fio: bad terse version format\n");
                                 exit_val = 1;
                                 do_exit++;
@@ -2511,7 +2575,6 @@ int parse_cmd_line(int argc, char *argv[], int client_type)
                         }
  
                         if (!ret && !strcmp(opt, "ioengine")) {
-                               free_ioengine(td);
                                 if (ioengine_load(td)) {
                                         put_job(td);
                                         td = NULL;
diff --git a/io_u.c b/io_u.c

index fd63119888327fff8705bbdef066b910d8f37d1e..58c23202bd3d20bb1eee1b4fbcf1731d9bf312ea 100644 (file)
--- a/io_u.c
+++ b/io_u.c
@@ -20,7 +20,7 @@ struct io_completion_data {
  
         int error;                      /* output */
         uint64_t bytes_done[DDIR_RWDIR_CNT];    /* output */
-       struct timeval time;            /* output */
+       struct timespec time;           /* output */
  };
  
  /*
@@ -37,7 +37,7 @@ static bool random_map_free(struct fio_file *f, const uint64_t block)
   */
  static void mark_random_map(struct thread_data *td, struct io_u *io_u)
  {
-       unsigned int min_bs = td->o.rw_min_bs;
+       unsigned int min_bs = td->o.min_bs[io_u->ddir];
         struct fio_file *f = io_u->file;
         unsigned int nr_blocks;
         uint64_t block;
@@ -552,9 +552,9 @@ static unsigned int __get_next_buflen(struct thread_data *td, struct io_u *io_u,
         if (!io_u_fits(td, io_u, minbs))
                 return 0;
  
-       frand_max = rand_max(&td->bsrange_state);
+       frand_max = rand_max(&td->bsrange_state[ddir]);
         do {
-               r = __rand(&td->bsrange_state);
+               r = __rand(&td->bsrange_state[ddir]);
  
                 if (!td->o.bssplit_nr[ddir]) {
                         buflen = 1 + (unsigned int) ((double) maxbs *
@@ -662,7 +662,7 @@ int io_u_quiesce(struct thread_data *td)
  static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
  {
         enum fio_ddir odir = ddir ^ 1;
-       long usec;
+       uint64_t usec;
         uint64_t now;
  
         assert(ddir_rw(ddir));
@@ -989,11 +989,52 @@ void io_u_mark_depth(struct thread_data *td, unsigned int nr)
         td->ts.io_u_map[idx] += nr;
  }
  
-static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
+static void io_u_mark_lat_nsec(struct thread_data *td, unsigned long long nsec)
  {
         int idx = 0;
  
-       assert(usec < 1000);
+       assert(nsec < 1000);
+
+       switch (nsec) {
+       case 750 ... 999:
+               idx = 9;
+               break;
+       case 500 ... 749:
+               idx = 8;
+               break;
+       case 250 ... 499:
+               idx = 7;
+               break;
+       case 100 ... 249:
+               idx = 6;
+               break;
+       case 50 ... 99:
+               idx = 5;
+               break;
+       case 20 ... 49:
+               idx = 4;
+               break;
+       case 10 ... 19:
+               idx = 3;
+               break;
+       case 4 ... 9:
+               idx = 2;
+               break;
+       case 2 ... 3:
+               idx = 1;
+       case 0 ... 1:
+               break;
+       }
+
+       assert(idx < FIO_IO_U_LAT_N_NR);
+       td->ts.io_u_lat_n[idx]++;
+}
+
+static void io_u_mark_lat_usec(struct thread_data *td, unsigned long long usec)
+{
+       int idx = 0;
+
+       assert(usec < 1000 && usec >= 1);
  
         switch (usec) {
         case 750 ... 999:
@@ -1030,10 +1071,12 @@ static void io_u_mark_lat_usec(struct thread_data *td, unsigned long usec)
         td->ts.io_u_lat_u[idx]++;
  }
  
-static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
+static void io_u_mark_lat_msec(struct thread_data *td, unsigned long long msec)
  {
         int idx = 0;
  
+       assert(msec >= 1);
+
         switch (msec) {
         default:
                 idx = 11;
@@ -1075,12 +1118,14 @@ static void io_u_mark_lat_msec(struct thread_data *td, unsigned long msec)
         td->ts.io_u_lat_m[idx]++;
  }
  
-static void io_u_mark_latency(struct thread_data *td, unsigned long usec)
+static void io_u_mark_latency(struct thread_data *td, unsigned long long nsec)
  {
-       if (usec < 1000)
-               io_u_mark_lat_usec(td, usec);
+       if (nsec < 1000)
+               io_u_mark_lat_nsec(td, nsec);
+       else if (nsec < 1000000)
+               io_u_mark_lat_usec(td, nsec / 1000);
         else
-               io_u_mark_lat_msec(td, usec / 1000);
+               io_u_mark_lat_msec(td, nsec / 1000000);
  }
  
  static unsigned int __get_next_fileno_rand(struct thread_data *td)
@@ -1557,7 +1602,7 @@ static void small_content_scramble(struct io_u *io_u)
         unsigned int i, nr_blocks = io_u->buflen / 512;
         uint64_t boffset;
         unsigned int offset;
-       void *p, *end;
+       char *p, *end;
  
         if (!nr_blocks)
                 return;
@@ -1572,7 +1617,7 @@ static void small_content_scramble(struct io_u *io_u)
                  * the buffer, given by the product of the usec time
                  * and the actual offset.
                  */
-               offset = (io_u->start_time.tv_usec ^ boffset) & 511;
+               offset = ((io_u->start_time.tv_nsec/1000) ^ boffset) & 511;
                 offset &= ~(sizeof(uint64_t) - 1);
                 if (offset >= 512 - sizeof(uint64_t))
                         offset -= sizeof(uint64_t);
@@ -1729,7 +1774,7 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
                                   const enum fio_ddir idx, unsigned int bytes)
  {
         const int no_reduce = !gtod_reduce(td);
-       unsigned long lusec = 0;
+       unsigned long long llnsec = 0;
  
         if (td->parent)
                 td = td->parent;
@@ -1738,37 +1783,37 @@ static void account_io_completion(struct thread_data *td, struct io_u *io_u,
                 return;
  
         if (no_reduce)
-               lusec = utime_since(&io_u->issue_time, &icd->time);
+               llnsec = ntime_since(&io_u->issue_time, &icd->time);
  
         if (!td->o.disable_lat) {
-               unsigned long tusec;
+               unsigned long long tnsec;
  
-               tusec = utime_since(&io_u->start_time, &icd->time);
-               add_lat_sample(td, idx, tusec, bytes, io_u->offset);
+               tnsec = ntime_since(&io_u->start_time, &icd->time);
+               add_lat_sample(td, idx, tnsec, bytes, io_u->offset);
  
                 if (td->flags & TD_F_PROFILE_OPS) {
                         struct prof_io_ops *ops = &td->prof_io_ops;
  
                         if (ops->io_u_lat)
-                               icd->error = ops->io_u_lat(td, tusec);
+                               icd->error = ops->io_u_lat(td, tnsec/1000);
                 }
  
-               if (td->o.max_latency && tusec > td->o.max_latency)
-                       lat_fatal(td, icd, tusec, td->o.max_latency);
-               if (td->o.latency_target && tusec > td->o.latency_target) {
+               if (td->o.max_latency && tnsec/1000 > td->o.max_latency)
+                       lat_fatal(td, icd, tnsec/1000, td->o.max_latency);
+               if (td->o.latency_target && tnsec/1000 > td->o.latency_target) {
                         if (lat_target_failed(td))
-                               lat_fatal(td, icd, tusec, td->o.latency_target);
+                               lat_fatal(td, icd, tnsec/1000, td->o.latency_target);
                 }
         }
  
         if (ddir_rw(idx)) {
                 if (!td->o.disable_clat) {
-                       add_clat_sample(td, idx, lusec, bytes, io_u->offset);
-                       io_u_mark_latency(td, lusec);
+                       add_clat_sample(td, idx, llnsec, bytes, io_u->offset);
+                       io_u_mark_latency(td, llnsec);
                 }
  
                 if (!td->o.disable_bw && per_unit_log(td->bw_log))
-                       add_bw_sample(td, io_u, bytes, lusec);
+                       add_bw_sample(td, io_u, bytes, llnsec);
  
                 if (no_reduce && per_unit_log(td->iops_log))
                         add_iops_sample(td, io_u, bytes);
@@ -2000,7 +2045,7 @@ void io_u_queued(struct thread_data *td, struct io_u *io_u)
         if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) {
                 unsigned long slat_time;
  
-               slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
+               slat_time = ntime_since(&io_u->start_time, &io_u->issue_time);
  
                 if (td->parent)
                         td = td->parent;
@@ -2143,7 +2188,7 @@ int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
         struct fio_file *f = io_u->file;
         int ret;
  
-       ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen);
+       ret = os_trim(f, io_u->offset, io_u->xfer_buflen);
         if (!ret)
                 return io_u->xfer_buflen;
  
diff --git a/io_u.h b/io_u.h

index 155344d4351aa4579a6201aa6a0e80e18af6ddc6..b228e2e84d7dffc1aef4baceef3f7fa6fc404d5b 100644 (file)
--- a/io_u.h
+++ b/io_u.h
@@ -31,8 +31,8 @@ enum {
   * The io unit
   */
  struct io_u {
-       struct timeval start_time;
-       struct timeval issue_time;
+       struct timespec start_time;
+       struct timespec issue_time;
  
         struct fio_file *file;
         unsigned int flags;
diff --git a/ioengines.c b/ioengines.c

index 04fd06ec6bb7f9200c699ac874883614c5fbf5d8..1bfc06f96d12bf15f07e8aded629e8fa9c7cc92d 100644 (file)
--- a/ioengines.c
+++ b/ioengines.c
@@ -123,13 +123,10 @@ static struct ioengine_ops *dlopen_ioengine(struct thread_data *td,
         return ops;
  }
  
-struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name)
+static struct ioengine_ops *__load_ioengine(const char *name)
  {
-       struct ioengine_ops *ops;
         char engine[64];
  
-       dprint(FD_IO, "load ioengine %s\n", name);
-
         engine[sizeof(engine) - 1] = '\0';
         strncpy(engine, name, sizeof(engine) - 1);
  
@@ -141,10 +138,37 @@ struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name)
                 strcpy(engine, "libaio");
         }
  
-       ops = find_ioengine(engine);
+       dprint(FD_IO, "load ioengine %s\n", engine);
+       return find_ioengine(engine);
+}
+
+struct ioengine_ops *load_ioengine(struct thread_data *td)
+{
+       struct ioengine_ops *ops = NULL;
+       const char *name;
+
+       /*
+        * Use ->ioengine_so_path if an external ioengine path is specified.
+        * In this case, ->ioengine is "external" which also means the prefix
+        * for external ioengines "external:" is properly used.
+        */
+       name = td->o.ioengine_so_path ?: td->o.ioengine;
+
+       /*
+        * Try to load ->ioengine first, and if failed try to dlopen(3) either
+        * ->ioengine or ->ioengine_so_path.  This is redundant for an external
+        * ioengine with prefix, and also leaves the possibility of unexpected
+        * behavior (e.g. if the "external" ioengine exists), but we do this
+        * so as not to break job files not using the prefix.
+        */
+       ops = __load_ioengine(td->o.ioengine);
         if (!ops)
                 ops = dlopen_ioengine(td, name);
  
+       /*
+        * If ops is NULL, we failed to load ->ioengine, and also failed to
+        * dlopen(3) either ->ioengine or ->ioengine_so_path as a path.
+        */
         if (!ops) {
                 log_err("fio: engine %s not loadable\n", name);
                 return NULL;
@@ -283,7 +307,7 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
                  */
                 if (td->o.read_iolog_file)
                         memcpy(&td->last_issue, &io_u->issue_time,
-                                       sizeof(struct timeval));
+                                       sizeof(io_u->issue_time));
         }
  
         if (ddir_rw(ddir)) {
@@ -320,8 +344,8 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
             td->o.odirect) {
  
                 log_info("fio: first direct IO errored. File system may not "
-                        "support direct IO, or iomem_align= is bad. Try "
-                        "setting direct=0.\n");
+                        "support direct IO, or iomem_align= is bad, or "
+                        "invalid block size. Try setting direct=0.\n");
         }
  
         if (!td->io_ops->commit || io_u->ddir == DDIR_TRIM) {
@@ -358,7 +382,7 @@ int td_io_queue(struct thread_data *td, struct io_u *io_u)
                  */
                 if (td->o.read_iolog_file)
                         memcpy(&td->last_issue, &io_u->issue_time,
-                                       sizeof(struct timeval));
+                                       sizeof(io_u->issue_time));
         }
  
         return ret;
@@ -475,39 +499,32 @@ int td_io_open_file(struct thread_data *td, struct fio_file *f)
                         goto err;
                 }
         }
-#ifdef FIO_HAVE_STREAMID
-       if (td->o.fadvise_stream &&
+#ifdef FIO_HAVE_WRITE_HINT
+       if (fio_option_is_set(&td->o, write_hint) &&
             (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
-               off_t stream = td->o.fadvise_stream;
-
-               if (posix_fadvise(f->fd, stream, f->io_size, POSIX_FADV_STREAMID) < 0) {
-                       td_verror(td, errno, "fadvise streamid");
-                       goto err;
-               }
-       }
-#endif
-
-#ifdef FIO_OS_DIRECTIO
-       /*
-        * Some OS's have a distinct call to mark the file non-buffered,
-        * instead of using O_DIRECT (Solaris)
-        */
-       if (td->o.odirect) {
-               int ret = fio_set_odirect(f->fd);
+               uint64_t hint = td->o.write_hint;
+               int cmd;
  
-               if (ret) {
-                       td_verror(td, ret, "fio_set_odirect");
-                       if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */
-                               log_err("fio: doing directIO to RAW devices or ZFS not supported\n");
-                       } else {
-                               log_err("fio: the file system does not seem to support direct IO\n");
-                       }
+               /*
+                * For direct IO, we just need/want to set the hint on
+                * the file descriptor. For buffered IO, we need to set
+                * it on the inode.
+                */
+               if (td->o.odirect)
+                       cmd = F_SET_FILE_RW_HINT;
+               else
+                       cmd = F_SET_RW_HINT;
  
+               if (fcntl(f->fd, cmd, &hint) < 0) {
+                       td_verror(td, errno, "fcntl write hint");
                         goto err;
                 }
         }
  #endif
  
+       if (td->o.odirect && !OS_O_DIRECT && fio_set_directio(td, f))
+               goto err;
+
  done:
         log_file(td, f, FIO_LOG_OPEN_FILE);
         return 0;
@@ -562,7 +579,6 @@ int td_io_get_file_size(struct thread_data *td, struct fio_file *f)
  int fio_show_ioengine_help(const char *engine)
  {
         struct flist_head *entry;
-       struct thread_data td;
         struct ioengine_ops *io_ops;
         char *sep;
         int ret = 1;
@@ -581,9 +597,7 @@ int fio_show_ioengine_help(const char *engine)
                 sep++;
         }
  
-       memset(&td, 0, sizeof(td));
-
-       io_ops = load_ioengine(&td, engine);
+       io_ops = __load_ioengine(engine);
         if (!io_ops) {
                 log_info("IO engine %s not found\n", engine);
                 return 1;
@@ -594,7 +608,5 @@ int fio_show_ioengine_help(const char *engine)
         else
                 log_info("IO engine %s has no options\n", io_ops->name);
  
-       free_ioengine(&td);
-
         return ret;
  }
diff --git a/ioengines.h b/ioengines.h

index f24f4df5389ba691e248cb9598fdad83dcdb4362..177cbc053c33ba8aedf7dbb092d15bdc92b58ccb 100644 (file)
--- a/ioengines.h
+++ b/ioengines.h
@@ -79,7 +79,7 @@ extern int td_io_close_file(struct thread_data *, struct fio_file *);
  extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
  extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
  
-extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *);
+extern struct ioengine_ops *load_ioengine(struct thread_data *);
  extern void register_ioengine(struct ioengine_ops *);
  extern void unregister_ioengine(struct ioengine_ops *);
  extern void free_ioengine(struct thread_data *);
diff --git a/iolog.c b/iolog.c

index 31d674c24dab7dc607bd7c43b120f524e7ae5ab9..760d7b0a43d97962bb7d065c6e464c7b89a7f5f5 100644 (file)
--- a/iolog.c
+++ b/iolog.c
@@ -19,6 +19,7 @@
  #include "trim.h"
  #include "filelock.h"
  #include "smalloc.h"
+#include "blktrace.h"
  
  static int iolog_flush(struct io_log *log);
  
@@ -64,7 +65,7 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
  {
         uint64_t usec = utime_since_now(&td->last_issue);
         uint64_t this_delay;
-       struct timeval tv;
+       struct timespec ts;
  
         if (delay < td->time_offset) {
                 td->time_offset = 0;
@@ -77,7 +78,7 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
  
         delay -= usec;
  
-       fio_gettime(&tv, NULL);
+       fio_gettime(&ts, NULL);
         while (delay && !td->terminate) {
                 this_delay = delay;
                 if (this_delay > 500000)
@@ -87,7 +88,7 @@ static void iolog_delay(struct thread_data *td, unsigned long delay)
                 delay -= this_delay;
         }
  
-       usec = utime_since_now(&tv);
+       usec = utime_since_now(&ts);
         if (usec > delay)
                 td->time_offset = usec - delay;
         else
@@ -226,21 +227,16 @@ void log_io_piece(struct thread_data *td, struct io_u *io_u)
         }
  
         /*
-        * We don't need to sort the entries, if:
+        * We don't need to sort the entries if we only performed sequential
+        * writes. In this case, just reading back data in the order we wrote
+        * it out is faster but still safe.
          *
-        *      Sequential writes, or
-        *      Random writes that lay out the file as it goes along
-        *
-        * For both these cases, just reading back data in the order we
-        * wrote it out is the fastest.
-        *
-        * One exception is if we don't have a random map AND we are doing
-        * verifies, in that case we need to check for duplicate blocks and
-        * drop the old one, which we rely on the rb insert/lookup for
-        * handling.
+        * One exception is if we don't have a random map in which case we need
+        * to check for duplicate blocks and drop the old one, which we rely on
+        * the rb insert/lookup for handling.
          */
-       if (((!td->o.verifysort) || !td_random(td) || !td->o.overwrite) &&
-             (file_randommap(td, ipo->file) || td->o.verify == VERIFY_NONE)) {
+       if (((!td->o.verifysort) || !td_random(td)) &&
+             file_randommap(td, ipo->file)) {
                 INIT_FLIST_HEAD(&ipo->list);
                 flist_add_tail(&ipo->list, &td->io_hist_list);
                 ipo->flags |= IP_F_ONLIST;
@@ -283,7 +279,8 @@ restart:
                         td->io_hist_len--;
                         rb_erase(parent, &td->io_hist_tree);
                         remove_trim_entry(td, __ipo);
-                       free(__ipo);
+                       if (!(__ipo->flags & IP_F_IN_FLIGHT))
+                               free(__ipo);
                         goto restart;
                 }
         }
@@ -642,6 +639,7 @@ void setup_log(struct io_log **log, struct log_params *p,
                 l->log_gz = 0;
         else if (l->log_gz || l->log_gz_store) {
                 mutex_init_pshared(&l->chunk_lock);
+               mutex_init_pshared(&l->deferred_free_lock);
                 p->td->flags |= TD_F_COMPRESS_LOG;
         }
  
@@ -1143,6 +1141,42 @@ size_t log_chunk_sizes(struct io_log *log)
  
  #ifdef CONFIG_ZLIB
  
+static bool warned_on_drop;
+
+static void iolog_put_deferred(struct io_log *log, void *ptr)
+{
+       if (!ptr)
+               return;
+
+       pthread_mutex_lock(&log->deferred_free_lock);
+       if (log->deferred < IOLOG_MAX_DEFER) {
+               log->deferred_items[log->deferred] = ptr;
+               log->deferred++;
+       } else if (!warned_on_drop) {
+               log_err("fio: had to drop log entry free\n");
+               warned_on_drop = true;
+       }
+       pthread_mutex_unlock(&log->deferred_free_lock);
+}
+
+static void iolog_free_deferred(struct io_log *log)
+{
+       int i;
+
+       if (!log->deferred)
+               return;
+
+       pthread_mutex_lock(&log->deferred_free_lock);
+
+       for (i = 0; i < log->deferred; i++) {
+               free(log->deferred_items[i]);
+               log->deferred_items[i] = NULL;
+       }
+
+       log->deferred = 0;
+       pthread_mutex_unlock(&log->deferred_free_lock);
+}
+
  static int gz_work(struct iolog_flush_data *data)
  {
         struct iolog_compress *c = NULL;
@@ -1235,7 +1269,7 @@ static int gz_work(struct iolog_flush_data *data)
         if (ret != Z_OK)
                 log_err("fio: deflateEnd %d\n", ret);
  
-       free(data->samples);
+       iolog_put_deferred(data->log, data->samples);
  
         if (!flist_empty(&list)) {
                 pthread_mutex_lock(&data->log->chunk_lock);
@@ -1246,7 +1280,7 @@ static int gz_work(struct iolog_flush_data *data)
         ret = 0;
  done:
         if (data->free)
-               free(data);
+               sfree(data);
         return ret;
  err:
         while (!flist_empty(&list)) {
@@ -1347,7 +1381,7 @@ int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
  {
         struct iolog_flush_data *data;
  
-       data = malloc(sizeof(*data));
+       data = smalloc(sizeof(*data));
         if (!data)
                 return 1;
  
@@ -1361,6 +1395,9 @@ int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
         cur_log->log = NULL;
  
         workqueue_enqueue(&log->td->log_compress_wq, &data->work);
+
+       iolog_free_deferred(log);
+
         return 0;
  }
  #else
diff --git a/iolog.h b/iolog.h

index 0733ad33c3c8b5872b3f3f897a299f9b41d1e606..bc3a0b5fea93770fa39309fb71526c751e803034 100644 (file)
--- a/iolog.h
+++ b/iolog.h
@@ -117,7 +117,7 @@ struct io_log {
          */
         struct io_stat avg_window[DDIR_RWDIR_CNT];
         unsigned long avg_msec;
-       unsigned long avg_last;
+       unsigned long avg_last[DDIR_RWDIR_CNT];
  
         /*
          * Windowed latency histograms, for keeping track of when we need to
@@ -131,6 +131,11 @@ struct io_log {
         pthread_mutex_t chunk_lock;
         unsigned int chunk_seq;
         struct flist_head chunk_list;
+
+       pthread_mutex_t deferred_free_lock;
+#define IOLOG_MAX_DEFER        8
+       void *deferred_items[IOLOG_MAX_DEFER];
+       unsigned int deferred;
  };
  
  /*
@@ -259,7 +264,7 @@ struct log_params {
  
  static inline bool per_unit_log(struct io_log *log)
  {
-       return log && !log->avg_msec;
+       return log && (!log->avg_msec || log->log_gz || log->log_gz_store);
  }
  
  static inline bool inline_log(struct io_log *log)
diff --git a/lib/axmap.c b/lib/axmap.c

index 2ee3a2563f818b04ec0edd0400fee7287d7800f9..bf203dfe7b2d0ba379ee09b06a6924c3bd00fd03 100644 (file)
--- a/lib/axmap.c
+++ b/lib/axmap.c
@@ -184,6 +184,9 @@ static bool axmap_clear_fn(struct axmap_level *al, unsigned long offset,
  void axmap_clear(struct axmap *axmap, uint64_t bit_nr)
  {
         axmap_handler(axmap, bit_nr, axmap_clear_fn, NULL);
+
+       if (bit_nr < axmap->first_free)
+               axmap->first_free = bit_nr;
  }
  
  struct axmap_set_data {
@@ -191,7 +194,7 @@ struct axmap_set_data {
         unsigned int set_bits;
  };
  
-static unsigned long bit_masks[] = {
+static const unsigned long bit_masks[] = {
         0x0000000000000000, 0x0000000000000001, 0x0000000000000003, 0x0000000000000007,
         0x000000000000000f, 0x000000000000001f, 0x000000000000003f, 0x000000000000007f,
         0x00000000000000ff, 0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff,
@@ -372,10 +375,9 @@ static uint64_t axmap_find_first_free(struct axmap *axmap, unsigned int level,
  
  static uint64_t axmap_first_free(struct axmap *axmap)
  {
-       if (firstfree_valid(axmap))
-               return axmap->first_free;
+       if (!firstfree_valid(axmap))
+               axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0);
  
-       axmap->first_free = axmap_find_first_free(axmap, axmap->nr_levels - 1, 0);
         return axmap->first_free;
  }
  
diff --git a/lib/ffz.h b/lib/ffz.h

index e2c1b8e94a39731fd2dbcfe6bebd8a19c69dd830..16c9ae9ebea8adaefbf11d6c51de26be71a9473b 100644 (file)
--- a/lib/ffz.h
+++ b/lib/ffz.h
@@ -27,10 +27,8 @@ static inline int ffs64(uint64_t word)
                 word >>= 2;
                 r += 2;
         }
-       if (!(word & 1)) {
-               word >>= 1;
+       if (!(word & 1))
                 r += 1;
-       }
  
         return r;
  }
diff --git a/lib/memalign.c b/lib/memalign.c

index 137cc8ec3ec4eef920b9524c9b6a374cba32660f..bfbd1e80c851da90410dfbf0565849ce725f3193 100644 (file)
--- a/lib/memalign.c
+++ b/lib/memalign.c
@@ -18,7 +18,7 @@ void *fio_memalign(size_t alignment, size_t size)
  
         assert(!(alignment & (alignment - 1)));
  
-       ptr = malloc(size + alignment + size + sizeof(*f) - 1);
+       ptr = malloc(size + alignment + sizeof(*f) - 1);
         if (ptr) {
                 ret = PTR_ALIGN(ptr, alignment - 1);
                 f = ret + size;
diff --git a/lib/output_buffer.c b/lib/output_buffer.c

index c1fdfc95f610aca10ece35ebdc2dac437c01af66..f6c304bdf02efdfa6c229bfefdc6043002b72893 100644 (file)
--- a/lib/output_buffer.c
+++ b/lib/output_buffer.c
@@ -3,7 +3,6 @@
  #include <stdlib.h>
  
  #include "output_buffer.h"
-#include "../log.h"
  #include "../minmax.h"
  
  #define BUF_INC        1024
@@ -18,6 +17,7 @@ void buf_output_init(struct buf_output *out)
  void buf_output_free(struct buf_output *out)
  {
         free(out->buf);
+       buf_output_init(out);
  }
  
  size_t buf_output_add(struct buf_output *out, const char *buf, size_t len)
@@ -40,16 +40,3 @@ size_t buf_output_add(struct buf_output *out, const char *buf, size_t len)
         out->buflen += len;
         return len;
  }
-
-size_t buf_output_flush(struct buf_output *out)
-{
-       size_t ret = 0;
-
-       if (out->buflen) {
-               ret = log_info_buf(out->buf, out->buflen);
-               memset(out->buf, 0, out->max_buflen);
-               out->buflen = 0;
-       }
-
-       return ret;
-}
diff --git a/lib/output_buffer.h b/lib/output_buffer.h

index 396002fbfa91474fd02bfb291acdd72fa712918f..a235af20739ecdc0a3c554a522f10946f8ba45ed 100644 (file)
--- a/lib/output_buffer.h
+++ b/lib/output_buffer.h
@@ -12,6 +12,5 @@ struct buf_output {
  void buf_output_init(struct buf_output *out);
  void buf_output_free(struct buf_output *out);
  size_t buf_output_add(struct buf_output *out, const char *buf, size_t len);
-size_t buf_output_flush(struct buf_output *out);
  
  #endif
diff --git a/lib/pattern.c b/lib/pattern.c

index 0aeb935266fa129d2995a4c49f94b0f0c76b1bc2..31ee4eaf9896246b487cc5cccd4c62fb42b724f1 100644 (file)
--- a/lib/pattern.c
+++ b/lib/pattern.c
@@ -4,11 +4,74 @@
  #include <limits.h>
  #include <errno.h>
  #include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
  
  #include "strntol.h"
  #include "pattern.h"
  #include "../minmax.h"
  #include "../oslib/strcasestr.h"
+#include "../oslib/strndup.h"
+
+/**
+ * parse_file() - reads a binary file to fill buffer
+ * @beg - string input, extract the single-quoted filename from this
+ * @out - output buffer where the file contents should be put
+ * @out_len - length of the output buffer
+ * @filled - pointer where number of bytes successfully
+ *           read from the file will be put
+ *
+ * Returns the end pointer where parsing has been stopped (just past the
+ * closing quote). NULL is returned on failure: empty output buffer,
+ * missing closing quote, out of memory, or a file open/read error.
+ */
+static const char *parse_file(const char *beg, char *out,
+                             unsigned int out_len,
+                             unsigned int *filled)
+{
+       const char *end;
+       char *file;
+       int fd;
+       ssize_t count;
+
+       if (!out_len)
+               goto err_out;
+
+       assert(*beg == '\'');
+       beg++;
+       end = strchr(beg, '\'');
+       if (!end)
+               goto err_out;
+
+       file = strndup(beg, end - beg);
+       if (file == NULL)
+               goto err_out;
+
+       fd = open(file, O_RDONLY);
+       if (fd < 0)
+               goto err_free_out;
+
+       count = read(fd, out, out_len);
+       if (count == -1)
+               goto err_free_close_out;
+
+       *filled = count;
+       close(fd);
+       free(file);
+
+       /* Skip past the closing quote */
+       return end + 1;
+
+err_free_close_out:
+       close(fd);
+err_free_out:
+       free(file);
+err_out:
+       return NULL;
+
+}
  
  /**
   * parse_string() - parses string in double quotes, like "abc"
@@ -271,6 +334,9 @@ int parse_and_fill_pattern(const char *in, unsigned int in_len,
                 parsed_fmt = 0;
  
                 switch (*beg) {
+               case '\'':
+                       end = parse_file(beg, out, out_len, &filled);
+                       break;
                 case '"':
                         end = parse_string(beg, out, out_len, &filled);
                         break;
diff --git a/lib/seqlock.h b/lib/seqlock.h

index 1ac1eb6bdea47cc5cd7647ca4610d1fc6ca0032f..762b6ec1d2dc7fa7ba5df9fe75d4e28da9a1c53b 100644 (file)
--- a/lib/seqlock.h
+++ b/lib/seqlock.h
@@ -1,6 +1,7 @@
  #ifndef FIO_SEQLOCK_H
  #define FIO_SEQLOCK_H
  
+#include "types.h"
  #include "../arch/arch.h"
  
  struct seqlock {
diff --git a/libfio.c b/libfio.c

index 83107084a4239c096298c3bddd702b7c8166cef0..14ddc4d03302758d95d08a45a2eb4d740798441b 100644 (file)
--- a/libfio.c
+++ b/libfio.c
@@ -144,10 +144,10 @@ void reset_all_stats(struct thread_data *td)
         }
  
         set_epoch_time(td, td->o.log_unix_epoch);
-       memcpy(&td->start, &td->epoch, sizeof(struct timeval));
-       memcpy(&td->iops_sample_time, &td->epoch, sizeof(struct timeval));
-       memcpy(&td->bw_sample_time, &td->epoch, sizeof(struct timeval));
-       memcpy(&td->ss.prev_time, &td->epoch, sizeof(struct timeval));
+       memcpy(&td->start, &td->epoch, sizeof(td->epoch));
+       memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
+       memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
+       memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
  
         lat_target_reset(td);
         clear_rusage_stat(td);
@@ -353,14 +353,17 @@ int initialize_fio(char *envp[])
          * can run into problems on archs that fault on unaligned fp
          * access (ARM).
          */
+       compiletime_assert((offsetof(struct thread_data, ts) % sizeof(void *)) == 0, "ts");
         compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list");
         compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time");
         compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count");
         compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile");
+       compiletime_assert((offsetof(struct thread_data, ts.clat_stat) % 8) == 0, "ts.clat_stat");
         compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta");
         compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h");
         compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list");
         compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile");
+       compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate");
  
         err = endian_check();
         if (err) {
diff --git a/log.c b/log.c

index 4eb4af5905c40a45c8df1bd12f4bf02a99ea70a0..95351d57d96ca4fd0ea97e43d38352f9b7b3dde1 100644 (file)
--- a/log.c
+++ b/log.c
@@ -6,8 +6,16 @@
  
  #include "fio.h"
  
+#define LOG_START_SZ           512
+
  size_t log_info_buf(const char *buf, size_t len)
  {
+       /*
+        * buf could be NULL (not just "").
+        */
+       if (!buf)
+               return 0;
+
         if (is_backend) {
                 size_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len);
                 if (ret != -1)
@@ -21,40 +29,66 @@ size_t log_info_buf(const char *buf, size_t len)
                 return fwrite(buf, len, 1, f_out);
  }
  
-size_t log_valist(const char *str, va_list args)
+static size_t valist_to_buf(char **buffer, const char *fmt, va_list src_args)
  {
-       char buffer[1024];
+       size_t len, cur = LOG_START_SZ;
+       va_list args;
+
+       do {
+               *buffer = calloc(1, cur);
+
+               va_copy(args, src_args);
+               len = vsnprintf(*buffer, cur, fmt, args);
+               va_end(args);
+
+               if (len < cur)
+                       break;
+
+               cur = len + 1;
+               free(*buffer);
+       } while (1);
+
+       return len;
+}
+
+size_t log_valist(const char *fmt, va_list args)
+{
+       char *buffer;
         size_t len;
  
-       len = vsnprintf(buffer, sizeof(buffer), str, args);
+       len = valist_to_buf(&buffer, fmt, args);
+       len = log_info_buf(buffer, len);
+       free(buffer);
  
-       return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
+       return len;
  }
  
  size_t log_info(const char *format, ...)
  {
-       char buffer[1024];
         va_list args;
-       size_t len;
+       size_t ret;
  
         va_start(args, format);
-       len = vsnprintf(buffer, sizeof(buffer), format, args);
+       ret = log_valist(format, args);
         va_end(args);
  
-       return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
+       return ret;
  }
  
  size_t __log_buf(struct buf_output *buf, const char *format, ...)
  {
-       char buffer[1024];
+       char *buffer;
         va_list args;
         size_t len;
  
         va_start(args, format);
-       len = vsnprintf(buffer, sizeof(buffer), format, args);
+       len = valist_to_buf(&buffer, format, args);
         va_end(args);
  
-       return buf_output_add(buf, buffer, min(len, sizeof(buffer) - 1));
+       len = buf_output_add(buf, buffer, len);
+       free(buffer);
+
+       return len;
  }
  
  int log_info_flush(void)
@@ -67,33 +101,33 @@ int log_info_flush(void)
  
  size_t log_err(const char *format, ...)
  {
-       char buffer[1024];
+       size_t ret, len;
+       char *buffer;
         va_list args;
-       size_t len;
  
         va_start(args, format);
-       len = vsnprintf(buffer, sizeof(buffer), format, args);
+       len = valist_to_buf(&buffer, format, args);
         va_end(args);
-       len = min(len, sizeof(buffer) - 1);
  
         if (is_backend) {
-               size_t ret = fio_server_text_output(FIO_LOG_ERR, buffer, len);
+               ret = fio_server_text_output(FIO_LOG_ERR, buffer, len);
                 if (ret != -1)
-                       return ret;
+                       goto done;
         }
  
         if (log_syslog) {
                 syslog(LOG_INFO, "%s", buffer);
-               return len;
+               ret = len;
         } else {
-               if (f_err != stderr) {
-                       int fio_unused ret;
-
+               if (f_err != stderr)
                         ret = fwrite(buffer, len, 1, stderr);
-               }
  
-               return fwrite(buffer, len, 1, f_err);
+               ret = fwrite(buffer, len, 1, f_err);
         }
+
+done:
+       free(buffer);
+       return ret;
  }
  
  const char *log_get_level(int level)
diff --git a/log.h b/log.h

index a39dea61893194e7a45740e653dc1e57a39181de..66546c446c397f19bb2d13fad09d292305220c1b 100644 (file)
--- a/log.h
+++ b/log.h
@@ -16,13 +16,15 @@ extern size_t log_valist(const char *str, va_list);
  extern size_t log_info_buf(const char *buf, size_t len);
  extern int log_info_flush(void);
  
-#define log_buf(buf, format, args...)          \
-do {                                           \
-       if ((buf) != NULL)                      \
-               __log_buf(buf, format, ##args); \
-       else                                    \
-               log_info(format, ##args);       \
-} while (0)
+#define log_buf(buf, format, args...)                  \
+({                                                     \
+       size_t __ret;                                   \
+       if ((buf) != NULL)                              \
+               __ret = __log_buf(buf, format, ##args); \
+       else                                            \
+               __ret = log_info(format, ##args);       \
+       __ret;                                          \
+})
  
  enum {
         FIO_LOG_DEBUG   = 1,
diff --git a/memory.c b/memory.c

index 22a7f5ddde6c1dc785d9a6f37ad4a16d53cb38ec..04dc3be8bfda1b312298591bff312d8d2b39912a 100644 (file)
--- a/memory.c
+++ b/memory.c
@@ -138,6 +138,9 @@ static int alloc_mem_mmap(struct thread_data *td, size_t total_mem)
         }
  
         if (td->o.mmapfile) {
+               if (access(td->o.mmapfile, F_OK) == 0)
+                       td->flags |= TD_F_MMAP_KEEP;
+
                 td->mmapfd = open(td->o.mmapfile, O_RDWR|O_CREAT, 0644);
  
                 if (td->mmapfd < 0) {
@@ -169,7 +172,7 @@ static int alloc_mem_mmap(struct thread_data *td, size_t total_mem)
                 td->orig_buffer = NULL;
                 if (td->mmapfd != 1 && td->mmapfd != -1) {
                         close(td->mmapfd);
-                       if (td->o.mmapfile)
+                       if (td->o.mmapfile && !(td->flags & TD_F_MMAP_KEEP))
                                 unlink(td->o.mmapfile);
                 }
  
@@ -187,7 +190,8 @@ static void free_mem_mmap(struct thread_data *td, size_t total_mem)
         if (td->o.mmapfile) {
                 if (td->mmapfd != -1)
                         close(td->mmapfd);
-               unlink(td->o.mmapfile);
+               if (!(td->flags & TD_F_MMAP_KEEP))
+                       unlink(td->o.mmapfile);
                 free(td->o.mmapfile);
         }
  }
diff --git a/mutex.c b/mutex.c

index d8c482519909ff334ac5929565908232f541ac24..9fab715bd7429e87b2690000d49a69bbaea3785a 100644 (file)
--- a/mutex.c
+++ b/mutex.c
@@ -141,11 +141,15 @@ struct fio_mutex *fio_mutex_init(int value)
         return NULL;
  }
  
-static bool mutex_timed_out(struct timeval *t, unsigned int msecs)
+static bool mutex_timed_out(struct timespec *t, unsigned int msecs)
  {
-       struct timeval now;
+       struct timeval tv;
+       struct timespec now;
+
+       gettimeofday(&tv, NULL);
+       now.tv_sec = tv.tv_sec;
+       now.tv_nsec = tv.tv_usec * 1000;
  
-       gettimeofday(&now, NULL);
         return mtime_since(t, &now) >= msecs;
  }
  
@@ -177,7 +181,7 @@ int fio_mutex_down_timeout(struct fio_mutex *mutex, unsigned int msecs)
                  * way too early, double check.
                  */
                 ret = pthread_cond_timedwait(&mutex->cond, &mutex->lock, &t);
-               if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, msecs))
+               if (ret == ETIMEDOUT && !mutex_timed_out(&t, msecs))
                         ret = 0;
         }
         mutex->waiters--;
diff --git a/options.c b/options.c

index b489e90210ae008be47b427195242a56272daa90..5c1abe91817dc7c3ba62fab0108fbac041f3c032 100644 (file)
--- a/options.c
+++ b/options.c
@@ -270,7 +270,8 @@ static int str2error(char *str)
         return 0;
  }
  
-static int ignore_error_type(struct thread_data *td, int etype, char *str)
+static int ignore_error_type(struct thread_data *td, enum error_type_bit etype,
+                               char *str)
  {
         unsigned int i;
         int *error;
@@ -282,7 +283,7 @@ static int ignore_error_type(struct thread_data *td, int etype, char *str)
         }
  
         td->o.ignore_error_nr[etype] = 4;
-       error = malloc(4 * sizeof(struct bssplit));
+       error = calloc(4, sizeof(int));
  
         i = 0;
         while ((fname = strsep(&str, ":")) != NULL) {
@@ -306,8 +307,9 @@ static int ignore_error_type(struct thread_data *td, int etype, char *str)
                                 error[i] = -error[i];
                 }
                 if (!error[i]) {
-                       log_err("Unknown error %s, please use number value \n",
+                       log_err("Unknown error %s, please use number value\n",
                                   fname);
+                       td->o.ignore_error_nr[etype] = 0;
                         free(error);
                         return 1;
                 }
@@ -317,8 +319,10 @@ static int ignore_error_type(struct thread_data *td, int etype, char *str)
                 td->o.continue_on_error |= 1 << etype;
                 td->o.ignore_error_nr[etype] = i;
                 td->o.ignore_error[etype] = error;
-       } else
+       } else {
+               td->o.ignore_error_nr[etype] = 0;
                 free(error);
+       }
  
         return 0;
  
@@ -328,7 +332,8 @@ static int str_ignore_error_cb(void *data, const char *input)
  {
         struct thread_data *td = cb_data_to_td(data);
         char *str, *p, *n;
-       int type = 0, ret = 1;
+       int ret = 1;
+       enum error_type_bit type = 0;
  
         if (parse_dryrun())
                 return 0;
@@ -1376,7 +1381,23 @@ static int str_gtod_reduce_cb(void *data, int *il)
         td->o.disable_bw = !!val;
         td->o.clat_percentiles = !val;
         if (val)
-               td->tv_cache_mask = 63;
+               td->ts_cache_mask = 63;
+
+       return 0;
+}
+
+static int str_offset_cb(void *data, unsigned long long *__val)
+{
+       struct thread_data *td = cb_data_to_td(data);
+       unsigned long long v = *__val;
+
+       if (parse_is_percent(v)) {
+               td->o.start_offset = 0;
+               td->o.start_offset_percent = -1ULL - v;
+               dprint(FD_PARSE, "SET start_offset_percent %d\n",
+                                       td->o.start_offset_percent);
+       } else
+               td->o.start_offset = v;
  
         return 0;
  }
@@ -1389,6 +1410,8 @@ static int str_size_cb(void *data, unsigned long long *__val)
         if (parse_is_percent(v)) {
                 td->o.size = 0;
                 td->o.size_percent = -1ULL - v;
+               dprint(FD_PARSE, "SET size_percent %d\n",
+                                       td->o.size_percent);
         } else
                 td->o.size = v;
  
@@ -1439,6 +1462,39 @@ static int str_write_hist_log_cb(void *data, const char *str)
         return 0;
  }
  
+/*
+ * str is supposed to be a substring of the strdup'd original string,
+ * and is valid only if it's a regular file path.
+ * This function keeps the pointer to the path as needed later.
+ *
+ * "external:/path/to/so\0" <- original pointer updated with strdup'd
+ * "external\0"             <- above pointer after parsed, i.e. ->ioengine
+ *          "/path/to/so\0" <- str argument, i.e. ->ioengine_so_path
+ */
+static int str_ioengine_external_cb(void *data, const char *str)
+{
+       struct thread_data *td = cb_data_to_td(data);
+       struct stat sb;
+       char *p;
+
+       if (!str) {
+               log_err("fio: null external ioengine path\n");
+               return 1;
+       }
+
+       p = (char *)str; /* str is mutable */
+       strip_blank_front(&p);
+       strip_blank_end(p);
+
+       if (stat(p, &sb) || !S_ISREG(sb.st_mode)) {
+               log_err("fio: invalid external ioengine path \"%s\"\n", p);
+               return 1;
+       }
+
+       td->o.ioengine_so_path = p;
+       return 0;
+}
+
  static int rw_verify(struct fio_option *o, void *data)
  {
         struct thread_data *td = cb_data_to_td(data);
@@ -1789,6 +1845,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
  #endif
                           { .ival = "external",
                             .help = "Load external engine (append name)",
+                           .cb = str_ioengine_external_cb,
                           },
                 },
         },
@@ -1858,6 +1915,17 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .category = FIO_OPT_C_IO,
                 .group  = FIO_OPT_G_IO_BASIC,
         },
+       {
+               .name   = "serialize_overlap",
+               .lname  = "Serialize overlap",
+               .off1   = offsetof(struct thread_options, serialize_overlap),
+               .type   = FIO_OPT_BOOL,
+               .help   = "Wait for in-flight IOs that collide to complete",
+               .parent = "iodepth",
+               .def    = "0",
+               .category = FIO_OPT_C_IO,
+               .group  = FIO_OPT_G_IO_BASIC,
+       },
         {
                 .name   = "io_submit_mode",
                 .lname  = "IO submit mode",
@@ -1938,6 +2006,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .lname  = "IO offset",
                 .alias  = "fileoffset",
                 .type   = FIO_OPT_STR_VAL,
+               .cb     = str_offset_cb,
                 .off1   = offsetof(struct thread_options, start_offset),
                 .help   = "Start IO from this offset",
                 .def    = "0",
@@ -2245,9 +2314,13 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                             .oval = FIO_FSERVICE_PARETO,
                             .help = "Pareto randomized",
                           },
+                         { .ival = "normal",
+                           .oval = FIO_FSERVICE_GAUSS,
+                           .help = "Normal (Gaussian) randomized",
+                         },
                           { .ival = "gauss",
                             .oval = FIO_FSERVICE_GAUSS,
-                           .help = "Normal (Gaussian) distribution",
+                           .help = "Alias for normal",
                           },
                           { .ival = "roundrobin",
                             .oval = FIO_FSERVICE_RR,
@@ -2261,14 +2334,14 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .parent = "nrfiles",
                 .hide   = 1,
         },
-#ifdef CONFIG_POSIX_FALLOCATE
+#ifdef FIO_HAVE_ANY_FALLOCATE
         {
                 .name   = "fallocate",
                 .lname  = "Fallocate",
                 .type   = FIO_OPT_STR,
                 .off1   = offsetof(struct thread_options, fallocate_mode),
                 .help   = "Whether pre-allocation is performed when laying out files",
-               .def    = "posix",
+               .def    = "native",
                 .category = FIO_OPT_C_FILE,
                 .group  = FIO_OPT_G_INVALID,
                 .posval = {
@@ -2276,10 +2349,16 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                             .oval = FIO_FALLOCATE_NONE,
                             .help = "Do not pre-allocate space",
                           },
+                         { .ival = "native",
+                           .oval = FIO_FALLOCATE_NATIVE,
+                           .help = "Use native pre-allocation if possible",
+                         },
+#ifdef CONFIG_POSIX_FALLOCATE
                           { .ival = "posix",
                             .oval = FIO_FALLOCATE_POSIX,
                             .help = "Use posix_fallocate()",
                           },
+#endif
  #ifdef CONFIG_LINUX_FALLOCATE
                           { .ival = "keep",
                             .oval = FIO_FALLOCATE_KEEP_SIZE,
@@ -2291,20 +2370,22 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                             .oval = FIO_FALLOCATE_NONE,
                             .help = "Alias for 'none'",
                           },
+#ifdef CONFIG_POSIX_FALLOCATE
                           { .ival = "1",
                             .oval = FIO_FALLOCATE_POSIX,
                             .help = "Alias for 'posix'",
                           },
+#endif
                 },
         },
-#else  /* CONFIG_POSIX_FALLOCATE */
+#else  /* FIO_HAVE_ANY_FALLOCATE */
         {
                 .name   = "fallocate",
                 .lname  = "Fallocate",
                 .type   = FIO_OPT_UNSUPPORTED,
                 .help   = "Your platform does not support fallocate",
         },
-#endif /* CONFIG_POSIX_FALLOCATE */
+#endif /* FIO_HAVE_ANY_FALLOCATE */
         {
                 .name   = "fadvise_hint",
                 .lname  = "Fadvise hint",
@@ -2333,24 +2414,6 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .category = FIO_OPT_C_FILE,
                 .group  = FIO_OPT_G_INVALID,
         },
-#ifdef FIO_HAVE_STREAMID
-       {
-               .name   = "fadvise_stream",
-               .lname  = "Fadvise stream",
-               .type   = FIO_OPT_INT,
-               .off1   = offsetof(struct thread_options, fadvise_stream),
-               .help   = "Use fadvise() to set stream ID",
-               .category = FIO_OPT_C_FILE,
-               .group  = FIO_OPT_G_INVALID,
-       },
-#else
-       {
-               .name   = "fadvise_stream",
-               .lname  = "Fadvise stream",
-               .type   = FIO_OPT_UNSUPPORTED,
-               .help   = "Your platform does not support fadvise stream ID",
-       },
-#endif
         {
                 .name   = "fsync",
                 .lname  = "Fsync",
@@ -3412,6 +3475,34 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .category = FIO_OPT_C_IO,
                 .group  = FIO_OPT_G_IO_TYPE,
         },
+#ifdef FIO_HAVE_WRITE_HINT
+       {
+               .name   = "write_hint",
+               .lname  = "Write hint",
+               .type   = FIO_OPT_STR,
+               .off1   = offsetof(struct thread_options, write_hint),
+               .help   = "Set expected write life time",
+               .category = FIO_OPT_C_ENGINE,
+               .group  = FIO_OPT_G_INVALID,
+               .posval = {
+                         { .ival = "none",
+                           .oval = RWH_WRITE_LIFE_NONE,
+                         },
+                         { .ival = "short",
+                           .oval = RWH_WRITE_LIFE_SHORT,
+                         },
+                         { .ival = "medium",
+                           .oval = RWH_WRITE_LIFE_MEDIUM,
+                         },
+                         { .ival = "long",
+                           .oval = RWH_WRITE_LIFE_LONG,
+                         },
+                         { .ival = "extreme",
+                           .oval = RWH_WRITE_LIFE_EXTREME,
+                         },
+               },
+       },
+#endif
         {
                 .name   = "create_serialize",
                 .lname  = "Create serialize",
@@ -3985,6 +4076,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .off1   = offsetof(struct thread_options, clat_percentiles),
                 .help   = "Enable the reporting of completion latency percentiles",
                 .def    = "1",
+               .inverse = "lat_percentiles",
+               .category = FIO_OPT_C_STAT,
+               .group  = FIO_OPT_G_INVALID,
+       },
+       {
+               .name   = "lat_percentiles",
+               .lname  = "IO latency percentiles",
+               .type   = FIO_OPT_BOOL,
+               .off1   = offsetof(struct thread_options, lat_percentiles),
+               .help   = "Enable the reporting of IO latency percentiles",
+               .def    = "0",
+               .inverse = "clat_percentiles",
                 .category = FIO_OPT_C_STAT,
                 .group  = FIO_OPT_G_INVALID,
         },
@@ -4332,17 +4435,6 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
                 .category = FIO_OPT_C_IO,
                 .group  = FIO_OPT_G_IO_FLOW,
         },
-       {
-               .name   = "skip_bad",
-               .lname  = "Skip operations against bad blocks",
-               .type   = FIO_OPT_BOOL,
-               .off1   = offsetof(struct thread_options, skip_bad),
-               .help   = "Skip operations against known bad blocks.",
-               .hide   = 1,
-               .def    = "0",
-               .category = FIO_OPT_C_IO,
-               .group  = FIO_OPT_G_MTD,
-       },
         {
                 .name   = "steadystate",
                 .lname  = "Steady state threshold",
diff --git a/os/os-android.h b/os/os-android.h

index 6c3e0985b22fc5896e21d42458f02ff4b2be444c..bb590e4786fe3f713ed7e1a1914963ecc9dc1276 100644 (file)
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -7,6 +7,7 @@
  #include <sys/mman.h>
  #include <sys/uio.h>
  #include <sys/syscall.h>
+#include <sys/sysmacros.h>
  #include <sys/vfs.h>
  #include <unistd.h>
  #include <fcntl.h>
@@ -32,6 +33,7 @@
  #define FIO_HAVE_HUGETLB
  #define FIO_HAVE_BLKTRACE
  #define FIO_HAVE_CL_SIZE
+#define FIO_HAVE_CGROUPS
  #define FIO_HAVE_FS_STAT
  #define FIO_HAVE_TRIM
  #define FIO_HAVE_GETTID
@@ -59,19 +61,17 @@
  
  #ifndef CONFIG_NO_SHM
  /*
- * The Android NDK doesn't currently export <sys/shm.h>, so define the
- * necessary stuff here.
+ * Bionic doesn't support SysV shared memory, so implement it using ashmem
   */
-
-#include <sys/shm.h>
-#define SHM_HUGETLB    04000
-
  #include <stdio.h>
  #include <linux/ashmem.h>
+#include <linux/shm.h>
+#define shmid_ds shmid64_ds
+#define SHM_HUGETLB    04000
  
  #define ASHMEM_DEVICE  "/dev/ashmem"
  
-static inline int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf)
+static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
  {
         int ret=0;
         if (__cmd == IPC_RMID)
@@ -84,7 +84,7 @@ static inline int shmctl (int __shmid, int __cmd, struct shmid_ds *__buf)
         return ret;
  }
  
-static inline int shmget (key_t __key, size_t __size, int __shmflg)
+static inline int shmget(key_t __key, size_t __size, int __shmflg)
  {
         int fd,ret;
         char keybuf[11];
@@ -98,7 +98,8 @@ static inline int shmget (key_t __key, size_t __size, int __shmflg)
         if (ret < 0)
                 goto error;
  
-       ret = ioctl(fd, ASHMEM_SET_SIZE, __size);
+       /* Stores size in first 8 bytes, allocate extra space */
+       ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t));
         if (ret < 0)
                 goto error;
  
@@ -109,21 +110,22 @@ error:
         return ret;
  }
  
-static inline void *shmat (int __shmid, const void *__shmaddr, int __shmflg)
+static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg)
  {
-       size_t *ptr, size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
-       ptr = mmap(NULL, size + sizeof(size_t), PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
-       *ptr = size;    //save size at beginning of buffer, for use with munmap
-       return &ptr[1];
+       size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
+       /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */
+       uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
+       /* Save size at beginning of buffer, for use with munmap */
+       *ptr = size;
+       return ptr + 1;
  }
  
  static inline int shmdt (const void *__shmaddr)
  {
-       size_t *ptr, size;
-       ptr = (size_t *)__shmaddr;
-       ptr--;
-       size = *ptr;    //find mmap size which we stored at the beginning of the buffer
-       return munmap((void *)ptr, size + sizeof(size_t));
+       /* Find mmap size which we stored at the beginning of the buffer */
+       uint64_t *ptr = (uint64_t *)__shmaddr - 1;
+       size_t size = *ptr;
+       return munmap(ptr, size);
  }
  #endif
  
@@ -272,7 +274,7 @@ static inline unsigned long long get_fs_free_size(const char *path)
         return ret;
  }
  
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
                           unsigned long long len)
  {
         uint64_t range[2];
@@ -280,7 +282,7 @@ static inline int os_trim(int fd, unsigned long long start,
         range[0] = start;
         range[1] = len;
  
-       if (!ioctl(fd, BLKDISCARD, range))
+       if (!ioctl(f->fd, BLKDISCARD, range))
                 return 0;
  
         return errno;
diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h

index 8a116e60b5deefa43e189496fb5accdb9d4d69df..423b2369a071bd2de9096178ee2d3c118decabb2 100644 (file)
--- a/os/os-dragonfly.h
+++ b/os/os-dragonfly.h
@@ -5,6 +5,7 @@
  
  #include <errno.h>
  #include <unistd.h>
+#include <sys/endian.h>
  #include <sys/param.h>
  #include <sys/sysctl.h>
  #include <sys/statvfs.h>
@@ -215,7 +216,7 @@ static inline unsigned long long get_fs_free_size(const char *path)
         return ret;
  }
  
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
                           unsigned long long len)
  {
         off_t range[2];
@@ -223,7 +224,7 @@ static inline int os_trim(int fd, unsigned long long start,
         range[0] = start;
         range[1] = len;
  
-       if (!ioctl(fd, IOCTLTRIM, range))
+       if (!ioctl(f->fd, IOCTLTRIM, range))
                 return 0;
  
         return errno;
diff --git a/os/os-freebsd.h b/os/os-freebsd.h

index c7863b5e02eb5b978f266cc5d9329827c1819aca..4a7cdeb7daaf2cef234c5a0c0ec551c7e72e4dad 100644 (file)
--- a/os/os-freebsd.h
+++ b/os/os-freebsd.h
@@ -6,6 +6,7 @@
  #include <errno.h>
  #include <sys/sysctl.h>
  #include <sys/disk.h>
+#include <sys/endian.h>
  #include <sys/thr.h>
  #include <sys/socket.h>
  #include <sys/param.h>
@@ -116,7 +117,7 @@ static inline unsigned long long get_fs_free_size(const char *path)
         return ret;
  }
  
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
                           unsigned long long len)
  {
         off_t range[2];
@@ -124,7 +125,7 @@ static inline int os_trim(int fd, unsigned long long start,
         range[0] = start;
         range[1] = len;
  
-       if (!ioctl(fd, DIOCGDELETE, range))
+       if (!ioctl(f->fd, DIOCGDELETE, range))
                 return 0;
  
         return errno;
diff --git a/os/os-linux.h b/os/os-linux.h

index 911f7e7c8710719d5c6d2d9b094efb096077f0c5..1ad6ebd28497e3257e52c58353881357b0c90f8e 100644 (file)
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -16,6 +16,8 @@
  #include <linux/unistd.h>
  #include <linux/raw.h>
  #include <linux/major.h>
+#include <linux/fs.h>
+#include <scsi/sg.h>
  
  #include "./os-linux-syscall.h"
  #include "binject.h"
@@ -258,6 +260,14 @@ static inline int arch_cache_line_size(void)
                 return atoi(size);
  }
  
+#ifdef __powerpc64__
+#define FIO_HAVE_CPU_ONLINE_SYSCONF
+static inline unsigned int cpus_online(void)
+{
+        return sysconf(_SC_NPROCESSORS_CONF);
+}
+#endif
+
  static inline unsigned long long get_fs_free_size(const char *path)
  {
         unsigned long long ret;
@@ -271,7 +281,7 @@ static inline unsigned long long get_fs_free_size(const char *path)
         return ret;
  }
  
-static inline int os_trim(int fd, unsigned long long start,
+static inline int os_trim(struct fio_file *f, unsigned long long start,
                           unsigned long long len)
  {
         uint64_t range[2];
@@ -279,7 +289,7 @@ static inline int os_trim(int fd, unsigned long long start,
         range[0] = start;
         range[1] = len;
  
-       if (!ioctl(fd, BLKDISCARD, range))
+       if (!ioctl(f->fd, BLKDISCARD, range))
                 return 0;
  
         return errno;
@@ -293,11 +303,26 @@ static inline int fio_set_sched_idle(void)
  }
  #endif
  
-#ifndef POSIX_FADV_STREAMID
-#define POSIX_FADV_STREAMID    8
+#ifndef F_GET_RW_HINT
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE  1024
+#endif
+#define F_GET_RW_HINT          (F_LINUX_SPECIFIC_BASE + 11)
+#define F_SET_RW_HINT          (F_LINUX_SPECIFIC_BASE + 12)
+#define F_GET_FILE_RW_HINT     (F_LINUX_SPECIFIC_BASE + 13)
+#define F_SET_FILE_RW_HINT     (F_LINUX_SPECIFIC_BASE + 14)
  #endif
  
-#define FIO_HAVE_STREAMID
+#ifndef RWH_WRITE_LIFE_NONE
+#define RWH_WRITE_LIFE_NOT_SET 0
+#define RWH_WRITE_LIFE_NONE    1
+#define RWH_WRITE_LIFE_SHORT   2
+#define RWH_WRITE_LIFE_MEDIUM  3
+#define RWH_WRITE_LIFE_LONG    4
+#define RWH_WRITE_LIFE_EXTREME 5
+#endif
+
+#define FIO_HAVE_WRITE_HINT
  
  #ifndef RWF_HIPRI
  #define RWF_HIPRI      0x00000001
@@ -309,14 +334,26 @@ static inline int fio_set_sched_idle(void)
  #define RWF_SYNC       0x00000004
  #endif
  
+#ifndef RWF_WRITE_LIFE_SHIFT
+#define RWF_WRITE_LIFE_SHIFT           4
+#define RWF_WRITE_LIFE_SHORT           (1 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_MEDIUM          (2 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_LONG            (3 << RWF_WRITE_LIFE_SHIFT)
+#define RWF_WRITE_LIFE_EXTREME         (4 << RWF_WRITE_LIFE_SHIFT)
+#endif
+
  #ifndef CONFIG_PWRITEV2
  #ifdef __NR_preadv2
  static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l,
                                 off_t offset)
  {
+#if BITS_PER_LONG == 64
+       *pos_l = offset;
+       *pos_h = 0;
+#else
         *pos_l = offset & 0xffffffff;
         *pos_h = ((uint64_t) offset) >> 32;
-
+#endif
  }
  static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
                               off_t offset, unsigned int flags)
@@ -355,4 +392,22 @@ static inline int shm_attach_to_open_removed(void)
         return 1;
  }
  
+#ifdef CONFIG_LINUX_FALLOCATE
+#define FIO_HAVE_NATIVE_FALLOCATE
+/* Natively preallocate len bytes at offset; returns false on failure */
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset,
+                                uint64_t len)
+{
+       int ret = fallocate(f->fd, 0, offset, len);
+
+       if (ret == 0)
+               return true;
+
+       /* Work around buggy old glibc versions that return the error code */
+       if (ret > 0)
+               errno = ret;
+       return false;
+}
+#endif
+
  #endif
diff --git a/os/os-mac.h b/os/os-mac.h

index 7de36ea79aa7b43a7f962796526cf000a2458527..92a60ee98766f0fc1526cd440e38a687e6cc02ae 100644 (file)
--- a/os/os-mac.h
+++ b/os/os-mac.h
@@ -20,6 +20,7 @@
  #define FIO_USE_GENERIC_INIT_RANDOM_STATE
  #define FIO_HAVE_GETTID
  #define FIO_HAVE_CHARDEV_SIZE
+#define FIO_HAVE_NATIVE_FALLOCATE
  
  #define OS_MAP_ANON            MAP_ANON
  
@@ -40,9 +41,9 @@ typedef unsigned int clockid_t;
  #endif
  
  #define FIO_OS_DIRECTIO
-static inline int fio_set_odirect(int fd)
+static inline int fio_set_odirect(struct fio_file *f)
  {
-       if (fcntl(fd, F_NOCACHE, 1) == -1)
+       if (fcntl(f->fd, F_NOCACHE, 1) == -1)
                 return errno;
         return 0;
  }
@@ -101,4 +102,15 @@ static inline int gettid(void)
   */
  extern int fdatasync(int fd);
  
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len)
+{
+       fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, offset, len};
+       if (fcntl(f->fd, F_PREALLOCATE, &store) != -1) {
+               if (ftruncate(f->fd, len) == 0)
+                       return true;
+       }
+
+       return false;
+}
+
  #endif
diff --git a/os/os-netbsd.h b/os/os-netbsd.h

index 7be02a789e632404a0092248667a27351844b049..682a11c95f1ccb91df421e424923dbd2c9703761 100644 (file)
--- a/os/os-netbsd.h
+++ b/os/os-netbsd.h
@@ -10,9 +10,10 @@
  #include <sys/ioctl.h>
  #include <sys/dkio.h>
  #include <sys/disklabel.h>
-/* XXX hack to avoid confilcts between rbtree.h and <sys/rb.h> */
-#define        rb_node _rb_node
+#include <sys/endian.h>
  #include <sys/sysctl.h>
+
+/* XXX hack to avoid conflicts between rbtree.h and <sys/rbtree.h> */
  #undef rb_node
  #undef rb_left
  #undef rb_right
@@ -25,8 +26,6 @@
  #define FIO_HAVE_FS_STAT
  #define FIO_HAVE_GETTID
  
-#undef FIO_HAVE_CPU_AFFINITY   /* doesn't exist */
-
  #define OS_MAP_ANON            MAP_ANON
  
  #ifndef PTHREAD_STACK_MIN
diff --git a/os/os-openbsd.h b/os/os-openbsd.h

index d874ee2539e7d574f4898338c4d8f823c942cc93..b4c02c9bf236803227cdb053f35975bb2b6f26d6 100644 (file)
--- a/os/os-openbsd.h
+++ b/os/os-openbsd.h
@@ -9,24 +9,23 @@
  #include <sys/ioctl.h>
  #include <sys/dkio.h>
  #include <sys/disklabel.h>
+#include <sys/endian.h>
  #include <sys/utsname.h>
-/* XXX hack to avoid conflicts between rbtree.h and <sys/tree.h> */
  #include <sys/sysctl.h>
+
+/* XXX hack to avoid conflicts between rbtree.h and <sys/tree.h> */
  #undef RB_BLACK
  #undef RB_RED
  #undef RB_ROOT
  
  #include "../file.h"
  
-#undef  FIO_HAVE_ODIRECT
  #define FIO_USE_GENERIC_RAND
  #define FIO_USE_GENERIC_INIT_RANDOM_STATE
  #define FIO_HAVE_FS_STAT
  #define FIO_HAVE_GETTID
  #define FIO_HAVE_SHM_ATTACH_REMOVED
  
-#undef FIO_HAVE_CPU_AFFINITY   /* doesn't exist */
-
  #define OS_MAP_ANON            MAP_ANON
  
  #ifndef PTHREAD_STACK_MIN
diff --git a/os/os-solaris.h b/os/os-solaris.h

index 8f8f53b621c7de8fab15fc738e7591d35d3172fa..45268b23862af2983922fca47115cf0d362c6243 100644 (file)
--- a/os/os-solaris.h
+++ b/os/os-solaris.h
@@ -85,9 +85,9 @@ static inline long os_random_long(os_random_state_t *rs)
  
  #define FIO_OS_DIRECTIO
  extern int directio(int, int);
-static inline int fio_set_odirect(int fd)
+static inline int fio_set_odirect(struct fio_file *f)
  {
-       if (directio(fd, DIRECTIO_ON) < 0)
+       if (directio(f->fd, DIRECTIO_ON) < 0)
                 return errno;
  
         return 0;
@@ -97,7 +97,7 @@ static inline int fio_set_odirect(int fd)
   * pset binding hooks for fio
   */
  #define fio_setaffinity(pid, cpumask)          \
-       pset_bind((cpumask), P_PID, (pid), NULL)
+       pset_bind((cpumask), P_LWPID, (pid), NULL)
  #define fio_getaffinity(pid, ptr)      ({ 0; })
  
  #define fio_cpu_clear(mask, cpu)       pset_assign(PS_NONE, (cpu), NULL)
diff --git a/os/os-windows.h b/os/os-windows.h

index 0c8c42d39a24bc5e2afc2115bee0deb9dbdc91e9..36b421ee45ad52049dabafa625acda25d79c1c00 100644 (file)
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -116,7 +116,6 @@ int nanosleep(const struct timespec *rqtp, struct timespec *rmtp);
  ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
  ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
                 off_t offset);
-extern void td_fill_rand_seeds(struct thread_data *);
  
  static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
  {
@@ -239,7 +238,7 @@ static inline int fio_cpuset_exit(os_cpu_mask_t *mask)
         return 0;
  }
  
-static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size)
+static inline int init_random_seeds(unsigned long *rand_seeds, int size)
  {
         HCRYPTPROV hCryptProv;
  
@@ -258,7 +257,6 @@ static inline int init_random_state(struct thread_data *td, unsigned long *rand_
         }
  
         CryptReleaseContext(hCryptProv, 0);
-       td_fill_rand_seeds(td);
         return 0;
  }
  
diff --git a/os/os.h b/os/os.h

index 5e3c813d8fb5b5f726d650c172aedde94f84f21d..f62b4270f8383851dede8402dc925066770623f5 100644 (file)
--- a/os/os.h
+++ b/os/os.h
@@ -60,11 +60,6 @@ typedef struct aiocb os_aiocb_t;
  #endif
  #endif
  
-#ifdef FIO_HAVE_SGIO
-#include <linux/fs.h>
-#include <scsi/sg.h>
-#endif
-
  #ifndef CONFIG_STRSEP
  #include "../oslib/strsep.h"
  #endif
@@ -209,16 +204,20 @@ static inline uint64_t fio_swap64(uint64_t val)
  
  #ifndef FIO_HAVE_BYTEORDER_FUNCS
  #ifdef CONFIG_LITTLE_ENDIAN
+#define __be64_to_cpu(x)               fio_swap64(x)
  #define __le16_to_cpu(x)               (x)
  #define __le32_to_cpu(x)               (x)
  #define __le64_to_cpu(x)               (x)
+#define __cpu_to_be64(x)               fio_swap64(x)
  #define __cpu_to_le16(x)               (x)
  #define __cpu_to_le32(x)               (x)
  #define __cpu_to_le64(x)               (x)
  #else
+#define __be64_to_cpu(x)               (x)
  #define __le16_to_cpu(x)               fio_swap16(x)
  #define __le32_to_cpu(x)               fio_swap32(x)
  #define __le64_to_cpu(x)               fio_swap64(x)
+#define __cpu_to_be64(x)               (x)
  #define __cpu_to_le16(x)               fio_swap16(x)
  #define __cpu_to_le32(x)               fio_swap32(x)
  #define __cpu_to_le64(x)               fio_swap64(x)
@@ -226,6 +225,10 @@ static inline uint64_t fio_swap64(uint64_t val)
  #endif /* FIO_HAVE_BYTEORDER_FUNCS */
  
  #ifdef FIO_INTERNAL
+#define be64_to_cpu(val) ({                    \
+       typecheck(uint64_t, val);               \
+       __be64_to_cpu(val);                     \
+})
  #define le16_to_cpu(val) ({                    \
         typecheck(uint16_t, val);               \
         __le16_to_cpu(val);                     \
@@ -240,6 +243,10 @@ static inline uint64_t fio_swap64(uint64_t val)
  })
  #endif
  
+#define cpu_to_be64(val) ({                    \
+       typecheck(uint64_t, val);               \
+       __cpu_to_be64(val);                     \
+})
  #define cpu_to_le16(val) ({                    \
         typecheck(uint16_t, val);               \
         __cpu_to_le16(val);                     \
@@ -253,19 +260,6 @@ static inline uint64_t fio_swap64(uint64_t val)
         __cpu_to_le64(val);                     \
  })
  
-#ifndef FIO_HAVE_BLKTRACE
-static inline int is_blktrace(const char *fname, int *need_swap)
-{
-       return 0;
-}
-struct thread_data;
-static inline int load_blktrace(struct thread_data *td, const char *fname,
-                               int need_swap)
-{
-       return 1;
-}
-#endif
-
  #define FIO_DEF_CL_SIZE                128
  
  static inline int os_cache_line_size(void)
@@ -316,12 +310,7 @@ static inline long os_random_long(os_random_state_t *rs)
  #endif
  
  #ifdef FIO_USE_GENERIC_INIT_RANDOM_STATE
-extern void td_fill_rand_seeds(struct thread_data *td);
-/*
- * Initialize the various random states we need (random io, block size ranges,
- * read/write mix, etc).
- */
-static inline int init_random_state(struct thread_data *td, unsigned long *rand_seeds, int size)
+static inline int init_random_seeds(unsigned long *rand_seeds, int size)
  {
         int fd;
  
@@ -336,7 +325,6 @@ static inline int init_random_state(struct thread_data *td, unsigned long *rand_
         }
  
         close(fd);
-       td_fill_rand_seeds(td);
         return 0;
  }
  #endif
@@ -348,14 +336,6 @@ static inline unsigned long long get_fs_free_size(const char *path)
  }
  #endif
  
-#ifdef __powerpc64__
-#define FIO_HAVE_CPU_ONLINE_SYSCONF
-static inline unsigned int cpus_online(void)
-{
-        return sysconf(_SC_NPROCESSORS_CONF);
-}
-#endif
-
  #ifndef FIO_HAVE_CPU_ONLINE_SYSCONF
  static inline unsigned int cpus_online(void)
  {
@@ -393,4 +373,16 @@ static inline int shm_attach_to_open_removed(void)
  }
  #endif
  
+#ifndef FIO_HAVE_NATIVE_FALLOCATE
+static inline bool fio_fallocate(struct fio_file *f, uint64_t offset, uint64_t len)
+{
+       errno = ENOSYS;
+       return false;
+}
+#endif
+
+#if defined(CONFIG_POSIX_FALLOCATE) || defined(FIO_HAVE_NATIVE_FALLOCATE)
+# define FIO_HAVE_ANY_FALLOCATE
+#endif
+
  #endif
diff --git a/os/windows/install.wxs b/os/windows/install.wxs

index ffaed8e14ce3d5c286a0be74781af8ad2f65ff05..58244c560bb6b1eafb84e2b0e6b8542d00292442 100755 (executable)
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -10,7 +10,7 @@
         <Product Id="*"
           Codepage="1252" Language="1033"
           Manufacturer="fio" Name="fio"
-         UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.19">
+         UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="3.1">
                 <Package
                   Description="Flexible IO Tester"
                   InstallerVersion="301" Keywords="Installer,MSI,Database"
diff --git a/os/windows/posix.c b/os/windows/posix.c

index eae8c86170b958254bd8a2bc14c7015f801ac8d7..00f03355985077c08665a79e5036b60755507e40 100755 (executable)
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -25,8 +25,8 @@
  #include "../os-windows.h"
  #include "../../lib/hweight.h"
  
-extern unsigned long mtime_since_now(struct timeval *);
-extern void fio_gettime(struct timeval *, void *);
+extern unsigned long mtime_since_now(struct timespec *);
+extern void fio_gettime(struct timespec *, void *);
  
  /* These aren't defined in the MinGW headers */
  HRESULT WINAPI StringCchCopyA(
@@ -584,7 +584,8 @@ char *basename(char *path)
         while (path[i] != '\\' && path[i] != '/' && i >= 0)
                 i--;
  
-       strncpy(name, path + i + 1, MAX_PATH);
+       name[MAX_PATH - 1] = '\0';
+       strncpy(name, path + i + 1, MAX_PATH - 1);
  
         return name;
  }
@@ -852,7 +853,7 @@ int poll(struct pollfd fds[], nfds_t nfds, int timeout)
  
  int nanosleep(const struct timespec *rqtp, struct timespec *rmtp)
  {
-       struct timeval tv;
+       struct timespec tv;
         DWORD ms_remaining;
         DWORD ms_total = (rqtp->tv_sec * 1000) + (rqtp->tv_nsec / 1000000.0);
  
diff --git a/oslib/libmtd.c b/oslib/libmtd.c

index 24e9db9cf062da307cd50cc11c17416152efdc73..5d18871b55e82f4c86704a9a284de2c9a519b9d8 100644 (file)
--- a/oslib/libmtd.c
+++ b/oslib/libmtd.c
@@ -1002,7 +1002,6 @@ int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb)
                 }
         }
  
-       err = 0;
         normsg("PEB %d passed torture test, do not mark it a bad", eb);
  
  out:
diff --git a/oslib/libmtd_common.h b/oslib/libmtd_common.h

index 9768066b6a4449a562802ba972c6507eaf044494..35628fea7e052fe181d3f5919cbad1a267babe39 100644 (file)
--- a/oslib/libmtd_common.h
+++ b/oslib/libmtd_common.h
@@ -119,57 +119,6 @@ extern "C" {
         fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \
  } while(0)
  
-#if defined(__UCLIBC__)
-/* uClibc versions before 0.9.34 don't have rpmatch() */
-#if __UCLIBC_MAJOR__ == 0 && \
-               (__UCLIBC_MINOR__ < 9 || \
-               (__UCLIBC_MINOR__ == 9 && __UCLIBC_SUBLEVEL__ < 34))
-#undef rpmatch
-#define rpmatch __rpmatch
-static inline int __rpmatch(const char *resp)
-{
-    return (resp[0] == 'y' || resp[0] == 'Y') ? 1 :
-       (resp[0] == 'n' || resp[0] == 'N') ? 0 : -1;
-}
-#endif
-#endif
-
-/**
- * prompt the user for confirmation
- */
-static inline bool prompt(const char *msg, bool def)
-{
-       char *line = NULL;
-       size_t len;
-       bool ret = def;
-
-       do {
-               normsg_cont("%s (%c/%c) ", msg, def ? 'Y' : 'y', def ? 'n' : 'N');
-               fflush(stdout);
-
-               while (getline(&line, &len, stdin) == -1) {
-                       printf("failed to read prompt; assuming '%s'\n",
-                               def ? "yes" : "no");
-                       break;
-               }
-
-               if (strcmp("\n", line) != 0) {
-                       switch (rpmatch(line)) {
-                       case 0: ret = false; break;
-                       case 1: ret = true; break;
-                       case -1:
-                               puts("unknown response; please try again");
-                               continue;
-                       }
-               }
-               break;
-       } while (1);
-
-       free(line);
-
-       return ret;
-}
-
  static inline int is_power_of_2(unsigned long long n)
  {
         return (n != 0 && ((n & (n - 1)) == 0));
diff --git a/oslib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c

index 5fbccd33c6d541ea168d4025d3eb5b742a3514dc..1dda93f2a0ef3fa0e537a9ccdf202a9a624ecb35 100644 (file)
--- a/oslib/linux-dev-lookup.c
+++ b/oslib/linux-dev-lookup.c
@@ -1,5 +1,6 @@
  #include <sys/types.h>
  #include <sys/stat.h>
+#include <sys/sysmacros.h>
  #include <dirent.h>
  #include <string.h>
  #include <stdio.h>
@@ -20,7 +21,7 @@ int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
                 return 0;
  
         while ((dir = readdir(D)) != NULL) {
-               char full_path[256];
+               char full_path[257];
  
                 if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
                         continue;
diff --git a/oslib/strndup.c b/oslib/strndup.c

new file mode 100644 (file)

index 0000000..7b0fcb5
--- /dev/null
+++ b/oslib/strndup.c
@@ -0,0 +1,28 @@
+#include <stdlib.h>
+#include "strndup.h"
+
+#ifndef CONFIG_HAVE_STRNDUP
+
+/*
+ * Fallback strndup(): duplicate at most n bytes of s into a freshly
+ * malloc()ed, NUL-terminated buffer. Returns NULL if malloc() fails.
+ */
+char *strndup(const char *s, size_t n)
+{
+       size_t len = 0;
+       char *str;
+
+       /* Length of s capped at n, so the allocation tracks the string */
+       while (len < n && s[len])
+               len++;
+
+       str = malloc(len + 1);
+       if (str) {
+               memcpy(str, s, len);
+               str[len] = '\0';
+       }
+
+       return str;
+}
+
+#endif
diff --git a/oslib/strndup.h b/oslib/strndup.h

new file mode 100644 (file)

index 0000000..2cb904d
--- /dev/null
+++ b/oslib/strndup.h
@@ -0,0 +1,7 @@
+#include <string.h>
+
+#ifndef CONFIG_HAVE_STRNDUP
+
+char *strndup(const char *s, size_t n);
+
+#endif
diff --git a/parse.c b/parse.c

index 4d4fdddeae573f12e10270291785401029399d53..ecce8b89dba99f5bd8af7120d10a2843fac056cf 100644 (file)
--- a/parse.c
+++ b/parse.c
@@ -1345,7 +1345,7 @@ void options_free(struct fio_option *options, void *data)
         dprint(FD_PARSE, "free options\n");
  
         for (o = &options[0]; o->name; o++) {
-               if (o->type != FIO_OPT_STR_STORE || !o->off1)
+               if (o->type != FIO_OPT_STR_STORE || !o->off1 || o->no_free)
                         continue;
  
                 ptr = td_var(data, o, o->off1);
diff --git a/parse.h b/parse.h

index fb6abd1b22a21447562acd5b8d02014be95ca0bd..dfe7f1620c6aa762f217840dba89e663277fd7c2 100644 (file)
--- a/parse.h
+++ b/parse.h
@@ -78,6 +78,7 @@ struct fio_option {
         int is_time;                    /* time based value */
         int no_warn_def;
         int pow2;                       /* must be a power-of-2 */
+       int no_free;
  };
  
  extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, struct flist_head *);
diff --git a/printing.c b/printing.c

index 4dcc9861e44ccbdac05b10704ed3ad1b796f7574..b58996bb6a5ce47b161394cbb1795fc6375beaf9 100644 (file)
--- a/printing.c
+++ b/printing.c
@@ -31,7 +31,7 @@ static void results_draw_page(GtkPrintOperation *operation,
                               gpointer data)
  {
         cairo_t *cr;
-       char str[20];
+       char str[32];
         double x, y;
  
         cr = gtk_print_context_get_cairo_context(context);
diff --git a/profiles/act.c b/profiles/act.c

index 643f8a874f76ca415aa14aa8b1697a23116f671e..4669535a9906f6b0ac34b45842de72681e30e268 100644 (file)
--- a/profiles/act.c
+++ b/profiles/act.c
@@ -47,20 +47,12 @@ struct act_run_data {
  static struct act_run_data *act_run_data;
  
  struct act_prof_data {
-       struct timeval sample_tv;
+       struct timespec sample_tv;
         struct act_slice *slices;
         unsigned int cur_slice;
         unsigned int nr_slices;
  };
  
-static char *device_names;
-static unsigned int load;
-static unsigned int prep;
-static unsigned int threads_per_queue;
-static unsigned int num_read_blocks;
-static unsigned int write_size;
-static unsigned long long test_duration;
-
  #define ACT_MAX_OPTS   128
  static const char *act_opts[ACT_MAX_OPTS] = {
         "direct=1",
@@ -97,6 +89,7 @@ static struct fio_option options[] = {
                 .help   = "Devices to use",
                 .category = FIO_OPT_C_PROFILE,
                 .group  = FIO_OPT_G_ACT,
+               .no_free = true,
         },
         {
                 .name   = "load",
@@ -185,6 +178,8 @@ static int act_add_opt(const char *str, ...)
  
  static int act_add_rw(const char *dev, int reads)
  {
+       struct act_options *ao = &act_options;
+
         if (act_add_opt("name=act-%s-%s", reads ? "read" : "write", dev))
                 return 1;
         if (act_add_opt("filename=%s", dev))
@@ -192,21 +187,21 @@ static int act_add_rw(const char *dev, int reads)
         if (act_add_opt("rw=%s", reads ? "randread" : "randwrite"))
                 return 1;
         if (reads) {
-               int rload = load * R_LOAD / threads_per_queue;
+               int rload = ao->load * R_LOAD / ao->threads_per_queue;
  
-               if (act_add_opt("numjobs=%u", threads_per_queue))
+               if (act_add_opt("numjobs=%u", ao->threads_per_queue))
                         return 1;
                 if (act_add_opt("rate_iops=%u", rload))
                         return 1;
-               if (act_add_opt("bs=%u", num_read_blocks * 512))
+               if (act_add_opt("bs=%u", ao->num_read_blocks * 512))
                         return 1;
         } else {
-               const int rsize = write_size / (num_read_blocks * 512);
-               int wload = (load * W_LOAD + rsize - 1) / rsize;
+               const int rsize = ao->write_size / (ao->num_read_blocks * 512);
+               int wload = (ao->load * W_LOAD + rsize - 1) / rsize;
  
                 if (act_add_opt("rate_iops=%u", wload))
                         return 1;
-               if (act_add_opt("bs=%u", write_size))
+               if (act_add_opt("bs=%u", ao->write_size))
                         return 1;
         }
  
@@ -248,10 +243,10 @@ static int act_add_dev_prep(const char *dev)
  
  static int act_add_dev(const char *dev)
  {
-       if (prep)
+       if (act_options.prep)
                 return act_add_dev_prep(dev);
  
-       if (act_add_opt("runtime=%llus", test_duration))
+       if (act_add_opt("runtime=%llus", act_options.test_duration))
                 return 1;
         if (act_add_opt("time_based=1"))
                 return 1;
@@ -269,7 +264,7 @@ static int act_add_dev(const char *dev)
   */
  static int act_prep_cmdline(void)
  {
-       if (!device_names) {
+       if (!act_options.device_names) {
                 log_err("act: you need to set IO target(s) with the "
                         "device-names option.\n");
                 return 1;
@@ -280,7 +275,7 @@ static int act_prep_cmdline(void)
         do {
                 char *dev;
  
-               dev = strsep(&device_names, ",");
+               dev = strsep(&act_options.device_names, ",");
                 if (!dev)
                         break;
  
@@ -300,7 +295,7 @@ static int act_io_u_lat(struct thread_data *td, uint64_t usec)
         int i, ret = 0;
         double perm;
  
-       if (prep)
+       if (act_options.prep)
                 return 0;
  
         /*
@@ -431,7 +426,7 @@ static int act_td_init(struct thread_data *td)
         get_act_ref();
  
         apd = calloc(1, sizeof(*apd));
-       nr_slices = (test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC;
+       nr_slices = (act_options.test_duration + SAMPLE_SEC - 1) / SAMPLE_SEC;
         apd->slices = calloc(nr_slices, sizeof(struct act_slice));
         apd->nr_slices = nr_slices;
         fio_gettime(&apd->sample_tv, NULL);
diff --git a/profiles/tiobench.c b/profiles/tiobench.c

index 9d9885a35788d4249652d511a0e176f532962def..f19a08577bb2450f0597cc9202e8c00c9eb495a6 100644 (file)
--- a/profiles/tiobench.c
+++ b/profiles/tiobench.c
@@ -70,6 +70,7 @@ static struct fio_option options[] = {
                 .help   = "Test directory",
                 .category = FIO_OPT_C_PROFILE,
                 .group  = FIO_OPT_G_TIOBENCH,
+               .no_free = true,
         },
         {
                 .name   = "threads",
diff --git a/server.c b/server.c

index 1e269c29da15319ac22ede6f56b81301926d50f5..e6ea4cdbfdcabc911a97f0e96d1e99e92b8c520b 100644 (file)
--- a/server.c
+++ b/server.c
@@ -252,9 +252,10 @@ static int fio_send_data(int sk, const void *p, unsigned int len)
         return fio_sendv_data(sk, &iov, 1);
  }
  
-static int fio_recv_data(int sk, void *p, unsigned int len, bool wait)
+static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait)
  {
         int flags;
+       char *p = buf;
  
         if (wait)
                 flags = MSG_WAITALL;
@@ -377,7 +378,7 @@ struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait)
                         break;
  
                 /* There's payload, get it */
-               pdu = (void *) cmdret->payload + pdu_offset;
+               pdu = (char *) cmdret->payload + pdu_offset;
                 ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait);
                 if (ret)
                         break;
@@ -438,7 +439,7 @@ static uint64_t alloc_reply(uint64_t tag, uint16_t opcode)
  
         reply = calloc(1, sizeof(*reply));
         INIT_FLIST_HEAD(&reply->list);
-       fio_gettime(&reply->tv, NULL);
+       fio_gettime(&reply->ts, NULL);
         reply->saved_tag = tag;
         reply->opcode = opcode;
  
@@ -855,7 +856,7 @@ static int handle_probe_cmd(struct fio_net_cmd *cmd)
  #ifdef CONFIG_BIG_ENDIAN
         probe.bigendian = 1;
  #endif
-       strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version));
+       strncpy((char *) probe.fio_version, fio_version_string, sizeof(probe.fio_version) - 1);
  
         probe.os        = FIO_OS;
         probe.arch      = FIO_ARCH;
@@ -969,6 +970,7 @@ static int handle_trigger_cmd(struct fio_net_cmd *cmd)
         } else
                 fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE);
  
+       fio_terminate_threads(TERMINATE_ALL);
         exec_trigger(buf);
         return 0;
  }
@@ -1279,7 +1281,7 @@ static int get_my_addr_str(int sk)
  
         ret = getsockname(sk, sockaddr_p, &len);
         if (ret) {
-               log_err("fio: getsockaddr: %s\n", strerror(errno));
+               log_err("fio: getsockname: %s\n", strerror(errno));
                 return -1;
         }
  
@@ -1474,6 +1476,7 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
                 convert_io_stat(&p.ts.slat_stat[i], &ts->slat_stat[i]);
                 convert_io_stat(&p.ts.lat_stat[i], &ts->lat_stat[i]);
                 convert_io_stat(&p.ts.bw_stat[i], &ts->bw_stat[i]);
+               convert_io_stat(&p.ts.iops_stat[i], &ts->iops_stat[i]);
         }
  
         p.ts.usr_time           = cpu_to_le64(ts->usr_time);
@@ -1481,7 +1484,8 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
         p.ts.ctx                = cpu_to_le64(ts->ctx);
         p.ts.minf               = cpu_to_le64(ts->minf);
         p.ts.majf               = cpu_to_le64(ts->majf);
-       p.ts.clat_percentiles   = cpu_to_le64(ts->clat_percentiles);
+       p.ts.clat_percentiles   = cpu_to_le32(ts->clat_percentiles);
+       p.ts.lat_percentiles    = cpu_to_le32(ts->lat_percentiles);
         p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision);
  
         for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
@@ -1497,6 +1501,8 @@ void fio_server_send_ts(struct thread_stat *ts, struct group_run_stats *rs)
                 p.ts.io_u_complete[i]   = cpu_to_le32(ts->io_u_complete[i]);
         }
  
+       for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+               p.ts.io_u_lat_n[i]      = cpu_to_le32(ts->io_u_lat_n[i]);
         for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
                 p.ts.io_u_lat_u[i]      = cpu_to_le32(ts->io_u_lat_u[i]);
         for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
@@ -2268,7 +2274,7 @@ int fio_server_parse_host(const char *host, int ipv6, struct in_addr *inp,
   * For local domain sockets:
   *     *ptr is the filename, *is_sock is 1.
   */
-int fio_server_parse_string(const char *str, char **ptr, int *is_sock,
+int fio_server_parse_string(const char *str, char **ptr, bool *is_sock,
                             int *port, struct in_addr *inp,
                             struct in6_addr *inp6, int *ipv6)
  {
@@ -2277,13 +2283,13 @@ int fio_server_parse_string(const char *str, char **ptr, int *is_sock,
         int lport = 0;
  
         *ptr = NULL;
-       *is_sock = 0;
+       *is_sock = false;
         *port = fio_net_port;
         *ipv6 = 0;
  
         if (!strncmp(str, "sock:", 5)) {
                 *ptr = strdup(str + 5);
-               *is_sock = 1;
+               *is_sock = true;
  
                 return 0;
         }
@@ -2362,7 +2368,8 @@ int fio_server_parse_string(const char *str, char **ptr, int *is_sock,
  static int fio_handle_server_arg(void)
  {
         int port = fio_net_port;
-       int is_sock, ret = 0;
+       bool is_sock;
+       int ret = 0;
  
         saddr_in.sin_addr.s_addr = htonl(INADDR_ANY);
  
diff --git a/server.h b/server.h

index 5c720d46285166107fbad514b9e6b53d7f8b4961..ba3abfeb32287e777eb93e867a6f0e72a13a34c8 100644 (file)
--- a/server.h
+++ b/server.h
@@ -43,13 +43,13 @@ struct fio_net_cmd {
  
  struct fio_net_cmd_reply {
         struct flist_head list;
-       struct timeval tv;
+       struct timespec ts;
         uint64_t saved_tag;
         uint16_t opcode;
  };
  
  enum {
-       FIO_SERVER_VER                  = 61,
+       FIO_SERVER_VER                  = 66,
  
         FIO_SERVER_MAX_FRAGMENT_PDU     = 1024,
         FIO_SERVER_MAX_CMD_MB           = 2048,
@@ -212,7 +212,7 @@ extern int fio_server_text_output(int, const char *, size_t);
  extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
  extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *);
  extern void fio_server_set_arg(const char *);
-extern int fio_server_parse_string(const char *, char **, int *, int *, struct in_addr *, struct in6_addr *, int *);
+extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *);
  extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *);
  extern const char *fio_server_op(unsigned int);
  extern void fio_server_got_signal(int);
diff --git a/smalloc.c b/smalloc.c

index e48cfe8b1c11339d8e566743472795606668d588..cab7132511b1729b152278f119d300c83f54b5d2 100644 (file)
--- a/smalloc.c
+++ b/smalloc.c
@@ -189,7 +189,7 @@ static bool add_pool(struct pool *pool, unsigned int alloc_size)
                 goto out_fail;
  
         pool->map = ptr;
-       pool->bitmap = (void *) ptr + (pool->nr_blocks * SMALLOC_BPL);
+       pool->bitmap = (unsigned int *)((char *) ptr + (pool->nr_blocks * SMALLOC_BPL));
         memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int));
  
         pool->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
diff --git a/stat.c b/stat.c

index 6e47c34cbbcaf528a4951519c8383b0e8e6b9c96..09afa5bdd8f5bb0fb3c771d01bae2fa1f491dc1e 100644 (file)
--- a/stat.c
+++ b/stat.c
@@ -37,9 +37,9 @@ void update_rusage_stat(struct thread_data *td)
         struct thread_stat *ts = &td->ts;
  
         fio_getrusage(&td->ru_end);
-       ts->usr_time += mtime_since(&td->ru_start.ru_utime,
+       ts->usr_time += mtime_since_tv(&td->ru_start.ru_utime,
                                         &td->ru_end.ru_utime);
-       ts->sys_time += mtime_since(&td->ru_start.ru_stime,
+       ts->sys_time += mtime_since_tv(&td->ru_start.ru_stime,
                                         &td->ru_end.ru_stime);
         ts->ctx += td->ru_end.ru_nvcsw + td->ru_end.ru_nivcsw
                         - (td->ru_start.ru_nvcsw + td->ru_start.ru_nivcsw);
@@ -58,7 +58,7 @@ void update_rusage_stat(struct thread_data *td)
   * group by looking at the index bits.
   *
   */
-static unsigned int plat_val_to_idx(unsigned int val)
+static unsigned int plat_val_to_idx(unsigned long long val)
  {
         unsigned int msb, error_bits, base, offset, idx;
  
@@ -66,7 +66,7 @@ static unsigned int plat_val_to_idx(unsigned int val)
         if (val == 0)
                 msb = 0;
         else
-               msb = (sizeof(val)*8) - __builtin_clz(val) - 1;
+               msb = (sizeof(val)*8) - __builtin_clzll(val) - 1;
  
         /*
          * MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
@@ -100,7 +100,8 @@ static unsigned int plat_val_to_idx(unsigned int val)
   */
  static unsigned long long plat_idx_to_val(unsigned int idx)
  {
-       unsigned int error_bits, k, base;
+       unsigned int error_bits;
+       unsigned long long k, base;
  
         assert(idx < FIO_IO_U_PLAT_NR);
  
@@ -111,7 +112,7 @@ static unsigned long long plat_idx_to_val(unsigned int idx)
  
         /* Find the group and compute the minimum value of that group */
         error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1;
-       base = 1 << (error_bits + FIO_IO_U_PLAT_BITS);
+       base = ((unsigned long long) 1) << (error_bits + FIO_IO_U_PLAT_BITS);
  
         /* Find its bucket number of the group */
         k = idx % FIO_IO_U_PLAT_VAL;
@@ -135,16 +136,16 @@ static int double_cmp(const void *a, const void *b)
  }
  
  unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
-                                  fio_fp64_t *plist, unsigned int **output,
-                                  unsigned int *maxv, unsigned int *minv)
+                                  fio_fp64_t *plist, unsigned long long **output,
+                                  unsigned long long *maxv, unsigned long long *minv)
  {
         unsigned long sum = 0;
         unsigned int len, i, j = 0;
         unsigned int oval_len = 0;
-       unsigned int *ovals = NULL;
-       int is_last;
+       unsigned long long *ovals = NULL;
+       bool is_last;
  
-       *minv = -1U;
+       *minv = -1ULL;
         *maxv = 0;
  
         len = 0;
@@ -165,7 +166,7 @@ unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
         /*
          * Calculate bucket values, note down max and min values
          */
-       is_last = 0;
+       is_last = false;
         for (i = 0; i < FIO_IO_U_PLAT_NR && !is_last; i++) {
                 sum += io_u_plat[i];
                 while (sum >= (plist[j].u.f / 100.0 * nr)) {
@@ -173,7 +174,7 @@ unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
  
                         if (j == oval_len) {
                                 oval_len += 100;
-                               ovals = realloc(ovals, oval_len * sizeof(unsigned int));
+                               ovals = realloc(ovals, oval_len * sizeof(*ovals));
                         }
  
                         ovals[j] = plat_idx_to_val(i);
@@ -182,7 +183,7 @@ unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
                         if (ovals[j] > *maxv)
                                 *maxv = ovals[j];
  
-                       is_last = (j == len - 1);
+                       is_last = (j == len - 1) != 0;
                         if (is_last)
                                 break;
  
@@ -199,11 +200,14 @@ unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
   */
  static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
                                   fio_fp64_t *plist, unsigned int precision,
-                                 struct buf_output *out)
-{
-       unsigned int len, j = 0, minv, maxv;
-       unsigned int *ovals;
-       int is_last, per_line, scale_down;
+                                 bool is_clat, struct buf_output *out)
+{
+       unsigned int divisor, len, i, j = 0;
+       unsigned long long minv, maxv;
+       unsigned long long *ovals;
+       int per_line, scale_down, time_width;
+       const char *pre = is_clat ? "clat" : " lat";
+       bool is_last;
         char fmt[32];
  
         len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv);
@@ -211,39 +215,42 @@ static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
                 goto out;
  
         /*
-        * We default to usecs, but if the value range is such that we
-        * should scale down to msecs, do that.
+        * We default to nsecs, but if the value range is such that we
+        * should scale down to usecs or msecs, do that.
          */
-       if (minv > 2000 && maxv > 99999) {
+       if (minv > 2000000 && maxv > 99999999ULL) {
+               scale_down = 2;
+               divisor = 1000000;
+               log_buf(out, "    %s percentiles (msec):\n     |", pre);
+       } else if (minv > 2000 && maxv > 99999) {
                 scale_down = 1;
-               log_buf(out, "    clat percentiles (msec):\n     |");
+               divisor = 1000;
+               log_buf(out, "    %s percentiles (usec):\n     |", pre);
         } else {
                 scale_down = 0;
-               log_buf(out, "    clat percentiles (usec):\n     |");
+               divisor = 1;
+               log_buf(out, "    %s percentiles (nsec):\n     |", pre);
         }
  
-       snprintf(fmt, sizeof(fmt), "%%1.%uf", precision);
-       per_line = (80 - 7) / (precision + 14);
  
-       for (j = 0; j < len; j++) {
-               char fbuf[16], *ptr = fbuf;
+       time_width = max(5, (int) (log10(maxv / divisor) + 1));
+       snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dllu]%%c", precision + 3,
+                       precision, time_width);
+       /* fmt will be something like " %5.2fth=[%4llu]%c" */
+       per_line = (80 - 7) / (precision + 10 + time_width);
  
+       for (j = 0; j < len; j++) {
                 /* for formatting */
                 if (j != 0 && (j % per_line) == 0)
                         log_buf(out, "     |");
  
                 /* end of the list */
-               is_last = (j == len - 1);
-
-               if (plist[j].u.f < 10.0)
-                       ptr += sprintf(fbuf, " ");
+               is_last = (j == len - 1) != 0;
  
-               snprintf(ptr, sizeof(fbuf), fmt, plist[j].u.f);
-
-               if (scale_down)
+               for (i = 0; i < scale_down; i++)
                         ovals[j] = (ovals[j] + 999) / 1000;
  
-               log_buf(out, " %sth=[%5u]%c", fbuf, ovals[j], is_last ? '\n' : ',');
+               log_buf(out, fmt, plist[j].u.f, ovals[j], is_last ? '\n' : ',');
  
                 if (is_last)
                         break;
@@ -257,8 +264,8 @@ out:
                 free(ovals);
  }
  
-bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
-             double *mean, double *dev)
+bool calc_lat(struct io_stat *is, unsigned long long *min,
+             unsigned long long *max, double *mean, double *dev)
  {
         double n = (double) is->samples;
  
@@ -355,6 +362,28 @@ static void stat_calc_lat(struct thread_stat *ts, double *dst,
         }
  }
  
+/*
+ * To keep the terse format unaltered, fold all ns latency buckets into the
+ * first us bucket (as a percentage); skip if total is 0 to avoid a 0/0 NaN.
+ */
+void stat_calc_lat_nu(struct thread_stat *ts, double *io_u_lat_u)
+{
+       unsigned long ntotal = 0, total = ddir_rw_sum(ts->total_io_u);
+       int i;
+
+       stat_calc_lat(ts, io_u_lat_u, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR);
+
+       for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+               ntotal += ts->io_u_lat_n[i];
+       if (total)
+               io_u_lat_u[0] += 100.0 * (double) ntotal / (double) total;
+}
+
+void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat)
+{
+       stat_calc_lat(ts, io_u_lat, ts->io_u_lat_n, FIO_IO_U_LAT_N_NR);
+}
+
  void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat)
  {
         stat_calc_lat(ts, io_u_lat, ts->io_u_lat_u, FIO_IO_U_LAT_U_NR);
@@ -365,14 +394,17 @@ void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat)
         stat_calc_lat(ts, io_u_lat, ts->io_u_lat_m, FIO_IO_U_LAT_M_NR);
  }
  
-static void display_lat(const char *name, unsigned long min, unsigned long max,
-                       double mean, double dev, struct buf_output *out)
+static void display_lat(const char *name, unsigned long long min,
+                       unsigned long long max, double mean, double dev,
+                       struct buf_output *out)
  {
-       const char *base = "(usec)";
+       const char *base = "(nsec)";
         char *minp, *maxp;
  
-       if (usec_to_msec(&min, &max, &mean, &dev))
+       if (nsec_to_msec(&min, &max, &mean, &dev))
                 base = "(msec)";
+       else if (nsec_to_usec(&min, &max, &mean, &dev))
+               base = "(usec)";
  
         minp = num2str(min, 6, 1, 0, N2S_NONE);
         maxp = num2str(max, 6, 1, 0, N2S_NONE);
@@ -388,8 +420,8 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                              int ddir, struct buf_output *out)
  {
         const char *str[] = { " read", "write", " trim" };
-       unsigned long min, max, runt;
-       unsigned long long bw, iops;
+       unsigned long runt;
+       unsigned long long min, max, bw, iops;
         double mean, dev;
         char *io_p, *bw_p, *bw_p_alt, *iops_p;
         int i2p;
@@ -427,11 +459,12 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
         if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
                 display_lat(" lat", min, max, mean, dev, out);
  
-       if (ts->clat_percentiles) {
+       if (ts->clat_percentiles || ts->lat_percentiles) {
                 show_clat_percentiles(ts->io_u_plat[ddir],
                                         ts->clat_stat[ddir].samples,
                                         ts->percentile_list,
-                                       ts->percentile_precision, out);
+                                       ts->percentile_precision,
+                                       ts->clat_percentiles, out);
         }
         if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
                 double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
@@ -446,6 +479,12 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                 else
                         bw_str = "kB";
  
+               if (rs->agg[ddir]) {
+                       p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024);
+                       if (p_of_agg > 100.0)
+                               p_of_agg = 100.0;
+               }
+
                 if (rs->unit_base == 1) {
                         min *= 8.0;
                         max *= 8.0;
@@ -453,12 +492,6 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                         dev *= 8.0;
                 }
  
-               if (rs->agg[ddir]) {
-                       p_of_agg = mean * 100 / (double) rs->agg[ddir];
-                       if (p_of_agg > 100.0)
-                               p_of_agg = 100.0;
-               }
-
                 if (mean > fkb_base * fkb_base) {
                         min /= fkb_base;
                         max /= fkb_base;
@@ -467,25 +500,33 @@ static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
                         bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
                 }
  
-               log_buf(out, "   bw (%5s/s): min=%5lu, max=%5lu, per=%3.2f%%, avg=%5.02f, stdev=%5.02f\n",
-                       bw_str, min, max, p_of_agg, mean, dev);
+               log_buf(out, "   bw (%5s/s): min=%5llu, max=%5llu, per=%3.2f%%, "
+                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+                       bw_str, min, max, p_of_agg, mean, dev,
+                       (&ts->bw_stat[ddir])->samples);
+       }
+       if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) {
+               log_buf(out, "   iops        : min=%5llu, max=%5llu, "
+                       "avg=%5.02f, stdev=%5.02f, samples=%" PRIu64 "\n",
+                       min, max, mean, dev, (&ts->iops_stat[ddir])->samples);
         }
  }
  
-static int show_lat(double *io_u_lat, int nr, const char **ranges,
-                   const char *msg, struct buf_output *out)
+static bool show_lat(double *io_u_lat, int nr, const char **ranges,
+                    const char *msg, struct buf_output *out)
  {
-       int new_line = 1, i, line = 0, shown = 0;
+       bool new_line = true, shown = false;
+       int i, line = 0;
  
         for (i = 0; i < nr; i++) {
                 if (io_u_lat[i] <= 0.0)
                         continue;
-               shown = 1;
+               shown = true;
                 if (new_line) {
                         if (line)
                                 log_buf(out, "\n");
-                       log_buf(out, "    lat (%s) : ", msg);
-                       new_line = 0;
+                       log_buf(out, "  lat (%s)   : ", msg);
+                       new_line = false;
                         line = 0;
                 }
                 if (line)
@@ -493,13 +534,21 @@ static int show_lat(double *io_u_lat, int nr, const char **ranges,
                 log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]);
                 line++;
                 if (line == 5)
-                       new_line = 1;
+                       new_line = true;
         }
  
         if (shown)
                 log_buf(out, "\n");
  
-       return shown;
+       return true;
+}
+
+static void show_lat_n(double *io_u_lat_n, struct buf_output *out)
+{
+       const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=",
+                                "250=", "500=", "750=", "1000=", };
+
+       show_lat(io_u_lat_n, FIO_IO_U_LAT_N_NR, ranges, "nsec", out);
  }
  
  static void show_lat_u(double *io_u_lat_u, struct buf_output *out)
@@ -521,12 +570,15 @@ static void show_lat_m(double *io_u_lat_m, struct buf_output *out)
  
  static void show_latencies(struct thread_stat *ts, struct buf_output *out)
  {
+       double io_u_lat_n[FIO_IO_U_LAT_N_NR];
         double io_u_lat_u[FIO_IO_U_LAT_U_NR];
         double io_u_lat_m[FIO_IO_U_LAT_M_NR];
  
+       stat_calc_lat_n(ts, io_u_lat_n);
         stat_calc_lat_u(ts, io_u_lat_u);
         stat_calc_lat_m(ts, io_u_lat_m);
  
+       show_lat_n(io_u_lat_n, out);
         show_lat_u(io_u_lat_u, out);
         show_lat_m(io_u_lat_m, out);
  }
@@ -816,14 +868,13 @@ static void show_thread_status_normal(struct thread_stat *ts,
  
  static void show_ddir_status_terse(struct thread_stat *ts,
                                    struct group_run_stats *rs, int ddir,
-                                  struct buf_output *out)
+                                  int ver, struct buf_output *out)
  {
-       unsigned long min, max;
-       unsigned long long bw, iops;
-       unsigned int *ovals = NULL;
+       unsigned long long min, max, minv, maxv, bw, iops;
+       unsigned long long *ovals = NULL;
         double mean, dev;
-       unsigned int len, minv, maxv;
-       int i;
+       unsigned int len;
+       int i, bw_stat;
  
         assert(ddir_rw(ddir));
  
@@ -840,16 +891,16 @@ static void show_ddir_status_terse(struct thread_stat *ts,
                                         (unsigned long long) ts->runtime[ddir]);
  
         if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev))
-               log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+               log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
         else
-               log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+               log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
  
         if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev))
-               log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+               log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
         else
-               log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+               log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
  
-       if (ts->clat_percentiles) {
+       if (ts->clat_percentiles || ts->lat_percentiles) {
                 len = calc_clat_percentiles(ts->io_u_plat[ddir],
                                         ts->clat_stat[ddir].samples,
                                         ts->percentile_list, &ovals, &maxv,
@@ -862,39 +913,53 @@ static void show_ddir_status_terse(struct thread_stat *ts,
                         log_buf(out, ";0%%=0");
                         continue;
                 }
-               log_buf(out, ";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]);
+               log_buf(out, ";%f%%=%llu", ts->percentile_list[i].u.f, ovals[i]/1000);
         }
  
         if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
-               log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
+               log_buf(out, ";%llu;%llu;%f;%f", min/1000, max/1000, mean/1000, dev/1000);
         else
-               log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+               log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
  
         if (ovals)
                 free(ovals);
  
-       if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
+       bw_stat = calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev);
+       if (bw_stat) {
                 double p_of_agg = 100.0;
  
                 if (rs->agg[ddir]) {
-                       p_of_agg = mean * 100 / (double) rs->agg[ddir];
+                       p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024);
                         if (p_of_agg > 100.0)
                                 p_of_agg = 100.0;
                 }
  
-               log_buf(out, ";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
+               log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
         } else
-               log_buf(out, ";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0);
+               log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0);
+
+       if (ver == 5) {
+               if (bw_stat)
+                       log_buf(out, ";%" PRIu64, (&ts->bw_stat[ddir])->samples);
+               else
+                       log_buf(out, ";%lu", 0UL);
+
+               if (calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev))
+                       log_buf(out, ";%llu;%llu;%f;%f;%" PRIu64, min, max,
+                               mean, dev, (&ts->iops_stat[ddir])->samples);
+               else
+                       log_buf(out, ";%llu;%llu;%f;%f;%lu", 0ULL, 0ULL, 0.0, 0.0, 0UL);
+       }
  }
  
  static void add_ddir_status_json(struct thread_stat *ts,
                 struct group_run_stats *rs, int ddir, struct json_object *parent)
  {
-       unsigned long min, max;
+       unsigned long long min, max, minv, maxv;
         unsigned long long bw;
-       unsigned int *ovals = NULL;
+       unsigned long long *ovals = NULL;
         double mean, dev, iops;
-       unsigned int len, minv, maxv;
+       unsigned int len;
         int i;
         const char *ddirname[] = {"read", "write", "trim"};
         struct json_object *dir_object, *tmp_object, *percentile_object, *clat_bins_object;
@@ -919,7 +984,8 @@ static void add_ddir_status_json(struct thread_stat *ts,
                 iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt;
         }
  
-       json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10);
+       json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir]);
+       json_object_add_value_int(dir_object, "io_kbytes", ts->io_bytes[ddir] >> 10);
         json_object_add_value_int(dir_object, "bw", bw);
         json_object_add_value_float(dir_object, "iops", iops);
         json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]);
@@ -932,7 +998,7 @@ static void add_ddir_status_json(struct thread_stat *ts,
                 mean = dev = 0.0;
         }
         tmp_object = json_create_object();
-       json_object_add_value_object(dir_object, "slat", tmp_object);
+       json_object_add_value_object(dir_object, "slat_ns", tmp_object);
         json_object_add_value_int(tmp_object, "min", min);
         json_object_add_value_int(tmp_object, "max", max);
         json_object_add_value_float(tmp_object, "mean", mean);
@@ -943,13 +1009,13 @@ static void add_ddir_status_json(struct thread_stat *ts,
                 mean = dev = 0.0;
         }
         tmp_object = json_create_object();
-       json_object_add_value_object(dir_object, "clat", tmp_object);
+       json_object_add_value_object(dir_object, "clat_ns", tmp_object);
         json_object_add_value_int(tmp_object, "min", min);
         json_object_add_value_int(tmp_object, "max", max);
         json_object_add_value_float(tmp_object, "mean", mean);
         json_object_add_value_float(tmp_object, "stddev", dev);
  
-       if (ts->clat_percentiles) {
+       if (ts->clat_percentiles || ts->lat_percentiles) {
                 len = calc_clat_percentiles(ts->io_u_plat[ddir],
                                         ts->clat_stat[ddir].samples,
                                         ts->percentile_list, &ovals, &maxv,
@@ -984,7 +1050,7 @@ static void add_ddir_status_json(struct thread_stat *ts,
                 mean = dev = 0.0;
         }
         tmp_object = json_create_object();
-       json_object_add_value_object(dir_object, "lat", tmp_object);
+       json_object_add_value_object(dir_object, "lat_ns", tmp_object);
         json_object_add_value_int(tmp_object, "min", min);
         json_object_add_value_int(tmp_object, "max", max);
         json_object_add_value_float(tmp_object, "mean", mean);
@@ -994,7 +1060,7 @@ static void add_ddir_status_json(struct thread_stat *ts,
  
         if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
                 if (rs->agg[ddir]) {
-                       p_of_agg = mean * 100 / (double) rs->agg[ddir];
+                       p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024);
                         if (p_of_agg > 100.0)
                                 p_of_agg = 100.0;
                 }
@@ -1007,74 +1073,24 @@ static void add_ddir_status_json(struct thread_stat *ts,
         json_object_add_value_float(dir_object, "bw_agg", p_of_agg);
         json_object_add_value_float(dir_object, "bw_mean", mean);
         json_object_add_value_float(dir_object, "bw_dev", dev);
-}
-
-static void show_thread_status_terse_v2(struct thread_stat *ts,
-                                       struct group_run_stats *rs,
-                                       struct buf_output *out)
-{
-       double io_u_dist[FIO_IO_U_MAP_NR];
-       double io_u_lat_u[FIO_IO_U_LAT_U_NR];
-       double io_u_lat_m[FIO_IO_U_LAT_M_NR];
-       double usr_cpu, sys_cpu;
-       int i;
+       json_object_add_value_int(dir_object, "bw_samples",
+                               (&ts->bw_stat[ddir])->samples);
  
-       /* General Info */
-       log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error);
-       /* Log Read Status */
-       show_ddir_status_terse(ts, rs, DDIR_READ, out);
-       /* Log Write Status */
-       show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
-       /* Log Trim Status */
-       show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
-
-       /* CPU Usage */
-       if (ts->total_run_time) {
-               double runt = (double) ts->total_run_time;
-
-               usr_cpu = (double) ts->usr_time * 100 / runt;
-               sys_cpu = (double) ts->sys_time * 100 / runt;
-       } else {
-               usr_cpu = 0;
-               sys_cpu = 0;
+       if (!calc_lat(&ts->iops_stat[ddir], &min, &max, &mean, &dev)) {
+               min = max = 0;
+               mean = dev = 0.0;
         }
-
-       log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
-                                               (unsigned long long) ts->ctx,
-                                               (unsigned long long) ts->majf,
-                                               (unsigned long long) ts->minf);
-
-       /* Calc % distribution of IO depths, usecond, msecond latency */
-       stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
-       stat_calc_lat_u(ts, io_u_lat_u);
-       stat_calc_lat_m(ts, io_u_lat_m);
-
-       /* Only show fixed 7 I/O depth levels*/
-       log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
-                       io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3],
-                       io_u_dist[4], io_u_dist[5], io_u_dist[6]);
-
-       /* Microsecond latency */
-       for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
-               log_buf(out, ";%3.2f%%", io_u_lat_u[i]);
-       /* Millisecond latency */
-       for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
-               log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
-       /* Additional output if continue_on_error set - default off*/
-       if (ts->continue_on_error)
-               log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
-       log_buf(out, "\n");
-
-       /* Additional output if description is set */
-       if (strlen(ts->description))
-               log_buf(out, ";%s", ts->description);
-
-       log_buf(out, "\n");
+       json_object_add_value_int(dir_object, "iops_min", min);
+       json_object_add_value_int(dir_object, "iops_max", max);
+       json_object_add_value_float(dir_object, "iops_mean", mean);
+       json_object_add_value_float(dir_object, "iops_stddev", dev);
+       json_object_add_value_int(dir_object, "iops_samples",
+                               (&ts->iops_stat[ddir])->samples);
  }
  
-static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
-                                          struct group_run_stats *rs, int ver,
-                                          struct buf_output *out)
+static void show_thread_status_terse_all(struct thread_stat *ts,
+                                        struct group_run_stats *rs, int ver,
+                                        struct buf_output *out)
  {
         double io_u_dist[FIO_IO_U_MAP_NR];
         double io_u_lat_u[FIO_IO_U_LAT_U_NR];
@@ -1083,15 +1099,19 @@ static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
         int i;
  
         /* General Info */
-       log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
-                                       ts->name, ts->groupid, ts->error);
+       if (ver == 2)
+               log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error);
+       else
+               log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
+                       ts->name, ts->groupid, ts->error);
+
         /* Log Read Status */
-       show_ddir_status_terse(ts, rs, DDIR_READ, out);
+       show_ddir_status_terse(ts, rs, DDIR_READ, ver, out);
         /* Log Write Status */
-       show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
+       show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
         /* Log Trim Status */
-       if (ver == 4)
-               show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
+       if (ver == 2 || ver == 4 || ver == 5)
+               show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
  
         /* CPU Usage */
         if (ts->total_run_time) {
@@ -1111,7 +1131,7 @@ static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
  
         /* Calc % distribution of IO depths, usecond, msecond latency */
         stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
-       stat_calc_lat_u(ts, io_u_lat_u);
+       stat_calc_lat_nu(ts, io_u_lat_u);
         stat_calc_lat_m(ts, io_u_lat_m);
  
         /* Only show fixed 7 I/O depth levels*/
@@ -1127,11 +1147,14 @@ static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
                 log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
  
         /* disk util stats, if any */
-       show_disk_util(1, NULL, out);
+       if (ver >= 3)
+               show_disk_util(1, NULL, out);
  
         /* Additional output if continue_on_error set - default off*/
         if (ts->continue_on_error)
                 log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
+       if (ver == 2)
+               log_buf(out, "\n");
  
         /* Additional output if description is set */
         if (strlen(ts->description))
@@ -1172,6 +1195,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
         struct json_object *root, *tmp;
         struct jobs_eta *je;
         double io_u_dist[FIO_IO_U_MAP_NR];
+       double io_u_lat_n[FIO_IO_U_LAT_N_NR];
         double io_u_lat_u[FIO_IO_U_LAT_U_NR];
         double io_u_lat_m[FIO_IO_U_LAT_M_NR];
         double usr_cpu, sys_cpu;
@@ -1216,6 +1240,7 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
  
         /* Calc % distribution of IO depths, usecond, msecond latency */
         stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
+       stat_calc_lat_n(ts, io_u_lat_n);
         stat_calc_lat_u(ts, io_u_lat_u);
         stat_calc_lat_m(ts, io_u_lat_m);
  
@@ -1231,9 +1256,17 @@ static struct json_object *show_thread_status_json(struct thread_stat *ts,
                 json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]);
         }
  
+       /* Nanosecond latency */
         tmp = json_create_object();
-       json_object_add_value_object(root, "latency_us", tmp);
+       json_object_add_value_object(root, "latency_ns", tmp);
+       for (i = 0; i < FIO_IO_U_LAT_N_NR; i++) {
+               const char *ranges[] = { "2", "4", "10", "20", "50", "100",
+                                "250", "500", "750", "1000", };
+               json_object_add_value_float(tmp, ranges[i], io_u_lat_n[i]);
+       }
         /* Microsecond latency */
+       tmp = json_create_object();
+       json_object_add_value_object(root, "latency_us", tmp);
         for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
                 const char *ranges[] = { "2", "4", "10", "20", "50", "100",
                                  "250", "500", "750", "1000", };
@@ -1362,10 +1395,8 @@ static void show_thread_status_terse(struct thread_stat *ts,
                                      struct group_run_stats *rs,
                                      struct buf_output *out)
  {
-       if (terse_version == 2)
-               show_thread_status_terse_v2(ts, rs, out);
-       else if (terse_version == 3 || terse_version == 4)
-               show_thread_status_terse_v3_v4(ts, rs, terse_version, out);
+       if (terse_version >= 2 && terse_version <= 5)
+               show_thread_status_terse_all(ts, rs, terse_version, out);
         else
                 log_err("fio: bad terse version!? %d\n", terse_version);
  }
@@ -1457,6 +1488,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
                         sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first);
                         sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first);
                         sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first);
+                       sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first);
  
                         dst->io_bytes[l] += src->io_bytes[l];
  
@@ -1467,6 +1499,7 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
                         sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first);
                         sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first);
                         sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first);
+                       sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first);
  
                         dst->io_bytes[0] += src->io_bytes[l];
  
@@ -1493,6 +1526,8 @@ void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
                 dst->io_u_submit[k] += src->io_u_submit[k];
         for (k = 0; k < FIO_IO_U_MAP_NR; k++)
                 dst->io_u_complete[k] += src->io_u_complete[k];
+       for (k = 0; k < FIO_IO_U_LAT_N_NR; k++)
+               dst->io_u_lat_n[k] += src->io_u_lat_n[k];
         for (k = 0; k < FIO_IO_U_LAT_U_NR; k++)
                 dst->io_u_lat_u[k] += src->io_u_lat_u[k];
         for (k = 0; k < FIO_IO_U_LAT_M_NR; k++)
@@ -1546,6 +1581,7 @@ void init_thread_stat(struct thread_stat *ts)
                 ts->clat_stat[j].min_val = -1UL;
                 ts->slat_stat[j].min_val = -1UL;
                 ts->bw_stat[j].min_val = -1UL;
+               ts->iops_stat[j].min_val = -1UL;
         }
         ts->groupid = -1;
  }
@@ -1556,8 +1592,8 @@ void __show_run_stats(void)
         struct thread_data *td;
         struct thread_stat *threadstats, *ts;
         int i, j, k, nr_ts, last_ts, idx;
-       int kb_base_warned = 0;
-       int unit_base_warned = 0;
+       bool kb_base_warned = false;
+       bool unit_base_warned = false;
         struct json_object *root = NULL;
         struct json_array *array = NULL;
         struct buf_output output[FIO_OUTPUT_NR];
@@ -1613,6 +1649,7 @@ void __show_run_stats(void)
                 ts = &threadstats[j];
  
                 ts->clat_percentiles = td->o.clat_percentiles;
+               ts->lat_percentiles = td->o.lat_percentiles;
                 ts->percentile_precision = td->o.percentile_precision;
                 memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list));
                 opt_lists[j] = &td->opt_list;
@@ -1649,11 +1686,11 @@ void __show_run_stats(void)
                 } else if (ts->kb_base != td->o.kb_base && !kb_base_warned) {
                         log_info("fio: kb_base differs for jobs in group, using"
                                  " %u as the base\n", ts->kb_base);
-                       kb_base_warned = 1;
+                       kb_base_warned = true;
                 } else if (ts->unit_base != td->o.unit_base && !unit_base_warned) {
                         log_info("fio: unit_base differs for jobs in group, using"
                                  " %u as the base\n", ts->unit_base);
-                       unit_base_warned = 1;
+                       unit_base_warned = true;
                 }
  
                 ts->continue_on_error = td->o.continue_on_error;
@@ -1825,8 +1862,10 @@ void __show_run_stats(void)
         }
  
         for (i = 0; i < FIO_OUTPUT_NR; i++) {
-               buf_output_flush(&output[i]);
-               buf_output_free(&output[i]);
+               struct buf_output *out = &output[i];
+
+               log_info_buf(out->buf, out->buflen);
+               buf_output_free(out);
         }
  
         log_info_flush();
@@ -1846,22 +1885,22 @@ void __show_running_run_stats(void)
  {
         struct thread_data *td;
         unsigned long long *rt;
-       struct timeval tv;
+       struct timespec ts;
         int i;
  
         fio_mutex_down(stat_mutex);
  
         rt = malloc(thread_number * sizeof(unsigned long long));
-       fio_gettime(&tv, NULL);
+       fio_gettime(&ts, NULL);
  
         for_each_td(td, i) {
                 td->update_rusage = 1;
                 td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
                 td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
                 td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
-               td->ts.total_run_time = mtime_since(&td->epoch, &tv);
+               td->ts.total_run_time = mtime_since(&td->epoch, &ts);
  
-               rt[i] = mtime_since(&td->start, &tv);
+               rt[i] = mtime_since(&td->start, &ts);
                 if (td_read(td) && td->ts.io_bytes[DDIR_READ])
                         td->ts.runtime[DDIR_READ] += rt[i];
                 if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
@@ -1895,9 +1934,9 @@ void __show_running_run_stats(void)
         fio_mutex_up(stat_mutex);
  }
  
-static int status_interval_init;
-static struct timeval status_time;
-static int status_file_disabled;
+static bool status_interval_init;
+static struct timespec status_time;
+static bool status_file_disabled;
  
  #define FIO_STATUS_FILE                "fio-dump-status"
  
@@ -1928,7 +1967,7 @@ static int check_status_file(void)
                 log_err("fio: failed to unlink %s: %s\n", fio_status_file_path,
                                                         strerror(errno));
                 log_err("fio: disabling status file updates\n");
-               status_file_disabled = 1;
+               status_file_disabled = true;
         }
  
         return 1;
@@ -1939,7 +1978,7 @@ void check_for_running_stats(void)
         if (status_interval) {
                 if (!status_interval_init) {
                         fio_gettime(&status_time, NULL);
-                       status_interval_init = 1;
+                       status_interval_init = true;
                 } else if (mtime_since_now(&status_time) >= status_interval) {
                         show_running_run_stats();
                         fio_gettime(&status_time, NULL);
@@ -1952,7 +1991,7 @@ void check_for_running_stats(void)
         }
  }
  
-static inline void add_stat_sample(struct io_stat *is, unsigned long data)
+static inline void add_stat_sample(struct io_stat *is, unsigned long long data)
  {
         double val = data;
         double delta;
@@ -2125,7 +2164,7 @@ static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
         if (iolog->disabled)
                 return;
         if (flist_empty(&iolog->io_logs))
-               iolog->avg_last = t;
+               iolog->avg_last[ddir] = t;
  
         cur_log = get_cur_log(iolog);
         if (cur_log) {
@@ -2185,6 +2224,8 @@ void reset_io_stats(struct thread_data *td)
                 ts->io_u_complete[i] = 0;
         }
  
+       for (i = 0; i < FIO_IO_U_LAT_N_NR; i++)
+               ts->io_u_lat_n[i] = 0;
         for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
                 ts->io_u_lat_u[i] = 0;
         for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
@@ -2254,9 +2295,9 @@ static long add_log_sample(struct thread_data *td, struct io_log *iolog,
          * If period hasn't passed, adding the above sample is all we
          * need to do.
          */
-       this_window = elapsed - iolog->avg_last;
-       if (elapsed < iolog->avg_last)
-               return iolog->avg_last - elapsed;
+       this_window = elapsed - iolog->avg_last[ddir];
+       if (elapsed < iolog->avg_last[ddir])
+               return iolog->avg_last[ddir] - elapsed;
         else if (this_window < iolog->avg_msec) {
                 int diff = iolog->avg_msec - this_window;
  
@@ -2264,9 +2305,9 @@ static long add_log_sample(struct thread_data *td, struct io_log *iolog,
                         return diff;
         }
  
-       _add_stat_to_log(iolog, elapsed, td->o.log_max != 0);
+       __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0);
  
-       iolog->avg_last = elapsed - (this_window - iolog->avg_msec);
+       iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec);
         return iolog->avg_msec;
  }
  
@@ -2300,16 +2341,16 @@ void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned int
  }
  
  static void add_clat_percentile_sample(struct thread_stat *ts,
-                               unsigned long usec, enum fio_ddir ddir)
+                               unsigned long long nsec, enum fio_ddir ddir)
  {
-       unsigned int idx = plat_val_to_idx(usec);
+       unsigned int idx = plat_val_to_idx(nsec);
         assert(idx < FIO_IO_U_PLAT_NR);
  
         ts->io_u_plat[ddir][idx]++;
  }
  
  void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
-                    unsigned long usec, unsigned int bs, uint64_t offset)
+                    unsigned long long nsec, unsigned int bs, uint64_t offset)
  {
         unsigned long elapsed, this_window;
         struct thread_stat *ts = &td->ts;
@@ -2317,14 +2358,14 @@ void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
  
         td_io_u_lock(td);
  
-       add_stat_sample(&ts->clat_stat[ddir], usec);
+       add_stat_sample(&ts->clat_stat[ddir], nsec);
  
         if (td->clat_log)
-               add_log_sample(td, td->clat_log, sample_val(usec), ddir, bs,
+               add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
                                offset);
  
         if (ts->clat_percentiles)
-               add_clat_percentile_sample(ts, usec, ddir);
+               add_clat_percentile_sample(ts, nsec, ddir);
  
         if (iolog && iolog->hist_msec) {
                 struct io_hist *hw = &iolog->hist_window[ddir];
@@ -2386,7 +2427,7 @@ void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
  }
  
  void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
-                   unsigned long usec, unsigned int bs, uint64_t offset)
+                   unsigned long long nsec, unsigned int bs, uint64_t offset)
  {
         struct thread_stat *ts = &td->ts;
  
@@ -2395,23 +2436,26 @@ void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
  
         td_io_u_lock(td);
  
-       add_stat_sample(&ts->lat_stat[ddir], usec);
+       add_stat_sample(&ts->lat_stat[ddir], nsec);
  
         if (td->lat_log)
-               add_log_sample(td, td->lat_log, sample_val(usec), ddir, bs,
+               add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs,
                                offset);
  
+       if (ts->lat_percentiles)
+               add_clat_percentile_sample(ts, nsec, ddir);
+
         td_io_u_unlock(td);
  }
  
  void add_bw_sample(struct thread_data *td, struct io_u *io_u,
-                  unsigned int bytes, unsigned long spent)
+                  unsigned int bytes, unsigned long long spent)
  {
         struct thread_stat *ts = &td->ts;
         unsigned long rate;
  
         if (spent)
-               rate = bytes * 1000 / spent;
+               rate = (unsigned long) (bytes * 1000000ULL / spent);
         else
                 rate = 0;
  
@@ -2427,8 +2471,8 @@ void add_bw_sample(struct thread_data *td, struct io_u *io_u,
         td_io_u_unlock(td);
  }
  
-static int __add_samples(struct thread_data *td, struct timeval *parent_tv,
-                        struct timeval *t, unsigned int avg_time,
+static int __add_samples(struct thread_data *td, struct timespec *parent_tv,
+                        struct timespec *t, unsigned int avg_time,
                          uint64_t *this_io_bytes, uint64_t *stat_io_bytes,
                          struct io_stat *stat, struct io_log *log,
                          bool is_kb)
@@ -2465,7 +2509,7 @@ static int __add_samples(struct thread_data *td, struct timeval *parent_tv,
  
                 add_stat_sample(&stat[ddir], rate);
  
-               if (td->bw_log) {
+               if (log) {
                         unsigned int bs = 0;
  
                         if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
@@ -2478,7 +2522,7 @@ static int __add_samples(struct thread_data *td, struct timeval *parent_tv,
                 stat_io_bytes[ddir] = this_io_bytes[ddir];
         }
  
-       timeval_add_msec(parent_tv, avg_time);
+       timespec_add_msec(parent_tv, avg_time);
  
         td_io_u_unlock(td);
  
@@ -2490,7 +2534,7 @@ static int __add_samples(struct thread_data *td, struct timeval *parent_tv,
         return min(next, next_log);
  }
  
-static int add_bw_samples(struct thread_data *td, struct timeval *t)
+static int add_bw_samples(struct thread_data *td, struct timespec *t)
  {
         return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time,
                                 td->this_io_bytes, td->stat_io_bytes,
@@ -2514,7 +2558,7 @@ void add_iops_sample(struct thread_data *td, struct io_u *io_u,
         td_io_u_unlock(td);
  }
  
-static int add_iops_samples(struct thread_data *td, struct timeval *t)
+static int add_iops_samples(struct thread_data *td, struct timespec *t)
  {
         return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time,
                                 td->this_io_blocks, td->stat_io_blocks,
@@ -2528,7 +2572,7 @@ int calc_log_samples(void)
  {
         struct thread_data *td;
         unsigned int next = ~0U, tmp;
-       struct timeval now;
+       struct timespec now;
         int i;
  
         fio_gettime(&now, NULL);
@@ -2541,12 +2585,14 @@ int calc_log_samples(void)
                         next = min(td->o.iops_avg_time, td->o.bw_avg_time);
                         continue;
                 }
-               if (td->bw_log && !per_unit_log(td->bw_log)) {
+               if (!td->bw_log ||
+                       (td->bw_log && !per_unit_log(td->bw_log))) {
                         tmp = add_bw_samples(td, &now);
                         if (tmp < next)
                                 next = tmp;
                 }
-               if (td->iops_log && !per_unit_log(td->iops_log)) {
+               if (!td->iops_log ||
+                       (td->iops_log && !per_unit_log(td->iops_log))) {
                         tmp = add_iops_samples(td, &now);
                         if (tmp < next)
                                 next = tmp;
diff --git a/stat.h b/stat.h

index aa4ad806aa9159db7b15276fcf15853dae9c5bf8..848331bb5e47fef2438cf19a910fed5c02b712bc 100644 (file)
--- a/stat.h
+++ b/stat.h
@@ -19,6 +19,7 @@ struct group_run_stats {
   * How many depth levels to log
   */
  #define FIO_IO_U_MAP_NR        7
+#define FIO_IO_U_LAT_N_NR 10
  #define FIO_IO_U_LAT_U_NR 10
  #define FIO_IO_U_LAT_M_NR 12
  
@@ -108,7 +109,7 @@ struct group_run_stats {
  
  #define FIO_IO_U_PLAT_BITS 6
  #define FIO_IO_U_PLAT_VAL (1 << FIO_IO_U_PLAT_BITS)
-#define FIO_IO_U_PLAT_GROUP_NR 19
+#define FIO_IO_U_PLAT_GROUP_NR 29
  #define FIO_IO_U_PLAT_NR (FIO_IO_U_PLAT_GROUP_NR * FIO_IO_U_PLAT_VAL)
  #define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified
                                         list of percentiles */
@@ -171,13 +172,15 @@ struct thread_stat {
         /*
          * IO depth and latency stats
          */
-       uint64_t clat_percentiles;
+       uint32_t clat_percentiles;
+       uint32_t lat_percentiles;
         uint64_t percentile_precision;
         fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
  
         uint32_t io_u_map[FIO_IO_U_MAP_NR];
         uint32_t io_u_submit[FIO_IO_U_MAP_NR];
         uint32_t io_u_complete[FIO_IO_U_MAP_NR];
+       uint32_t io_u_lat_n[FIO_IO_U_LAT_N_NR];
         uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR];
         uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR];
         uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
@@ -242,17 +245,17 @@ struct jobs_eta {
         uint32_t nr_pending;
         uint32_t nr_setting_up;
  
-       uint32_t files_open;
-
         uint64_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
-       uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
         uint64_t rate[DDIR_RWDIR_CNT];
+       uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
         uint32_t iops[DDIR_RWDIR_CNT];
         uint64_t elapsed_sec;
         uint64_t eta_sec;
         uint32_t is_pow2;
         uint32_t unit_base;
  
+       uint32_t files_open;
+
         /*
          * Network 'copy' of run_str[]
          */
@@ -286,8 +289,9 @@ extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats
  extern void init_thread_stat(struct thread_stat *ts);
  extern void init_group_run_stat(struct group_run_stats *gs);
  extern void eta_to_str(char *str, unsigned long eta_sec);
-extern bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev);
-extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int **output, unsigned int *maxv, unsigned int *minv);
+extern bool calc_lat(struct io_stat *is, unsigned long long *min, unsigned long long *max, double *mean, double *dev);
+extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned long long **output, unsigned long long *maxv, unsigned long long *minv);
+extern void stat_calc_lat_n(struct thread_stat *ts, double *io_u_lat);
  extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat);
  extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
  extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist);
@@ -295,9 +299,9 @@ extern void reset_io_stats(struct thread_data *);
  extern void update_rusage_stat(struct thread_data *);
  extern void clear_rusage_stat(struct thread_data *);
  
-extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
                                 unsigned int, uint64_t);
-extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
                                 unsigned int, uint64_t);
  extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
                                 unsigned int, uint64_t);
@@ -305,16 +309,17 @@ extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned int);
  extern void add_iops_sample(struct thread_data *, struct io_u *,
                                 unsigned int);
  extern void add_bw_sample(struct thread_data *, struct io_u *,
-                               unsigned int, unsigned long);
+                               unsigned int, unsigned long long);
  extern int calc_log_samples(void);
  
  extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
  extern int write_bw_log;
  
-static inline bool usec_to_msec(unsigned long *min, unsigned long *max,
-                               double *mean, double *dev)
+static inline bool nsec_to_usec(unsigned long long *min,
+                               unsigned long long *max, double *mean,
+                               double *dev)
  {
-       if (*min > 1000 && *max > 1000 && *mean > 1000.0 && *dev > 1000.0) {
+       if (*min > 2000 && *max > 99999 && *dev > 1000.0) {
                 *min /= 1000;
                 *max /= 1000;
                 *mean /= 1000.0;
@@ -324,6 +329,22 @@ static inline bool usec_to_msec(unsigned long *min, unsigned long *max,
  
         return false;
  }
+
+static inline bool nsec_to_msec(unsigned long long *min,
+                               unsigned long long *max, double *mean,
+                               double *dev)
+{
+       if (*min > 2000000 && *max > 99999999ULL && *dev > 1000000.0) {
+               *min /= 1000000;
+               *max /= 1000000;
+               *mean /= 1000000.0;
+               *dev /= 1000000.0;
+               return true;
+       }
+
+       return false;
+}
+
  /*
   * Worst level condensing would be 1:5, so allow enough room for that
   */
diff --git a/steadystate.c b/steadystate.c

index 98f027cee4013ac4f75db7a4ade31c8d5f8056fa..45d4f5d2eeb928306c1b312e20f960b6f9b53506 100644 (file)
--- a/steadystate.c
+++ b/steadystate.c
@@ -196,7 +196,7 @@ void steadystate_check(void)
         int i, j, ddir, prev_groupid, group_ramp_time_over = 0;
         unsigned long rate_time;
         struct thread_data *td, *td2;
-       struct timeval now;
+       struct timespec now;
         uint64_t group_bw = 0, group_iops = 0;
         uint64_t td_iops, td_bytes;
         bool ret;
diff --git a/steadystate.h b/steadystate.h

index 20ccd3035f11d67ca751d78be6135be85517d6e0..bbc3945ceb93e0bf89d89e7d2accdd6c34dd31dd 100644 (file)
--- a/steadystate.h
+++ b/steadystate.h
@@ -35,7 +35,7 @@ struct steadystate_data {
         uint64_t sum_xy;
         uint64_t oldest_y;
  
-       struct timeval prev_time;
+       struct timespec prev_time;
         uint64_t prev_iops;
         uint64_t prev_bytes;
  };
diff --git a/t/arch.c b/t/arch.c

index befb7c7f00cb56effdfa6e8fd4405e9bb653210f..bd28a848b20152e9a53777e3f7bd58d92c06f62f 100644 (file)
--- a/t/arch.c
+++ b/t/arch.c
@@ -1,5 +1,5 @@
  #include "../arch/arch.h"
  
  unsigned long arch_flags = 0;
-int tsc_reliable;
+bool tsc_reliable;
  int arch_random;
diff --git a/t/axmap.c b/t/axmap.c

index e32ff98d09f1323a12b9cb1c708cb49370ca0a38..a803ce47bf3386308427c8ebf202ed8df8461625 100644 (file)
--- a/t/axmap.c
+++ b/t/axmap.c
@@ -8,16 +8,6 @@
  #include "../lib/lfsr.h"
  #include "../lib/axmap.h"
  
-void *smalloc(size_t size)
-{
-       return malloc(size);
-}
-
-void sfree(void *ptr)
-{
-       free(ptr);
-}
-
  static int test_regular(size_t size, int seed)
  {
         struct fio_lfsr lfsr;
diff --git a/t/debug.c b/t/debug.c

index bf6f460578fd858984205d3463452f0b9c91e28b..8965cfbc4547241b5a6af4b66c03339d3f943696 100644 (file)
--- a/t/debug.c
+++ b/t/debug.c
@@ -1,7 +1,7 @@
  #include <stdio.h>
  
  FILE *f_err;
-struct timeval *fio_tv = NULL;
+struct timespec *fio_ts = NULL;
  unsigned long fio_debug = 0;
  
  void __dprint(int type, const char *str, ...)
diff --git a/t/dedupe.c b/t/dedupe.c

index 1f172a26986231d7f4402d2a76695557523f23bb..c3b837f7b698cdedd027f20be12e085ffe21faa1 100644 (file)
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -334,7 +334,7 @@ static void *thread_fn(void *data)
  static void show_progress(struct worker_thread *threads, unsigned long total)
  {
         unsigned long last_nitems = 0;
-       struct timeval last_tv;
+       struct timespec last_tv;
  
         fio_gettime(&last_tv, NULL);
  
diff --git a/t/gen-rand.c b/t/gen-rand.c

index 6c31f92598fc1686e05f905be24237b576b61b18..4e9d39c6431eb097271b516d795e5ff0ef492dfc 100644 (file)
--- a/t/gen-rand.c
+++ b/t/gen-rand.c
@@ -63,6 +63,6 @@ int main(int argc, char *argv[])
         }
  
         printf("Passes=%lu, Fail=%lu\n", pass, fail);
-
+       free(buckets);
         return 0;
  }
diff --git a/t/lfsr-test.c b/t/lfsr-test.c

index 7016f2688f6bfc0dc5f50811143722a1820d06ac..4009b62e38c3acdcfe827f54708cba230c640745 100644 (file)
--- a/t/lfsr-test.c
+++ b/t/lfsr-test.c
@@ -27,7 +27,7 @@ void usage()
  int main(int argc, char *argv[])
  {
         int r;
-       struct timeval start, end;
+       struct timespec start, end;
         struct fio_lfsr *fl;
         int verify = 0;
         unsigned int spin = 0;
diff --git a/t/time-test.c b/t/time-test.c

new file mode 100644 (file)

index 0000000..a74d920
--- /dev/null
+++ b/t/time-test.c
@@ -0,0 +1,544 @@
+/*
+ * Carry out arithmetic to explore conversion of CPU clock ticks to nsec
+ *
+ * When we use the CPU clock for timing, we do the following:
+ *
+ * 1) Calibrate the CPU clock to relate the frequency of CPU clock ticks
+ *    to actual time.
+ *
+ *    Using gettimeofday() or clock_gettime(), count how many CPU clock
+ *    ticks occur per usec
+ *
+ * 2) Calculate conversion factors so that we can ultimately convert
+ *    from clock ticks to nsec with
+ *      nsec = (ticks * clock_mult) >> clock_shift
+ *
+ *    This is equivalent to
+ *     nsec = ticks * (MULTIPLIER / cycles_per_nsec) / MULTIPLIER
+ *    where
+ *     clock_mult = MULTIPLIER / cycles_per_nsec
+ *      MULTIPLIER = 2^clock_shift
+ *
+ *    It would be simpler to just calculate nsec = ticks / cycles_per_nsec,
+ *    but all of this is necessary because of rounding when calculating
+ *    cycles_per_nsec. With a 3.0GHz CPU, cycles_per_nsec would simply
+ *    be 3. But with a 3.33GHz CPU or a 4.5GHz CPU, the fractional
+ *    portion is lost with integer arithmetic.
+ *
+ *    This multiply and shift calculation also has a performance benefit
+ *    as multiplication and bit shift operations are faster than integer
+ *    division.
+ *
+ * 3) Dynamically determine clock_shift and clock_mult at run time based
+ *    on MAX_CLOCK_SEC and cycles_per_usec. MAX_CLOCK_SEC is the maximum
+ *    duration for which the conversion will be valid.
+ *
+ *    The primary constraint is that (ticks * clock_mult) must not overflow
+ *    when ticks is at its maximum value.
+ *
+ *    So we have
+ *     max_ticks = MAX_CLOCK_SEC * 1000000000 * cycles_per_nsec
+ *     max_ticks * clock_mult <= ULLONG_MAX
+ *     max_ticks * MULTIPLIER / cycles_per_nsec <= ULLONG_MAX
+ *      MULTIPLIER <= ULLONG_MAX * cycles_per_nsec / max_ticks
+ *
+ *    Then choose the largest clock_shift that satisfies
+ *     2^clock_shift <= ULLONG_MAX * cycles_per_nsec / max_ticks
+ *
+ *    Finally calculate the appropriate clock_mult associated with clock_shift
+ *     clock_mult = 2^clock_shift / cycles_per_nsec
+ *
+ * 4) In the code below we have cycles_per_usec and use
+ *     cycles_per_nsec = cycles_per_usec / 1000
+ *
+ *
+ * The code below implements 4 clock tick to nsec conversion strategies
+ *
+ *   i) 64-bit arithmetic for the (ticks * clock_mult) product with the
+ *     conversion valid for at most MAX_CLOCK_SEC
+ *
+ *  ii) NOT IMPLEMENTED Use 64-bit integers to emulate 128-bit multiplication
+ *     for the (ticks * clock_mult) product
+ *
+ * iii) 64-bit arithmetic with clock ticks to nsec conversion occurring in
+ *     two stages. The first stage counts the number of discrete, large chunks
+ *     of time that have elapsed. To this is added the time represented by
+ *     the remaining clock ticks. The advantage of this strategy is better
+ *     accuracy because the (ticks * clock_mult) product is used only for
+ *     the final fractional chunk.
+ *
+ *  iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in
+ *     two stages. This is carried out using locks to update the number of
+ *     large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed.
+ *
+ *   v) 128-bit arithmetic used for the clock ticks to nsec conversion.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "lib/seqlock.h"
+
+/* Set to 1 to make dprintf() emit its trace output via printf() */
+#define DEBUG 0
+/*
+ * Durations in seconds. Parenthesized so the product keeps its value when
+ * the macro is used inside a larger expression (e.g. as a divisor).
+ */
+#define MAX_CLOCK_SEC (365*24*60*60ULL)
+#define MAX_CLOCK_SEC_2STAGE (60*60ULL)
+/*
+ * Debug trace helper. Wrapped in do/while(0) so that it expands to a single
+ * statement and cannot capture a following "else".
+ */
+#define dprintf(...) do { if (DEBUG) { printf(__VA_ARGS__); } } while (0)
+
+/*
+ * Conversion mode selectors. The double-underscore values are single-bit
+ * building blocks; the CLOCK* values below combine one width bit with one
+ * technique bit and are the values passed as "mode" to the functions in
+ * this file.
+ */
+enum {
+       __CLOCK64_BIT           = 1 << 0,
+       __CLOCK128_BIT          = 1 << 1,
+       __CLOCK_MULT_SHIFT      = 1 << 2,
+       __CLOCK_EMULATE_128     = 1 << 3,
+       __CLOCK_2STAGE          = 1 << 4,
+       __CLOCK_LOCK            = 1 << 5,
+
+       CLOCK64_MULT_SHIFT      = __CLOCK64_BIT | __CLOCK_MULT_SHIFT,
+       CLOCK64_EMULATE_128     = __CLOCK64_BIT | __CLOCK_EMULATE_128,
+       CLOCK64_2STAGE          = __CLOCK64_BIT | __CLOCK_2STAGE,
+       CLOCK64_LOCK            = __CLOCK64_BIT | __CLOCK_LOCK,
+       CLOCK128_MULT_SHIFT     = __CLOCK128_BIT | __CLOCK_MULT_SHIFT,
+};
+
+/*
+ * CLOCK64_LOCK state: cycles_start and elapsed_nsec are written by
+ * update_clock() inside the clock_seqlock write section and read back in
+ * _get_nsec() inside the matching read section.
+ */
+static struct seqlock clock_seqlock;
+static unsigned long long cycles_start;
+static unsigned long long elapsed_nsec;
+
+/* Two-stage parameters, computed by calc_mult_shift() for CLOCK64_2STAGE */
+static unsigned int max_cycles_shift;
+static unsigned long long max_cycles_mask;
+static unsigned long long nsecs_for_max_cycles;
+
+/* Shift used by every mode in _get_nsec(), and the 64-bit multiplier */
+static unsigned int clock_shift;
+static unsigned long long clock_mult;
+
+/* Result buffer: allocated in main(), filled by discontinuity() */
+static unsigned long long *nsecs;
+/* Multipliers for the CLOCK64_EMULATE_128 and CLOCK128_MULT_SHIFT modes */
+static unsigned long long clock_mult64_128[2];
+static __uint128_t clock_mult128;
+
+/*
+ * Functions for carrying out 128-bit
+ * arithmetic using 64-bit integers
+ *
+ * 128-bit integers are stored as
+ * arrays of two 64-bit integers
+ *
+ * Ordering is little endian
+ *
+ * a[0] has the less significant bits
+ * a[1] has the more significant bits
+ *
+ * NOT FULLY IMPLEMENTED
+ */
+/*
+ * Placeholder for a 128-bit by 64-bit multiply. The operands a and b are
+ * not read yet; the product is simply cleared to zero.
+ */
+static void do_mult(unsigned long long a[2], unsigned long long b,
+                   unsigned long long product[2])
+{
+       product[1] = 0;
+       product[0] = 0;
+}
+
+/*
+ * Placeholder for a 128-bit by 64-bit divide. Nothing is computed yet:
+ * none of the arguments are read or modified.
+ */
+static void do_div(unsigned long long a[2], unsigned long long b,
+                  unsigned long long c[2])
+{
+       (void) a;
+       (void) b;
+       (void) c;
+}
+
+/*
+ * Right shift the 128-bit value a by count bits for the case where the
+ * whole low word is shifted out: the high word, shifted by the bits beyond
+ * 64, becomes the low word and the high word is cleared.
+ */
+static void do_shift64(unsigned long long a[2], unsigned int count)
+{
+       const unsigned int beyond_64 = count - 64;
+       const unsigned long long high = a[1];
+
+       a[0] = high >> beyond_64;
+       a[1] = 0;
+}
+
+/*
+ * Right shift the 128-bit value a by count bits. Counts above 64 are
+ * handed to do_shift64(); anything else is shifted one bit at a time,
+ * moving the low bit of a[1] into the top bit of a[0] on each step.
+ */
+static void do_shift(unsigned long long a[2], unsigned int count)
+{
+       unsigned int step;
+
+       if (count > 64) {
+               do_shift64(a, count);
+               return;
+       }
+
+       for (step = 0; step < count; step++) {
+               a[0] = (a[0] >> 1) | (a[1] << 63);
+               a[1] >>= 1;
+       }
+}
+
+/*
+ * Refresh the CLOCK64_LOCK state for tick value t, inside the seqlock
+ * write section:
+ *   elapsed_nsec - nsec accounted for by the whole 2^max_cycles_shift tick
+ *                  periods contained in t
+ *   cycles_start - t with the bits covered by max_cycles_mask cleared
+ */
+static void update_clock(unsigned long long t)
+{
+       write_seqlock_begin(&clock_seqlock);
+       elapsed_nsec = (t >> max_cycles_shift) * nsecs_for_max_cycles;
+       cycles_start = t & ~max_cycles_mask;
+       write_seqlock_end(&clock_seqlock);
+}
+
+/*
+ * Convert the tick count t to nsec using the strategy selected by mode.
+ *
+ * Relies on the file-scope conversion state (clock_mult, clock_shift and
+ * the per-mode variables) having been set up beforehand.
+ *
+ * An unknown mode trips assert(0); if asserts are compiled out, 0 is
+ * returned instead of falling off the end of the function.
+ */
+static unsigned long long _get_nsec(int mode, unsigned long long t)
+{
+       switch (mode) {
+       case CLOCK64_MULT_SHIFT:
+               return (t * clock_mult) >> clock_shift;
+       case CLOCK64_EMULATE_128: {
+               unsigned long long product[2] = { 0, 0 };
+
+               do_mult(clock_mult64_128, t, product);
+               do_shift(product, clock_shift);
+               return product[0];
+               }
+       case CLOCK64_2STAGE: {
+               unsigned long long multiples, nsec;
+
+               /* Stage 1: whole periods of 2^max_cycles_shift ticks */
+               multiples = t >> max_cycles_shift;
+               dprintf("multiples=%llu\n", multiples);
+               nsec = multiples * nsecs_for_max_cycles;
+               /* Stage 2: the ticks left over within the current period */
+               nsec += ((t & max_cycles_mask) * clock_mult) >> clock_shift;
+               return nsec;
+               }
+       case CLOCK64_LOCK: {
+               unsigned int seq;
+               unsigned long long nsec;
+
+               /* Retry until elapsed_nsec/cycles_start were read consistently */
+               do {
+                       seq = read_seqlock_begin(&clock_seqlock);
+                       nsec = elapsed_nsec;
+                       nsec += ((t - cycles_start) * clock_mult) >> clock_shift;
+               } while (read_seqlock_retry(&clock_seqlock, seq));
+               return nsec;
+               }
+       case CLOCK128_MULT_SHIFT:
+               return (unsigned long long)((t * clock_mult128) >> clock_shift);
+       default:
+               assert(0);
+               /* Only reached when asserts are disabled */
+               return 0;
+       }
+}
+
+/*
+ * Conversion entry point used by the tests. CLOCK64_LOCK mode first has
+ * its seqlock-protected state refreshed for this tick value; every mode
+ * then goes through _get_nsec().
+ */
+static unsigned long long get_nsec(int mode, unsigned long long t)
+{
+       const int needs_refresh = (mode == CLOCK64_LOCK);
+
+       if (needs_refresh)
+               update_clock(t);
+
+       return _get_nsec(mode, t);
+}
+
+/*
+ * Compute the shift count and multiplier for the given conversion mode.
+ *
+ * mode            - one of the CLOCK* mode values
+ * mult            - where the multiplier is stored; written as an
+ *                   unsigned long long for the 64-bit modes and as a
+ *                   __uint128_t for CLOCK128_MULT_SHIFT
+ * shift           - where the shift count is stored
+ * max_sec         - duration used to derive max_ticks; the CLOCK64_2STAGE
+ *                   case does not use it and works from
+ *                   MAX_CLOCK_SEC_2STAGE instead
+ * cycles_per_usec - clock ticks per usec
+ *
+ * CLOCK64_2STAGE also sets max_cycles_shift, max_cycles_mask and
+ * nsecs_for_max_cycles; CLOCK64_LOCK does the same via CLOCK64_2STAGE and
+ * resets cycles_start. A mode not listed in the switch leaves everything
+ * untouched.
+ */
+static void calc_mult_shift(int mode, void *mult, unsigned int *shift,
+                           unsigned long long max_sec,
+                           unsigned long long cycles_per_usec)
+{
+       unsigned long long max_ticks;
+       max_ticks = max_sec * cycles_per_usec * 1000000ULL;
+
+       switch (mode) {
+       case CLOCK64_MULT_SHIFT: {
+               unsigned long long max_mult, tmp;
+               unsigned int sft = 0;
+
+               /*
+                * Calculate the largest multiplier that will not
+                * produce a 64-bit overflow in the multiplication
+                * step of the clock ticks to nsec conversion
+                */
+               max_mult = ULLONG_MAX / max_ticks;
+               dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=%llu\n", max_ticks, __builtin_clzll(max_ticks), max_mult);
+
+               /*
+                * Find the largest shift count that will produce
+                * a multiplier less than max_mult
+                */
+               tmp = max_mult * cycles_per_usec / 1000;
+               while (tmp > 1) {
+                       tmp >>= 1;
+                       sft++;
+                       dprintf("tmp=%llu, sft=%u\n", tmp, sft);
+               }
+
+               *shift = sft;
+               *((unsigned long long *)mult) = (unsigned long long) ((1ULL << sft) * 1000 / cycles_per_usec);
+               break;
+               }
+       case CLOCK64_EMULATE_128: {
+               unsigned long long max_mult[2], tmp[2] = { };
+               unsigned int sft = 0;
+
+               /*
+                * Calculate the largest multiplier that will not
+                * produce a 128-bit overflow in the multiplication
+                * step of the clock ticks to nsec conversion,
+                * but use only 64-bit integers in the process
+                */
+               max_mult[0] = max_mult[1] = ULLONG_MAX;
+               do_div(max_mult, max_ticks, max_mult);
+               dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
+                       max_ticks, __builtin_clzll(max_ticks), max_mult[1], max_mult[0]);
+
+               /*
+                * Find the largest shift count that will produce
+                * a multiplier less than max_mult
+                */
+               do_div(max_mult, cycles_per_usec, tmp);
+               do_div(tmp, 1000ULL, tmp);
+               while (tmp[0] > 1 || tmp[1] > 1) {
+                       do_shift(tmp, 1);
+                       sft++;
+                       dprintf("tmp=0x%016llx%016llx, sft=%u\n", tmp[1], tmp[0], sft);
+               }
+
+               *shift = sft;
+               /* The multiplier store is disabled, so *mult is not written in this mode */
+//             *((unsigned long long *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
+               break;
+               }
+       case CLOCK64_2STAGE: {
+               unsigned long long tmp;
+/*
+ * This clock tick to nsec conversion requires two stages.
+ *
+ * Stage 1: Determine how many ~MAX_CLOCK_SEC_2STAGE periods worth of clock ticks
+ *     have elapsed and set nsecs to the appropriate value for those
+ *     ~MAX_CLOCK_SEC_2STAGE periods.
+ * Stage 2: Subtract the ticks for the elapsed ~MAX_CLOCK_SEC_2STAGE periods from
+ *     Stage 1. Convert remaining clock ticks to nsecs and add to previously
+ *     set nsec value.
+ *
+ * To optimize the arithmetic operations, use the greatest power of 2 ticks
+ * less than the number of ticks in MAX_CLOCK_SEC_2STAGE seconds.
+ *
+ */
+               // Use a period shorter than MAX_CLOCK_SEC here for better accuracy
+               calc_mult_shift(CLOCK64_MULT_SHIFT, mult, shift, MAX_CLOCK_SEC_2STAGE, cycles_per_usec);
+
+               // Find the greatest power of 2 clock ticks that is less than the ticks in MAX_CLOCK_SEC_2STAGE
+               max_cycles_shift = max_cycles_mask = 0;
+               tmp = MAX_CLOCK_SEC_2STAGE * 1000000ULL * cycles_per_usec;
+               dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+               while (tmp > 1) {
+                       tmp >>= 1;
+                       max_cycles_shift++;
+                       dprintf("tmp=%llu, max_cycles_shift=%u\n", tmp, max_cycles_shift);
+               }
+               // if we use (1ULL << max_cycles_shift) * 1000 / cycles_per_usec here we will
+               // have a discontinuity every (1ULL << max_cycles_shift) cycles
+               nsecs_for_max_cycles = (1ULL << max_cycles_shift) * *((unsigned long long *)mult) >> *shift;
+
+               // Use a bitmask to calculate ticks % (1ULL << max_cycles_shift)
+               for (tmp = 0; tmp < max_cycles_shift; tmp++)
+                       max_cycles_mask |= 1ULL << tmp;
+
+               dprintf("max_cycles_shift=%u, 2^max_cycles_shift=%llu, nsecs_for_max_cycles=%llu, max_cycles_mask=%016llx\n",
+                               max_cycles_shift, (1ULL << max_cycles_shift),
+                               nsecs_for_max_cycles, max_cycles_mask);
+
+
+               break;
+               }
+       case CLOCK64_LOCK: {
+/*
+ * This clock tick to nsec conversion also requires two stages.
+ *
+ * Stage 1: Add to nsec the current running total of elapsed long periods
+ * Stage 2: Subtract from clock ticks the tick count corresponding to the
+ *     most recently elapsed long period. Convert the remaining ticks to
+ *     nsec and add to the previous nsec value.
+ *
+ * In practice the elapsed nsec from Stage 1 and the tick count subtracted
+ * in Stage 2 will be maintained in a separate thread.
+ *
+ */
+               calc_mult_shift(CLOCK64_2STAGE, mult, shift, MAX_CLOCK_SEC, cycles_per_usec);
+               cycles_start = 0;
+               break;
+               }
+       case CLOCK128_MULT_SHIFT: {
+               __uint128_t max_mult, tmp;
+               unsigned int sft = 0;
+
+               /*
+                * Calculate the largest multiplier that will not
+                * produce a 128-bit overflow in the multiplication
+                * step of the clock ticks to nsec conversion
+                */
+               max_mult = ((__uint128_t) ULLONG_MAX) << 64 | ULLONG_MAX;
+               max_mult /= max_ticks;
+               dprintf("max_ticks=%llu, __builtin_clzll=%d, max_mult=0x%016llx%016llx\n",
+                               max_ticks, __builtin_clzll(max_ticks),
+                               (unsigned long long) (max_mult >> 64),
+                               (unsigned long long) max_mult);
+
+               /*
+                * Find the largest shift count that will produce
+                * a multiplier less than max_mult
+                */
+               tmp = max_mult * cycles_per_usec / 1000;
+               while (tmp > 1) {
+                       tmp >>= 1;
+                       sft++;
+                       dprintf("tmp=0x%016llx%016llx, sft=%u\n",
+                                       (unsigned long long) (tmp >> 64),
+                                       (unsigned long long) tmp, sft);
+               }
+
+               *shift = sft;
+               *((__uint128_t *)mult) = (__uint128_t) (((__uint128_t)1 << sft) * 1000 / cycles_per_usec);
+               break;
+               }
+       }
+}
+
+/*
+ * Convert len consecutive tick values, starting at start, into the nsecs
+ * buffer and check that results delta_ticks apart differ by delta_nsec.
+ *
+ * Any deviation from delta_nsec counts as a mismatch; a deviation larger
+ * than 1 additionally counts as a bad mismatch. A summary line is printed
+ * and the mismatch count is returned (converted to int).
+ *
+ * The nsecs buffer must hold at least len entries.
+ */
+static int discontinuity(int mode, int delta_ticks, int delta_nsec,
+                        unsigned long long start, unsigned long len)
+{
+       int i;
+       unsigned long mismatches = 0, bad_mismatches = 0;
+       unsigned long long delta, max_mismatch = 0;
+       unsigned long long *ns = nsecs;
+
+       for (i = 0; i < len; ns++, i++) {
+               *ns = get_nsec(mode, start + i);
+               /* Only compare once a sample delta_ticks back exists */
+               if (i - delta_ticks >= 0) {
+                       /* Absolute difference between the two samples */
+                       if (*ns > *(ns - delta_ticks))
+                               delta = *ns - *(ns - delta_ticks);
+                       else
+                               delta = *(ns - delta_ticks) - *ns;
+                       /* Absolute deviation of that difference from delta_nsec */
+                       if (delta > delta_nsec)
+                               delta -= delta_nsec;
+                       else
+                               delta = delta_nsec - delta;
+                       if (delta) {
+                               mismatches++;
+                               if (delta > 1)
+                                       bad_mismatches++;
+                               if (delta > max_mismatch)
+                                       max_mismatch = delta;
+                       }
+               }
+               /* Consistency checks on the counters themselves */
+               if (!bad_mismatches)
+                       assert(max_mismatch == 0 || max_mismatch == 1);
+               if (!mismatches)
+                       assert(max_mismatch == 0);
+       }
+
+       printf("%lu discontinuities (%lu%%) (%lu errors > 1ns, max delta = %lluns) for ticks = %llu...%llu\n",
+               mismatches, (mismatches * 100) / len, bad_mismatches, max_mismatch, start,
+               start + len - 1);
+       return mismatches;
+}
+
+/* First tick value of the low-range discontinuity() scan in test_clock() */
+#define MIN_TICKS 1ULL
+/* Tick values per discontinuity() scan; also the entry count of the nsecs buffer */
+#define LEN 1000000000ULL
+/* Nanoseconds in one second */
+#define NSEC_ONE_SEC 1000000000ULL
+/* Number of entries in test_clock()'s test_ns/test_ticks tables */
+#define TESTLEN 9
+
+/*
+ * Exercise one conversion mode for one clock frequency.
+ *
+ * mode            - one of the CLOCK* mode values
+ * cycles_per_usec - clock ticks per usec to test with
+ * fast_test       - non-zero skips the two discontinuity() scans
+ * quiet           - non-zero suppresses the informational output; test
+ *                   samples whose result differs from the expectation
+ *                   are still printed
+ * delta_ticks,
+ * delta_nsec      - passed through to discontinuity()
+ *
+ * Sets up the multiplier and shift for the mode, converts max_ticks and a
+ * table of reference tick counts, and prints the results.
+ *
+ * Returns the absolute difference between the converted and the expected
+ * nsec value of the last table entry. An unknown mode trips assert(0); if
+ * asserts are compiled out, 0 is returned before any state is touched.
+ */
+static long long test_clock(int mode, int cycles_per_usec, int fast_test,
+                           int quiet, int delta_ticks, int delta_nsec)
+{
+       int i;
+       long long delta;
+       unsigned long long max_ticks;
+       unsigned long long nsecs;
+       void *mult;
+       unsigned long long test_ns[TESTLEN] =
+                       {NSEC_ONE_SEC, NSEC_ONE_SEC,
+                        NSEC_ONE_SEC, NSEC_ONE_SEC*60, NSEC_ONE_SEC*60*60,
+                        NSEC_ONE_SEC*60*60*2, NSEC_ONE_SEC*60*60*4,
+                        NSEC_ONE_SEC*60*60*8, NSEC_ONE_SEC*60*60*24};
+       unsigned long long test_ticks[TESTLEN];
+
+       max_ticks = MAX_CLOCK_SEC * (unsigned long long) cycles_per_usec * 1000000ULL;
+
+       /* Pick the multiplier storage that matches the mode */
+       switch(mode) {
+       case CLOCK64_MULT_SHIFT:
+               mult = &clock_mult;
+               break;
+       case CLOCK64_EMULATE_128:
+               mult = clock_mult64_128;
+               break;
+       case CLOCK64_2STAGE:
+               mult = &clock_mult;
+               break;
+       case CLOCK64_LOCK:
+               mult = &clock_mult;
+               break;
+       case CLOCK128_MULT_SHIFT:
+               mult = &clock_mult128;
+               break;
+       default:
+               assert(0);
+               /* Asserts disabled: do not continue with mult unset */
+               return 0;
+       }
+       calc_mult_shift(mode, mult, &clock_shift, MAX_CLOCK_SEC, cycles_per_usec);
+       nsecs = get_nsec(mode, max_ticks);
+       delta = nsecs/1000000 - MAX_CLOCK_SEC*1000;
+
+       if (mode == CLOCK64_2STAGE) {
+               /* Probe both sides of the first period boundary */
+               test_ns[0] = nsecs_for_max_cycles - 1;
+               test_ns[1] = nsecs_for_max_cycles;
+               test_ticks[0] = (1ULL << max_cycles_shift) - 1;
+               test_ticks[1] = (1ULL << max_cycles_shift);
+
+               for (i = 2; i < TESTLEN; i++)
+                       test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
+       }
+       else {
+               for (i = 0; i < TESTLEN; i++)
+                       test_ticks[i] = test_ns[i] / 1000 * cycles_per_usec;
+       }
+
+       if (!quiet) {
+               printf("cycles_per_usec=%d, delta_ticks=%d, delta_nsec=%d, max_ticks=%llu, shift=%u, 2^shift=%llu\n",
+                       cycles_per_usec, delta_ticks, delta_nsec, max_ticks, clock_shift, (1ULL << clock_shift));
+               switch(mode) {
+                       case CLOCK64_LOCK:
+                       case CLOCK64_2STAGE:
+                       case CLOCK64_MULT_SHIFT: {
+                               printf("clock_mult=%llu, clock_mult / 2^clock_shift=%f\n",
+                                       clock_mult, (double) clock_mult / (1ULL << clock_shift));
+                               break;
+                       }
+                       case CLOCK64_EMULATE_128: {
+                               printf("clock_mult=0x%016llx%016llx\n",
+                                       clock_mult64_128[1], clock_mult64_128[0]);
+                               break;
+                       }
+                       case CLOCK128_MULT_SHIFT: {
+                               printf("clock_mult=0x%016llx%016llx\n",
+                                       (unsigned long long) (clock_mult128 >> 64),
+                                       (unsigned long long) clock_mult128);
+                               break;
+                       }
+               }
+               printf("get_nsec(max_ticks) = %lluns, should be %lluns, error<=abs(%lld)ms\n",
+                       nsecs, MAX_CLOCK_SEC*1000000000ULL, delta);
+       }
+
+       for (i = 0; i < TESTLEN; i++)
+       {
+               nsecs = get_nsec(mode, test_ticks[i]);
+               delta = nsecs > test_ns[i] ? nsecs - test_ns[i] : test_ns[i] - nsecs;
+               /* delta is a long long, so it is printed with %lld */
+               if (!quiet || delta > 0)
+                       printf("get_nsec(%llu)=%llu, expected %llu, delta=%lld\n",
+                               test_ticks[i], nsecs, test_ns[i], delta);
+       }
+
+       if (!fast_test) {
+               discontinuity(mode, delta_ticks, delta_nsec, max_ticks - LEN + 1, LEN);
+               discontinuity(mode, delta_ticks, delta_nsec, MIN_TICKS, LEN);
+       }
+
+       if (!quiet)
+               printf("\n\n");
+
+       return delta;
+}
+
+/*
+ * Run test_clock() in CLOCK64_LOCK mode for a set of clock frequencies,
+ * each with a matching delta_ticks/delta_nsec pair.
+ *
+ * NOTE(review): the malloc() result is not checked. Every call below passes
+ * fast_test=1, so discontinuity() - the only user of the nsecs buffer - is
+ * never reached from here.
+ */
+int main(int argc, char *argv[])
+{
+       /* Room for LEN results, as required by discontinuity() */
+       nsecs = malloc(LEN * sizeof(unsigned long long));
+
+       /* Arguments: mode, cycles_per_usec, fast_test, quiet, delta_ticks, delta_nsec */
+       test_clock(CLOCK64_LOCK, 3333, 1, 0, 0, 0);
+       test_clock(CLOCK64_LOCK, 1000, 1, 0, 1, 1);
+       test_clock(CLOCK64_LOCK, 1100, 1, 0, 11, 10);
+       test_clock(CLOCK64_LOCK, 3000, 1, 0, 3, 1);
+       test_clock(CLOCK64_LOCK, 3333, 1, 0, 3333, 1000);
+       test_clock(CLOCK64_LOCK, 3392, 1, 0, 424, 125);
+       test_clock(CLOCK64_LOCK, 4500, 1, 0, 9, 2);
+       test_clock(CLOCK64_LOCK, 5000, 1, 0, 5, 1);
+
+       free(nsecs);
+       return 0;
+}
diff --git a/t/verify-state.c b/t/verify-state.c

index 9a2c3df68cb1db568844ad263a600b3a43285167..78a56dafd26878e3a5fec7bced3382fe20385c43 100644 (file)
--- a/t/verify-state.c
+++ b/t/verify-state.c
@@ -58,7 +58,8 @@ static void show(struct thread_io_list *s, size_t size)
                 show_s(s, no_s);
                 no_s++;
                 size -= __thread_io_list_sz(s->depth, s->nofiles);
-               s = (void *) s + __thread_io_list_sz(s->depth, s->nofiles);
+               s = (struct thread_io_list *)((char *) s +
+                       __thread_io_list_sz(s->depth, s->nofiles));
         } while (size != 0);
  }
  
diff --git a/td_error.c b/td_error.c

index 903f9ea5163e95854043b942889d10f6185aa198..9d58a31471d03f3f7db70fb6660432750f5c882c 100644 (file)
--- a/td_error.c
+++ b/td_error.c
@@ -20,8 +20,7 @@ int td_non_fatal_error(struct thread_data *td, enum error_type_bit etype,
  
         if (!td->o.ignore_error[etype]) {
                 td->o.ignore_error[etype] = __NON_FATAL_ERR;
-               td->o.ignore_error_nr[etype] = sizeof(__NON_FATAL_ERR)
-                       / sizeof(int);
+               td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR);
         }
  
         if (!(td->o.continue_on_error & (1 << etype)))
diff --git a/td_error.h b/td_error.h

index 113398987553e6f75903877c380d5e3b02300a03..1b38a5349ffc727019364f3490ff37171a03880e 100644 (file)
--- a/td_error.h
+++ b/td_error.h
@@ -2,7 +2,8 @@
  #define FIO_TD_ERROR_H
  
  /*
- * What type of errors to continue on when continue_on_error is used
+ * What type of errors to continue on when continue_on_error is used,
+ * and what type of errors to ignore when ignore_error is used.
   */
  enum error_type_bit {
         ERROR_TYPE_READ_BIT = 0,
diff --git a/thread_options.h b/thread_options.h

index d0f3fe932a4d35e39c0e57482445c507e9dc50d1..1813cdc706a89a5a493386e7f7d0345abf3fae3a 100644 (file)
--- a/thread_options.h
+++ b/thread_options.h
@@ -53,6 +53,7 @@ struct thread_options {
         char *filename_format;
         char *opendir;
         char *ioengine;
+       char *ioengine_so_path;
         char *mmapfile;
         enum td_ddir td_ddir;
         unsigned int rw_seq;
@@ -65,6 +66,7 @@ struct thread_options {
         unsigned int iodepth_batch;
         unsigned int iodepth_batch_complete_min;
         unsigned int iodepth_batch_complete_max;
+       unsigned int serialize_overlap;
  
         unsigned int unique_filename;
  
@@ -102,6 +104,7 @@ struct thread_options {
         unsigned int end_fsync;
         unsigned int pre_read;
         unsigned int sync_io;
+       unsigned int write_hint;
         unsigned int verify;
         unsigned int do_verify;
         unsigned int verifysort;
@@ -200,6 +203,7 @@ struct thread_options {
         unsigned int numa_mem_prefer_node;
         char *numa_memnodes;
         unsigned int gpu_dev_id;
+       unsigned int start_offset_percent;
  
         unsigned int iolog;
         unsigned int rwmixcycle;
@@ -236,6 +240,7 @@ struct thread_options {
         unsigned int trim_zero;
         unsigned long long trim_backlog;
         unsigned int clat_percentiles;
+       unsigned int lat_percentiles;
         unsigned int percentile_precision;      /* digits after decimal for percentiles */
         fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
  
@@ -304,7 +309,6 @@ struct thread_options {
         fio_fp64_t latency_percentile;
  
         unsigned block_error_hist;
-       unsigned int skip_bad;
  
         unsigned int replay_align;
         unsigned int replay_scale;
@@ -339,6 +343,8 @@ struct thread_options_pack {
         uint32_t iodepth_batch;
         uint32_t iodepth_batch_complete_min;
         uint32_t iodepth_batch_complete_max;
+       uint32_t serialize_overlap;
+       uint32_t lat_percentiles;
  
         uint64_t size;
         uint64_t io_size;
@@ -375,6 +381,7 @@ struct thread_options_pack {
         uint32_t end_fsync;
         uint32_t pre_read;
         uint32_t sync_io;
+       uint32_t write_hint;
         uint32_t verify;
         uint32_t do_verify;
         uint32_t verifysort;
@@ -416,6 +423,7 @@ struct thread_options_pack {
  
         uint32_t random_distribution;
         uint32_t exitall_error;
+       uint32_t pad;
  
         struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
         uint32_t zone_split_nr[DDIR_RWDIR_CNT];
@@ -469,7 +477,7 @@ struct thread_options_pack {
         uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
  #endif
         uint32_t gpu_dev_id;
-       uint32_t pad;
+       uint32_t start_offset_percent;
         uint32_t cpus_allowed_policy;
         uint32_t iolog;
         uint32_t rwmixcycle;
@@ -575,7 +583,6 @@ struct thread_options_pack {
         fio_fp64_t latency_percentile;
  
         uint32_t block_error_hist;
-       uint32_t skip_bad;
  
         uint32_t replay_align;
         uint32_t replay_scale;
diff --git a/time.c b/time.c

index 279ee48492302ebd5a1080e5422cc48ecd6b7a59..c8876829a367ede755453f6b255b2f4139ee8a74 100644 (file)
--- a/time.c
+++ b/time.c
@@ -3,23 +3,23 @@
  
  #include "fio.h"
  
-static struct timeval genesis;
+static struct timespec genesis;
  static unsigned long ns_granularity;
  
-void timeval_add_msec(struct timeval *tv, unsigned int msec)
+void timespec_add_msec(struct timespec *ts, unsigned int msec)
  {
-       unsigned long adj_usec = 1000 * msec;
+       uint64_t adj_nsec = 1000000ULL * msec;
  
-       tv->tv_usec += adj_usec;
-       if (adj_usec >= 1000000) {
-               unsigned long adj_sec = adj_usec / 1000000;
+       ts->tv_nsec += adj_nsec;
+       if (adj_nsec >= 1000000000) {
+               uint64_t adj_sec = adj_nsec / 1000000000;
  
-               tv->tv_usec -=  adj_sec * 1000000;
-               tv->tv_sec += adj_sec;
+               ts->tv_nsec -= adj_sec * 1000000000;
+               ts->tv_sec += adj_sec;
         }
-       if (tv->tv_usec >= 1000000){
-               tv->tv_usec -= 1000000;
-               tv->tv_sec++;
+       if (ts->tv_nsec >= 1000000000){
+               ts->tv_nsec -= 1000000000;
+               ts->tv_sec++;
         }
  }
  
@@ -28,7 +28,7 @@ void timeval_add_msec(struct timeval *tv, unsigned int msec)
   */
  uint64_t usec_spin(unsigned int usec)
  {
-       struct timeval start;
+       struct timespec start;
         uint64_t t;
  
         fio_gettime(&start, NULL);
@@ -41,7 +41,7 @@ uint64_t usec_spin(unsigned int usec)
  uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
  {
         struct timespec req;
-       struct timeval tv;
+       struct timespec tv;
         uint64_t t = 0;
  
         do {
@@ -97,31 +97,37 @@ bool in_ramp_time(struct thread_data *td)
         return td->o.ramp_time && !td->ramp_time_over;
  }
  
-static void parent_update_ramp(struct thread_data *td)
+static bool parent_update_ramp(struct thread_data *td)
  {
         struct thread_data *parent = td->parent;
  
         if (!parent || parent->ramp_time_over)
-               return;
+               return false;
  
         reset_all_stats(parent);
-       parent->ramp_time_over = 1;
+       parent->ramp_time_over = true;
         td_set_runstate(parent, TD_RAMP);
+       return true;
  }
  
  bool ramp_time_over(struct thread_data *td)
  {
-       struct timeval tv;
-
         if (!td->o.ramp_time || td->ramp_time_over)
                 return true;
  
-       fio_gettime(&tv, NULL);
-       if (utime_since(&td->epoch, &tv) >= td->o.ramp_time) {
-               td->ramp_time_over = 1;
+       if (utime_since_now(&td->epoch) >= td->o.ramp_time) {
+               td->ramp_time_over = true;
                 reset_all_stats(td);
                 td_set_runstate(td, TD_RAMP);
-               parent_update_ramp(td);
+
+               /*
+                * If we have a parent, the parent isn't doing IO. Hence
+                * the parent never enters do_io(), which will switch us
+                * from RAMP -> RUNNING. Do this manually here.
+                */
+               if (parent_update_ramp(td))
+                       td_set_runstate(td, TD_RUNNING);
+
                 return true;
         }
  
@@ -138,8 +144,7 @@ void fio_time_init(void)
          * Check the granularity of the nanosleep function
          */
         for (i = 0; i < 10; i++) {
-               struct timeval tv;
-               struct timespec ts;
+               struct timespec tv, ts;
                 unsigned long elapsed;
  
                 fio_gettime(&tv, NULL);
@@ -170,7 +175,7 @@ void set_epoch_time(struct thread_data *td, int log_unix_epoch)
         }
  }
  
-void fill_start_time(struct timeval *t)
+void fill_start_time(struct timespec *t)
  {
         memcpy(t, &genesis, sizeof(genesis));
  }
diff --git a/tools/fio_jsonplus_clat2csv b/tools/fio_jsonplus_clat2csv

new file mode 100755 (executable)

index 0000000..d4ac16e
--- /dev/null
+++ b/tools/fio_jsonplus_clat2csv
@@ -0,0 +1,164 @@
+#!/usr/bin/python
+#
+# fio_jsonplus_clat2csv
+#
+# This script converts fio's json+ completion latency data to CSV format.
+#
+# For example:
+#
+# Run the following fio jobs:
+# ../fio --output=fio-jsonplus.output --output-format=json+ --name=test1
+#      --ioengine=null --time_based --runtime=5s --size=1G --rw=randrw
+#      --name=test2 --ioengine=null --time_based --runtime=3s --size=1G
+#      --rw=read --name=test3 --ioengine=null --time_based --runtime=4s
+#      --size=8G --rw=write
+#
+# Then run:
+# fio_jsonplus_clat2csv fio-jsonplus.output fio-latency.csv
+#
+# You will end up with the following 3 files
+#
+# -rw-r--r-- 1 root root  6467 Jun 27 14:57 fio-latency_job0.csv
+# -rw-r--r-- 1 root root  3985 Jun 27 14:57 fio-latency_job1.csv
+# -rw-r--r-- 1 root root  4490 Jun 27 14:57 fio-latency_job2.csv
+#
+# fio-latency_job0.csv will look something like:
+#
+# clat_nsec, read_count, read_cumulative, read_percentile, write_count,
+#      write_cumulative, write_percentile, trim_count, trim_cumulative,
+#      trim_percentile,
+# 25, 1, 1, 1.50870705013e-07, , , , , , ,
+# 26, 12, 13, 1.96131916517e-06, 947, 947, 0.000142955890032, , , ,
+# 27, 843677, 843690, 0.127288105112, 838347, 839294, 0.126696959629, , , ,
+# 28, 1877982, 2721672, 0.410620573454, 1870189, 2709483, 0.409014312345, , , ,
+# 29, 4471, 2726143, 0.411295116376, 7718, 2717201, 0.410179395301, , , ,
+# 30, 2142885, 4869028, 0.734593687087, 2138164, 4855365, 0.732949340025, , , ,
+# ...
+# 2544, , , , 2, 6624404, 0.999997433738, , , ,
+# 2576, 3, 6628178, 0.99999788781, 4, 6624408, 0.999998037564, , , ,
+# 2608, 4, 6628182, 0.999998491293, 4, 6624412, 0.999998641391, , , ,
+# 2640, 3, 6628185, 0.999998943905, 2, 6624414, 0.999998943304, , , ,
+# 2672, 1, 6628186, 0.999999094776, 3, 6624417, 0.999999396174, , , ,
+# 2736, 1, 6628187, 0.999999245646, 1, 6624418, 0.99999954713, , , ,
+# 2768, 2, 6628189, 0.999999547388, 1, 6624419, 0.999999698087, , , ,
+# 2800, , , , 1, 6624420, 0.999999849043, , , ,
+# 2832, 1, 6628190, 0.999999698259, , , , , , ,
+# 4192, 1, 6628191, 0.999999849129, , , , , , ,
+# 5792, , , , 1, 6624421, 1.0, , , ,
+# 10304, 1, 6628192, 1.0, , , , , , ,
+#
+# The first line says that you had one read IO with 25ns clat,
+# the cumulative number of read IOs at or below 25ns is 1, and
+# 25ns is the 0.00001509th percentile for read latency
+#
+# The job had 2 write IOs complete in 2544ns,
+# 6624404 write IOs completed in 2544ns or less,
+# and this represents the 99.99974th percentile for write latency
+#
+# The last line says that one read IO had 10304ns clat,
+# 6628192 read IOs had 10304ns or shorter clat, and
+# 10304ns is the 100th percentile for read latency
+#
+
+import os
+import json
+import argparse
+
+
+def parse_args():
+    """Parse the command line and return the resulting argparse namespace.
+
+    Two positional arguments are defined:
+      source -- path of the fio json+ output file to read
+      dest   -- output file name stub for the generated CSV files
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source',
+                        help='fio json+ output file containing completion '
+                             'latency data')
+    parser.add_argument('dest',
+                        help='destination file stub for latency data in CSV '
+                             'format. job number will be appended to filename')
+    args = parser.parse_args()
+
+    return args
+
+
+def percentile(idx, run_total):
+    """Return run_total[idx] divided by the last element of run_total.
+
+    run_total is a list of running (cumulative) counts; its last element is
+    taken as the grand total. Despite the name, the result is a fraction
+    (1.0 for the final entry), not a value scaled to 100. Returns 0 when
+    the grand total is 0.
+
+    run_total must not be empty: its last element is indexed
+    unconditionally.
+    """
+    total = run_total[len(run_total)-1]
+    if total == 0:
+        return 0
+
+    # float() forces true division even when both operands are integers.
+    return float(run_total[idx]) / total
+
+
+def more_lines(indices, bins):
+    """Return True if any key still has entries left to consume.
+
+    indices maps each key to a position in bins[key]. The result is True as
+    soon as one position is still inside its list, and False once every
+    position has reached the end of its list.
+    """
+    # NOTE(review): dict.iteritems() exists only on Python 2.
+    for key, value in indices.iteritems():
+        if value < len(bins[key]):
+            return True
+
+    return False
+
+
+def main():
+    """Write one CSV file of completion-latency bins per job.
+
+    Loads the JSON document named by the 'source' argument. For every entry
+    in its 'jobs' list, the 'clat_ns' -> 'bins' mappings of the 'read',
+    'write' and 'trim' sections are merged into a single table sorted by
+    latency. The table is written to '<stub>_job<N><ext>', where stub and
+    ext come from splitting the 'dest' argument and N is the job's index.
+    Each row holds a latency followed by count, cumulative count and
+    percentile() result for each direction; the three fields are left
+    empty for a direction that has no bin at that latency.
+    """
+    args = parse_args()
+
+    with open(args.source, 'r') as source:
+        jsondata = json.loads(source.read())
+
+    for jobnum in range(0, len(jsondata['jobs'])):
+        bins = {}
+        run_total = {}
+        ddir_set = set(['read', 'write', 'trim'])
+
+        # NOTE(review): prev_ddir is assigned but never read in this function.
+        prev_ddir = None
+        for ddir in ddir_set:
+            # Build [latency, count] pairs; int(key) makes the later sort
+            # numeric rather than lexical.
+            # NOTE(review): dict.iteritems() exists only on Python 2.
+            bins[ddir] = [[int(key), value] for key, value in
+                          jsondata['jobs'][jobnum][ddir]['clat_ns']
+                          ['bins'].iteritems()]
+            bins[ddir] = sorted(bins[ddir], key=lambda bin: bin[0])
+
+            # run_total[ddir][x] is the sum of the counts of bins 0..x.
+            run_total[ddir] = [0 for x in range(0, len(bins[ddir]))]
+            if len(bins[ddir]) > 0:
+                run_total[ddir][0] = bins[ddir][0][1]
+                for x in range(1, len(bins[ddir])):
+                    run_total[ddir][x] = run_total[ddir][x-1] + \
+                        bins[ddir][x][1]
+
+        # The job number goes between the file name stub and its extension.
+        stub, ext = os.path.splitext(args.dest)
+        outfile = stub + '_job' + str(jobnum) + ext
+
+        with open(outfile, 'w') as output:
+            # Header row. Column order follows the iteration order of
+            # ddir_set, which is a set and therefore has no fixed order.
+            output.write("clat_nsec, ")
+            ddir_list = list(ddir_set)
+            for ddir in ddir_list:
+                output.write("{0}_count, {0}_cumulative, {0}_percentile, ".
+                             format(ddir))
+            output.write("\n")
+
+#
+# Have a counter for each ddir
+# In each round, pick the shortest remaining duration
+# and output a line with any values for that duration
+#
+            indices = {x: 0 for x in ddir_list}
+            while more_lines(indices, bins):
+                # Starting value for the minimum search below.
+                # NOTE(review): presumably chosen to be larger than any bin
+                # latency; the origin of this constant is not shown here.
+                min_lat = 17112760320
+                for ddir in ddir_list:
+                    if indices[ddir] < len(bins[ddir]):
+                        min_lat = min(bins[ddir][indices[ddir]][0], min_lat)
+
+                output.write("{0}, ".format(min_lat))
+
+                # Emit the fields of every direction whose next bin is at
+                # min_lat and advance that direction's counter; all other
+                # directions get empty fields on this row.
+                for ddir in ddir_list:
+                    if indices[ddir] < len(bins[ddir]) and \
+                       min_lat == bins[ddir][indices[ddir]][0]:
+                        count = bins[ddir][indices[ddir]][1]
+                        cumulative = run_total[ddir][indices[ddir]]
+                        ptile = percentile(indices[ddir], run_total[ddir])
+                        output.write("{0}, {1}, {2}, ".format(count,
+                                     cumulative, ptile))
+                        indices[ddir] += 1
+                    else:
+                        output.write(", , , ")
+                output.write("\n")
+
+            # NOTE(review): print statement syntax is valid on Python 2 only.
+            print "{0} generated".format(outfile)
+
+
+# Run the conversion only when this file is executed as a script, not when
+# it is imported as a module.
+if __name__ == '__main__':
+    main()
diff --git a/tools/hist/fiologparser_hist.py b/tools/hist/fiologparser_hist.py

index ead5e543eff1a835195bf67422fe428a0ffff013..ad97a54d3ab001912e5b197dc68004d9151ac79e 100755 (executable)
--- a/tools/hist/fiologparser_hist.py
+++ b/tools/hist/fiologparser_hist.py
@@ -373,7 +373,7 @@ if __name__ == '__main__':
          help='print warning messages to stderr')
  
      arg('--group_nr',
-        default=19,
+        default=29,
          type=int,
          help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')
  
diff --git a/tools/plot/fio2gnuplot.1 b/tools/plot/fio2gnuplot.1

index 1a33167b4d0671d15d93a58a83b8ee12934e3648..6fb1283f50049b0e84a4c4bb451c1a3c46f0471a 100644 (file)
--- a/tools/plot/fio2gnuplot.1
+++ b/tools/plot/fio2gnuplot.1
@@ -1,5 +1,5 @@
  .\" Text automatically generated by txt2man
-.TH fio2gnuplot  "07 août 2013" "" ""
+.TH fio2gnuplot 1 "August 2013"
  .SH NAME
  \fBfio2gnuplot \fP- Render fio's output files with gnuplot
  .SH SYNOPSIS
diff --git a/verify-state.h b/verify-state.h

index e46265e48d7a6c4ddbbac34a458e26d3272d60da..1586f63fb0f0a61f7eea2a6db90876b24f895960 100644 (file)
--- a/verify-state.h
+++ b/verify-state.h
@@ -77,7 +77,7 @@ static inline size_t thread_io_list_sz(struct thread_io_list *s)
  
  static inline struct thread_io_list *io_list_next(struct thread_io_list *s)
  {
-       return (void *) s + thread_io_list_sz(s);
+       return (struct thread_io_list *)((char *) s + thread_io_list_sz(s));
  }
  
  static inline void verify_state_gen_name(char *out, size_t size,
diff --git a/verify.c b/verify.c

index cadfe9c32825d99f818ff879cbe67273c41cd15d..1f177d756d24a6e6059f46bb5337870a4dc65214 100644 (file)
--- a/verify.c
+++ b/verify.c
@@ -271,6 +271,7 @@ static void dump_buf(char *buf, unsigned int len, unsigned long long offset,
         fd = open(fname, O_CREAT | O_TRUNC | O_WRONLY, 0644);
         if (fd < 0) {
                 perror("open verify buf file");
+               free(ptr);
                 return;
         }
  
@@ -387,7 +388,7 @@ static int verify_io_u_pattern(struct verify_header *hdr, struct vcont *vc)
         (void)paste_format_inplace(pattern, pattern_size,
                                    td->o.verify_fmt, td->o.verify_fmt_sz, io_u);
  
-       buf = (void *) hdr + header_size;
+       buf = (char *) hdr + header_size;
         len = get_hdr_inc(td, io_u) - header_size;
         mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size;
  
@@ -1166,7 +1167,7 @@ static void __fill_hdr(struct thread_data *td, struct io_u *io_u,
         hdr->rand_seed = rand_seed;
         hdr->offset = io_u->offset + header_num * td->o.verify_interval;
         hdr->time_sec = io_u->start_time.tv_sec;
-       hdr->time_usec = io_u->start_time.tv_usec;
+       hdr->time_usec = io_u->start_time.tv_nsec / 1000;
         hdr->thread = td->thread_number;
         hdr->numberio = io_u->numberio;
         hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32));
@@ -1187,9 +1188,10 @@ static void populate_hdr(struct thread_data *td, struct io_u *io_u,
                          unsigned int header_len)
  {
         unsigned int data_len;
-       void *data, *p;
+       void *data;
+       char *p;
  
-       p = (void *) hdr;
+       p = (char *) hdr;
  
         fill_hdr(td, io_u, hdr, header_num, header_len, io_u->rand_seed);
author	Jens Axboe <axboe@kernel.dk>
	Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)
committer	Jens Axboe <axboe@kernel.dk>
	Tue, 3 Oct 2017 17:19:26 +0000 (11:19 -0600)
.travis.yml		patch \| blob \| blame \| history
FIO-VERSION-GEN		patch \| blob \| blame \| history
HOWTO		patch \| blob \| blame \| history
Makefile		patch \| blob \| blame \| history
README		patch \| blob \| blame \| history
appveyor.yml		patch \| blob \| blame \| history
arch/arch-arm.h		patch \| blob \| blame \| history
arch/arch-ia64.h		patch \| blob \| blame \| history
arch/arch-ppc.h		patch \| blob \| blame \| history
arch/arch-s390.h		patch \| blob \| blame \| history
arch/arch-x86-common.h		patch \| blob \| blame \| history
arch/arch.h		patch \| blob \| blame \| history
backend.c		patch \| blob \| blame \| history
blktrace.c		patch \| blob \| blame \| history
blktrace.h	[new file with mode: 0644]	patch \| blob
blktrace_api.h		patch \| blob \| blame \| history
cconv.c		patch \| blob \| blame \| history
client.c		patch \| blob \| blame \| history
client.h		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
crc/crc32c-arm64.c		patch \| blob \| blame \| history
crc/crc32c-intel.c		patch \| blob \| blame \| history
crc/crc32c.h		patch \| blob \| blame \| history
crc/test.c		patch \| blob \| blame \| history
diskutil.c		patch \| blob \| blame \| history
diskutil.h		patch \| blob \| blame \| history
doc/fio_examples.rst		patch \| blob \| blame \| history
engines/binject.c		patch \| blob \| blame \| history
engines/glusterfs.c		patch \| blob \| blame \| history
engines/glusterfs_async.c		patch \| blob \| blame \| history
engines/guasi.c		patch \| blob \| blame \| history
engines/libaio.c		patch \| blob \| blame \| history
engines/mtd.c		patch \| blob \| blame \| history
engines/rbd.c		patch \| blob \| blame \| history
engines/rdma.c		patch \| blob \| blame \| history
engines/sg.c		patch \| blob \| blame \| history
engines/skeleton_external.c		patch \| blob \| blame \| history
engines/splice.c		patch \| blob \| blame \| history
engines/sync.c		patch \| blob \| blame \| history
engines/windowsaio.c		patch \| blob \| blame \| history
eta.c		patch \| blob \| blame \| history
examples/butterfly.fio	[new file with mode: 0644]	patch \| blob
examples/mtd.fio		patch \| blob \| blame \| history
exp/README.md	[deleted file]	patch \| blob \| blame \| history
file.h		patch \| blob \| blame \| history
filesetup.c		patch \| blob \| blame \| history
fio.1		patch \| blob \| blame \| history
fio.h		patch \| blob \| blame \| history
fio_time.h		patch \| blob \| blame \| history
flist.h		patch \| blob \| blame \| history
gclient.c		patch \| blob \| blame \| history
gettime-thread.c		patch \| blob \| blame \| history
gettime.c		patch \| blob \| blame \| history
gettime.h		patch \| blob \| blame \| history
gfio.c		patch \| blob \| blame \| history
helper_thread.c		patch \| blob \| blame \| history
idletime.c		patch \| blob \| blame \| history
idletime.h		patch \| blob \| blame \| history
init.c		patch \| blob \| blame \| history
io_u.c		patch \| blob \| blame \| history
io_u.h		patch \| blob \| blame \| history
ioengines.c		patch \| blob \| blame \| history
ioengines.h		patch \| blob \| blame \| history
iolog.c		patch \| blob \| blame \| history
iolog.h		patch \| blob \| blame \| history
lib/axmap.c		patch \| blob \| blame \| history
lib/ffz.h		patch \| blob \| blame \| history
lib/memalign.c		patch \| blob \| blame \| history
lib/output_buffer.c		patch \| blob \| blame \| history
lib/output_buffer.h		patch \| blob \| blame \| history
lib/pattern.c		patch \| blob \| blame \| history
lib/seqlock.h		patch \| blob \| blame \| history
libfio.c		patch \| blob \| blame \| history
log.c		patch \| blob \| blame \| history
log.h		patch \| blob \| blame \| history
memory.c		patch \| blob \| blame \| history
mutex.c		patch \| blob \| blame \| history
options.c		patch \| blob \| blame \| history
os/os-android.h		patch \| blob \| blame \| history
os/os-dragonfly.h		patch \| blob \| blame \| history
os/os-freebsd.h		patch \| blob \| blame \| history
os/os-linux.h		patch \| blob \| blame \| history
os/os-mac.h		patch \| blob \| blame \| history
os/os-netbsd.h		patch \| blob \| blame \| history
os/os-openbsd.h		patch \| blob \| blame \| history
os/os-solaris.h		patch \| blob \| blame \| history
os/os-windows.h		patch \| blob \| blame \| history
os/os.h		patch \| blob \| blame \| history
os/windows/install.wxs		patch \| blob \| blame \| history
os/windows/posix.c		patch \| blob \| blame \| history
oslib/libmtd.c		patch \| blob \| blame \| history
oslib/libmtd_common.h		patch \| blob \| blame \| history
oslib/linux-dev-lookup.c		patch \| blob \| blame \| history
oslib/strndup.c	[new file with mode: 0644]	patch \| blob
oslib/strndup.h	[new file with mode: 0644]	patch \| blob
parse.c		patch \| blob \| blame \| history
parse.h		patch \| blob \| blame \| history
printing.c		patch \| blob \| blame \| history
profiles/act.c		patch \| blob \| blame \| history
profiles/tiobench.c		patch \| blob \| blame \| history
server.c		patch \| blob \| blame \| history
server.h		patch \| blob \| blame \| history
smalloc.c		patch \| blob \| blame \| history
stat.c		patch \| blob \| blame \| history
stat.h		patch \| blob \| blame \| history
steadystate.c		patch \| blob \| blame \| history
steadystate.h		patch \| blob \| blame \| history
t/arch.c		patch \| blob \| blame \| history
t/axmap.c		patch \| blob \| blame \| history
t/debug.c		patch \| blob \| blame \| history
t/dedupe.c		patch \| blob \| blame \| history
t/gen-rand.c		patch \| blob \| blame \| history
t/lfsr-test.c		patch \| blob \| blame \| history
t/time-test.c	[new file with mode: 0644]	patch \| blob
t/verify-state.c		patch \| blob \| blame \| history
td_error.c		patch \| blob \| blame \| history
td_error.h		patch \| blob \| blame \| history
thread_options.h		patch \| blob \| blame \| history
time.c		patch \| blob \| blame \| history
tools/fio_jsonplus_clat2csv	[new file with mode: 0755]	patch \| blob
tools/hist/fiologparser_hist.py		patch \| blob \| blame \| history
tools/plot/fio2gnuplot.1		patch \| blob \| blame \| history
verify-state.h		patch \| blob \| blame \| history
verify.c		patch \| blob \| blame \| history