+++ /dev/null
-clone_depth: 1 # NB: this stops FIO-VERSION-GEN making tag based versions
-
-image:
- - Visual Studio 2019
-
-environment:
- CYG_MIRROR: http://cygwin.mirror.constant.com
- matrix:
- - ARCHITECTURE: x64
- CC: clang
- CONFIGURE_OPTIONS: --enable-pdb
- DISTRO: msys2
-# Skip 32 bit clang build
-# - ARCHITECTURE: x86
-# CC: clang
-# CONFIGURE_OPTIONS: --enable-pdb
-# DISTRO: msys2
- - ARCHITECTURE: x64
- CONFIGURE_OPTIONS:
- DISTRO: cygwin
- - ARCHITECTURE: x86
- CONFIGURE_OPTIONS: --build-32bit-win --target-win-ver=xp
- DISTRO: cygwin
-
-install:
- - if %DISTRO%==cygwin (
- SET "PATH=C:\cygwin64\bin;C:\cygwin64;%PATH%"
- )
- - if %DISTRO%==msys2 if %ARCHITECTURE%==x86 (
- SET "PATH=C:\msys64\mingw32\bin;C:\msys64\usr\bin;%PATH%"
- )
- - if %DISTRO%==msys2 if %ARCHITECTURE%==x64 (
- SET "PATH=C:\msys64\mingw64\bin;C:\msys64\usr\bin;%PATH%"
- )
- - SET PATH=C:\Python38-x64;%PATH% # NB: Changed env variables persist to later sections
- - SET PYTHONUNBUFFERED=TRUE
- - bash.exe ci\appveyor-install.sh
-
-build_script:
- - bash.exe configure --extra-cflags=-Werror --disable-native %CONFIGURE_OPTIONS%
- - make.exe -j2
-
-after_build:
- - file.exe fio.exe
- - make.exe test
- - 'cd os\windows && dobuild.cmd %ARCHITECTURE% && cd ..'
- - ps: Get-ChildItem .\os\windows\*.msi | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name -DeploymentName fio.msi }
-
-test_script:
- - python.exe t/run-fio-tests.py --artifact-root test-artifacts --debug
-
-on_finish:
- - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && [ -d test-artifacts ] && 7z a -t7z test-artifacts.7z test-artifacts -xr!foo.0.0 -xr!latency.?.0 -xr!fio_jsonplus_clat2csv.test && appveyor PushArtifact test-artifacts.7z'
--- /dev/null
+**Please acknowledge you have done the following before creating a ticket**
+
+- [ ] I have read the GitHub issues section of [REPORTING-BUGS](../blob/master/REPORTING-BUGS).
+
+<!-- replace me with bug report / enhancement request -->
--- /dev/null
+---
+name: Report a bug
+about: For bugs that are reproducible with the latest fio releases
+
+---
+
+**Please acknowledge the following before creating a ticket**
+
+- [ ] I have read the GitHub issues section of [REPORTING-BUGS](../blob/master/REPORTING-BUGS).
+
+**Description of the bug:**
+<!--replaceme-->
+
+**Environment**: <!-- Name and version of operating system -->
+
+**fio version**: <!--replaceme-->
+
+**Reproduction steps**
+<!-- Please minimise the job file/command line options down to only those
+necessary to reproduce the issue (https://stackoverflow.com/help/mcve ) -->
--- /dev/null
+blank_issues_enabled: true
+
+contact_links:
+- name: General questions (e.g. "How do I...", "Why is...") that are related to fio
+ url: http://vger.kernel.org/vger-lists.html#fio
+ about: Please send questions to the fio mailing list (plain-text emails ONLY)
--- /dev/null
+---
+name: Feature enhancement request
+about: Suggest a new fio feature
+labels: enhancement
+
+---
+
+**Description of the new feature**
+<!-- Please be aware regular fio developers are busy with non-fio work. Because
+of this, most requests are only completed if someone from outside the project
+contributes the code. -->
--- /dev/null
+Please confirm that your commit message(s) follow these guidelines:
+
+1. First line is a commit title, a descriptive one-liner for the change
+2. Empty second line
+3. Commit message body that explains why the change is useful. Break lines that
+ aren't something like a URL at 72-74 chars.
+4. Empty line
+5. Signed-off-by: Real Name <real@email.com>
+
+Reminders:
+
+1. If you modify struct thread_options, also make corresponding changes in
+ cconv.c and bump FIO_SERVER_VER in server.h
+2. If you change the ioengine interface (hooks, flags, etc), remember to bump
+ FIO_IOOPS_VERSION in ioengines.h.
--- /dev/null
+# Getting support for fio
+
+## General questions
+
+Please use the fio mailing list for asking general fio questions (e.g. "How do
+I do X?", "Why does Y happen?"). See the Mailing list section of the
+[README][readme] for details.
+
+## Reporting bugs
+
+As mentioned in [REPORTING-BUGS][reportingbugs], fio bugs and enhancements can
+be reported to the fio mailing list or fio's GitHub issues tracker.
+
+When reporting bugs please include ALL of the following:
+- Description of the issue
+- fio version number tested. If your fio isn't among the recent releases (see
+ the [fio releases page][releases]) please build a new one from source (see
+ the Source and Building sections of the [README][readme] for how to do this)
+ and reproduce the issue with the fresh build before filing an issue.
+- Reproduction steps and minimal job file/command line parameters.
+
+When requesting an enhancement only the description is needed.
+
+### GitHub issues specific information
+
+[Formatting terminal output with markdown][quotingcode] will help people who
+are reading your report. However, if the output is large (e.g. over 15 lines
+long) please consider including it as a text attachment. Avoid attaching
+screenshots or photos of output as these are not searchable/selectable.
+
+<!-- Definitions -->
+
+[readme]: ../README
+[reportingbugs]: ../REPORTING-BUGS
+[releases]: ../../../releases
+[quotingcode]: https://docs.github.com/en/free-pro-team@latest/github/writing-on-github/basic-writing-and-formatting-syntax#quoting-code
--- /dev/null
+name: CI
+
+on:
+ push:
+ pull_request:
+
+jobs:
+ build:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ build:
+ - linux-gcc
+ - linux-clang
+ - macos
+ - linux-i686-gcc
+ - android
+ - windows-cygwin-64
+ - windows-cygwin-32
+ - windows-msys2-64
+ include:
+ - build: linux-gcc
+ os: ubuntu-22.04
+ cc: gcc
+ - build: linux-clang
+ os: ubuntu-22.04
+ cc: clang
+ - build: macos
+ os: macos-13
+ - build: linux-i686-gcc
+ os: ubuntu-22.04
+ arch: i686
+ - build: android
+ os: ubuntu-22.04
+ arch: aarch64-linux-android32
+ - build: android-recovery
+ os: ubuntu-22.04
+ arch: aarch64-linux-android32
+ - build: windows-cygwin-64
+ os: windows-latest
+ arch: x86_64
+ installer_arch: x64
+ shell: bash
+ - build: windows-cygwin-32
+ os: windows-latest
+ arch: i686
+ installer_arch: x86
+ shell: bash
+ - build: windows-msys2-64
+ os: windows-latest
+ cc: clang
+ arch: x86_64
+ installer_arch: x64
+ shell: msys2
+
+ env:
+ CI_TARGET_BUILD: ${{ matrix.build }}
+ CI_TARGET_ARCH: ${{ matrix.arch }}
+ CC: ${{ matrix.cc }}
+
+ steps:
+ - name: git config line endings (Windows)
+ if: ${{ contains( matrix.build, 'windows' ) }}
+ run: git config --global core.autocrlf input
+ - name: Checkout repo
+ uses: actions/checkout@v4
+ - name: Install Cygwin toolchain (Windows)
+ if: ${{ startsWith(matrix.build, 'windows-cygwin') }}
+ uses: cygwin/cygwin-install-action@master
+ with:
+ packages: >
+ mingw64-${{matrix.arch}}-binutils
+ mingw64-${{matrix.arch}}-CUnit
+ mingw64-${{matrix.arch}}-curl
+ mingw64-${{matrix.arch}}-dlfcn
+ mingw64-${{matrix.arch}}-gcc-core
+ mingw64-${{matrix.arch}}-headers
+ mingw64-${{matrix.arch}}-runtime
+ mingw64-${{matrix.arch}}-zlib
+
+ - name: Install msys2 toolchain (Windows)
+ if: ${{ startsWith(matrix.build, 'windows-msys2') }}
+ uses: msys2/setup-msys2@v2
+ with:
+ install: >
+ git
+ base-devel
+ mingw-w64-${{matrix.arch}}-clang
+ mingw-w64-${{matrix.arch}}-cunit
+ mingw-w64-${{matrix.arch}}-toolchain
+ mingw-w64-${{matrix.arch}}-lld
+ mingw-w64-${{matrix.arch}}-python-scipy
+ mingw-w64-${{matrix.arch}}-python-six
+ mingw-w64-${{matrix.arch}}-python-statsmodels
+ mingw-w64-${{matrix.arch}}-python-sphinx
+
+ - name: Install dependencies
+ run: ${{matrix.shell}} ./ci/actions-install.sh
+ if: ${{ !contains( matrix.build, 'msys2' ) }}
+ - name: Build
+ run: ${{matrix.shell}} ./ci/actions-build.sh
+ - name: Build installer (Windows)
+ if: ${{ contains( matrix.build, 'windows' ) }}
+ shell: cmd
+ run: |
+ cd os\windows
+ dobuild.cmd ${{ matrix.installer_arch }}
+ cd ..\..
+
+ - name: Upload installer as artifact (Windows)
+ if: ${{ contains( matrix.build, 'windows' ) }}
+ uses: actions/upload-artifact@v4
+ with:
+ name: ${{ matrix.build }}-installer
+ path: os\windows\*.msi
+ - name: Upload installer as release for tagged builds (Windows)
+ uses: softprops/action-gh-release@v1
+ if: ${{ startsWith(github.ref, 'refs/tags/') && startsWith(matrix.build, 'windows-cygwin') }}
+ with:
+ files: os/windows/*.msi
+ - name: Remove dependency files to resolve Makefile Cygwin sed issue (Windows)
+ if: ${{ startsWith(matrix.build, 'windows-cygwin') }}
+ run: rm *.d */*.d */*/*.d
+ shell: bash
+ - name: Smoke test
+ run: ${{matrix.shell}} ./ci/actions-smoke-test.sh
+ - name: Full test
+ run: ${{matrix.shell}} ./ci/actions-full-test.sh
--- /dev/null
+name: CIFuzz
+on: [pull_request]
+jobs:
+ Fuzzing:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Build Fuzzers
+ id: build
+ uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+ with:
+ oss-fuzz-project-name: 'fio'
+ dry-run: false
+ - name: Run Fuzzers
+ uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+ with:
+ oss-fuzz-project-name: 'fio'
+ fuzz-seconds: 600
+ dry-run: false
+ - name: Upload Crash
+ uses: actions/upload-artifact@v1
+ if: failure() && steps.build.outcome == 'success'
+ with:
+ name: artifacts
+ path: ./out/artifacts
/tags
/TAGS
/t/zbd/test-zbd-support.log.*
+/t/fuzz/fuzz_parseini
+tsc-rate
+++ /dev/null
-language: c
-dist: bionic
-os:
- - linux
-compiler:
- - clang
- - gcc
-arch:
- - amd64
- - arm64
-env:
- global:
- - MAKEFLAGS="-j 2"
-matrix:
- include:
- - os: linux
- compiler: gcc
- arch: amd64
- env: BUILD_ARCH="x86" # Only do the gcc x86 build to reduce clutter
- # Default xcode image
- - os: osx
- compiler: clang # Workaround travis setting CC=["clang", "gcc"]
- arch: amd64
- # Latest xcode image (needs periodic updating)
- - os: osx
- compiler: clang
- osx_image: xcode11.2
- arch: amd64
- exclude:
- - os: osx
- compiler: gcc
-
-install:
- - ci/travis-install.sh
-
-script:
- - ci/travis-build.sh
--- /dev/null
+cff-version: 1.2.0
+preferred-citation:
+ type: software
+ authors:
+ - family-names: "Axboe"
+ given-names: "Jens"
+ email: axboe@kernel.dk
+ title: "Flexible I/O Tester"
+ year: 2022
+ url: "https://github.com/axboe/fio"
+license: GPL-2.0-only
--- /dev/null
+- Shifted dedup-able data.
+ Allow for dedup buffer generation to shift contents by a random number
+ of sectors (filling the gaps with incompressible data). Some storage
+ subsystems have modernized their deduplication detection algorithms to
+ look for shifted data as well. For example, some databases push a
+ timestamp on the prefix of written blocks, which makes the underlying
+ data dedup-able at a different alignment. FIO should be able to simulate
+ such a workload.
+
+- Generation of similar data (but not exact).
+ A rising trend in enterprise storage systems.
+ Generation of "similar" data means random incompressible buffers
+ that differ by a few (a configurable number of) bits from each other.
+ The storage subsystem usually identifies the similar buffers using
+ locality-sensitive hashing or other methods.
+
#!/bin/sh
GVF=FIO-VERSION-FILE
-DEF_VER=fio-3.23
+DEF_VER=fio-3.37
LF='
'
+++ /dev/null
-How fio works
--------------
-
-The first step in getting fio to simulate a desired I/O workload is writing a
-job file describing that specific setup. A job file may contain any number of
-threads and/or files -- the typical contents of the job file is a *global*
-section defining shared parameters, and one or more job sections describing the
-jobs involved. When run, fio parses this file and sets everything up as
-described. If we break down a job from top to bottom, it contains the following
-basic parameters:
-
-`I/O type`_
-
- Defines the I/O pattern issued to the file(s). We may only be reading
- sequentially from this file(s), or we may be writing randomly. Or even
- mixing reads and writes, sequentially or randomly.
- Should we be doing buffered I/O, or direct/raw I/O?
-
-`Block size`_
-
- In how large chunks are we issuing I/O? This may be a single value,
- or it may describe a range of block sizes.
-
-`I/O size`_
-
- How much data are we going to be reading/writing?
-
-`I/O engine`_
-
- How do we issue I/O? We could be memory mapping the file, we could be
- using regular read/write, we could be using splice, async I/O, or even
- SG (SCSI generic sg).
-
-`I/O depth`_
-
- If the I/O engine is async, how large a queuing depth do we want to
- maintain?
-
-
-`Target file/device`_
-
- How many files are we spreading the workload over?
-
-`Threads, processes and job synchronization`_
-
- How many threads or processes should we spread this workload over?
-
-The above are the basic parameters defined for a workload, in addition there's a
-multitude of parameters that modify other aspects of how this job behaves.
-
-
-Command line options
---------------------
-
-.. option:: --debug=type
-
- Enable verbose tracing `type` of various fio actions. May be ``all`` for all types
- or individual types separated by a comma (e.g. ``--debug=file,mem`` will
- enable file and memory debugging). Currently, additional logging is
- available for:
-
- *process*
- Dump info related to processes.
- *file*
- Dump info related to file actions.
- *io*
- Dump info related to I/O queuing.
- *mem*
- Dump info related to memory allocations.
- *blktrace*
- Dump info related to blktrace setup.
- *verify*
- Dump info related to I/O verification.
- *all*
- Enable all debug options.
- *random*
- Dump info related to random offset generation.
- *parse*
- Dump info related to option matching and parsing.
- *diskutil*
- Dump info related to disk utilization updates.
- *job:x*
- Dump info only related to job number x.
- *mutex*
- Dump info only related to mutex up/down ops.
- *profile*
- Dump info related to profile extensions.
- *time*
- Dump info related to internal time keeping.
- *net*
- Dump info related to networking connections.
- *rate*
- Dump info related to I/O rate switching.
- *compress*
- Dump info related to log compress/decompress.
- *steadystate*
- Dump info related to steadystate detection.
- *helperthread*
- Dump info related to the helper thread.
- *zbd*
- Dump info related to support for zoned block devices.
- *?* or *help*
- Show available debug options.
-
-.. option:: --parse-only
-
- Parse options only, don't start any I/O.
-
-.. option:: --merge-blktrace-only
-
- Merge blktraces only, don't start any I/O.
-
-.. option:: --output=filename
-
- Write output to file `filename`.
-
-.. option:: --output-format=format
-
- Set the reporting `format` to `normal`, `terse`, `json`, or `json+`. Multiple
- formats can be selected, separated by a comma. `terse` is a CSV based
- format. `json+` is like `json`, except it adds a full dump of the latency
- buckets.
-
-.. option:: --bandwidth-log
-
- Generate aggregate bandwidth logs.
-
-.. option:: --minimal
-
- Print statistics in a terse, semicolon-delimited format.
-
-.. option:: --append-terse
-
- Print statistics in selected mode AND terse, semicolon-delimited format.
- **Deprecated**, use :option:`--output-format` instead to select multiple
- formats.
-
-.. option:: --terse-version=version
-
- Set terse `version` output format (default 3, or 2 or 4 or 5).
-
-.. option:: --version
-
- Print version information and exit.
-
-.. option:: --help
-
- Print a summary of the command line options and exit.
-
-.. option:: --cpuclock-test
-
- Perform test and validation of internal CPU clock.
-
-.. option:: --crctest=[test]
-
- Test the speed of the built-in checksumming functions. If no argument is
- given, all of them are tested. Alternatively, a comma separated list can
- be passed, in which case the given ones are tested.
-
-.. option:: --cmdhelp=command
-
- Print help information for `command`. May be ``all`` for all commands.
-
-.. option:: --enghelp=[ioengine[,command]]
-
- List all commands defined by `ioengine`, or print help for `command`
- defined by `ioengine`. If no `ioengine` is given, list all
- available ioengines.
-
-.. option:: --showcmd=jobfile
-
- Convert `jobfile` to a set of command-line options.
-
-.. option:: --readonly
-
- Turn on safety read-only checks, preventing writes and trims. The
- ``--readonly`` option is an extra safety guard to prevent users from
- accidentally starting a write or trim workload when that is not desired.
- Fio will only modify the device under test if
- `rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite` is given. This
- safety net can be used as an extra precaution.
-
-.. option:: --eta=when
-
- Specifies when real-time ETA estimate should be printed. `when` may be
- `always`, `never` or `auto`. `auto` is the default, it prints ETA
- when requested if the output is a TTY. `always` disregards the output
- type, and prints ETA when requested. `never` never prints ETA.
-
-.. option:: --eta-interval=time
-
- By default, fio requests client ETA status roughly every second. With
- this option, the interval is configurable. Fio imposes a minimum
- allowed time to avoid flooding the console, less than 250 msec is
- not supported.
-
-.. option:: --eta-newline=time
-
- Force a new line for every `time` period passed. When the unit is omitted,
- the value is interpreted in seconds.
-
-.. option:: --status-interval=time
-
- Force a full status dump of cumulative (from job start) values at `time`
- intervals. This option does *not* provide per-period measurements. So
- values such as bandwidth are running averages. When the time unit is omitted,
- `time` is interpreted in seconds. Note that using this option with
- ``--output-format=json`` will yield output that technically isn't valid
- json, since the output will be collated sets of valid json. It will need
- to be split into valid sets of json after the run.
-
-.. option:: --section=name
-
- Only run specified section `name` in job file. Multiple sections can be specified.
- The ``--section`` option allows one to combine related jobs into one file.
- E.g. one job file could define light, moderate, and heavy sections. Tell
- fio to run only the "heavy" section by giving ``--section=heavy``
- command line option. One can also specify the "write" operations in one
- section and "verify" operation in another section. The ``--section`` option
- only applies to job sections. The reserved *global* section is always
- parsed and used.
-
-.. option:: --alloc-size=kb
-
- Allocate additional internal smalloc pools of size `kb` in KiB. The
- ``--alloc-size`` option increases shared memory set aside for use by fio.
- If running large jobs with randommap enabled, fio can run out of memory.
- Smalloc is an internal allocator for shared structures from a fixed size
- memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
-
- NOTE: While running :file:`.fio_smalloc.*` backing store files are visible
- in :file:`/tmp`.
-
-.. option:: --warnings-fatal
-
- All fio parser warnings are fatal, causing fio to exit with an
- error.
-
-.. option:: --max-jobs=nr
-
- Set the maximum number of threads/processes to support to `nr`.
- NOTE: On Linux, it may be necessary to increase the shared-memory
- limit (:file:`/proc/sys/kernel/shmmax`) if fio runs into errors while
- creating jobs.
-
-.. option:: --server=args
-
- Start a backend server, with `args` specifying what to listen to.
- See `Client/Server`_ section.
-
-.. option:: --daemonize=pidfile
-
- Background a fio server, writing the pid to the given `pidfile` file.
-
-.. option:: --client=hostname
-
- Instead of running the jobs locally, send and run them on the given `hostname`
- or set of `hostname`\s. See `Client/Server`_ section.
-
-.. option:: --remote-config=file
-
- Tell fio server to load this local `file`.
-
-.. option:: --idle-prof=option
-
- Report CPU idleness. `option` is one of the following:
-
- **calibrate**
- Run unit work calibration only and exit.
-
- **system**
- Show aggregate system idleness and unit work.
-
- **percpu**
- As **system** but also show per CPU idleness.
-
-.. option:: --inflate-log=log
-
- Inflate and output compressed `log`.
-
-.. option:: --trigger-file=file
-
- Execute trigger command when `file` exists.
-
-.. option:: --trigger-timeout=time
-
- Execute trigger at this `time`.
-
-.. option:: --trigger=command
-
- Set this `command` as local trigger.
-
-.. option:: --trigger-remote=command
-
- Set this `command` as remote trigger.
-
-.. option:: --aux-path=path
-
- Use the directory specified by `path` for generated state files instead
- of the current working directory.
-
-Any parameters following the options will be assumed to be job files, unless
-they match a job file parameter. Multiple job files can be listed and each job
-file will be regarded as a separate group. Fio will :option:`stonewall`
-execution between each group.
-
-
-Job file format
----------------
-
-As previously described, fio accepts one or more job files describing what it is
-supposed to do. The job file format is the classic ini file, where the names
-enclosed in [] brackets define the job name. You are free to use any ASCII name
-you want, except *global* which has special meaning. Following the job name is
-a sequence of zero or more parameters, one per line, that define the behavior of
-the job. If the first character in a line is a ';' or a '#', the entire line is
-discarded as a comment.
-
-A *global* section sets defaults for the jobs described in that file. A job may
-override a *global* section parameter, and a job file may even have several
-*global* sections if so desired. A job is only affected by a *global* section
-residing above it.
-
-The :option:`--cmdhelp` option also lists all options. If used with a `command`
-argument, :option:`--cmdhelp` will detail the given `command`.
-
-See the `examples/` directory for inspiration on how to write job files. Note
-the copyright and license requirements currently apply to `examples/` files.
-
-So let's look at a really simple job file that defines two processes, each
-randomly reading from a 128MiB file:
-
-.. code-block:: ini
-
- ; -- start job file --
- [global]
- rw=randread
- size=128m
-
- [job1]
-
- [job2]
-
- ; -- end job file --
-
-As you can see, the job file sections themselves are empty as all the described
-parameters are shared. As no :option:`filename` option is given, fio makes up a
-`filename` for each of the jobs as it sees fit. On the command line, this job
-would look as follows::
-
-$ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
-
-
-Let's look at an example that has a number of processes writing randomly to
-files:
-
-.. code-block:: ini
-
- ; -- start job file --
- [random-writers]
- ioengine=libaio
- iodepth=4
- rw=randwrite
- bs=32k
- direct=0
- size=64m
- numjobs=4
- ; -- end job file --
-
-Here we have no *global* section, as we only have one job defined anyway. We
-want to use async I/O here, with a depth of 4 for each file. We also increased
-the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical
-jobs. The result is 4 processes each randomly writing to their own 64MiB
-file. Instead of using the above job file, you could have given the parameters
-on the command line. For this case, you would specify::
-
-$ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
-
-When fio is utilized as a basis of any reasonably large test suite, it might be
-desirable to share a set of standardized settings across multiple job files.
-Instead of copy/pasting such settings, any section may pull in an external
-:file:`filename.fio` file with *include filename* directive, as in the following
-example::
-
- ; -- start job file including.fio --
- [global]
- filename=/tmp/test
- filesize=1m
- include glob-include.fio
-
- [test]
- rw=randread
- bs=4k
- time_based=1
- runtime=10
- include test-include.fio
- ; -- end job file including.fio --
-
-.. code-block:: ini
-
- ; -- start job file glob-include.fio --
- thread=1
- group_reporting=1
- ; -- end job file glob-include.fio --
-
-.. code-block:: ini
-
- ; -- start job file test-include.fio --
- ioengine=libaio
- iodepth=4
- ; -- end job file test-include.fio --
-
-Settings pulled into a section apply to that section only (except *global*
-section). Include directives may be nested in that any included file may contain
-further include directive(s). Include files may not contain [] sections.
-
-
-Environment variables
-~~~~~~~~~~~~~~~~~~~~~
-
-Fio also supports environment variable expansion in job files. Any sub-string of
-the form ``${VARNAME}`` as part of an option value (in other words, on the right
-of the '='), will be expanded to the value of the environment variable called
-`VARNAME`. If no such environment variable is defined, or `VARNAME` is the
-empty string, the empty string will be substituted.
-
-As an example, let's look at a sample fio invocation and job file::
-
-$ SIZE=64m NUMJOBS=4 fio jobfile.fio
-
-.. code-block:: ini
-
- ; -- start job file --
- [random-writers]
- rw=randwrite
- size=${SIZE}
- numjobs=${NUMJOBS}
- ; -- end job file --
-
-This will expand to the following equivalent job file at runtime:
-
-.. code-block:: ini
-
- ; -- start job file --
- [random-writers]
- rw=randwrite
- size=64m
- numjobs=4
- ; -- end job file --
-
-Fio ships with a few example job files, you can also look there for inspiration.
-
-Reserved keywords
-~~~~~~~~~~~~~~~~~
-
-Additionally, fio has a set of reserved keywords that will be replaced
-internally with the appropriate value. Those keywords are:
-
-**$pagesize**
-
- The architecture page size of the running system.
-
-**$mb_memory**
-
- Megabytes of total memory in the system.
-
-**$ncpus**
-
- Number of online available CPUs.
-
-These can be used on the command line or in the job file, and will be
-automatically substituted with the current system values when the job is
-run. Simple math is also supported on these keywords, so you can perform actions
-like::
-
- size=8*$mb_memory
-
-and get that properly expanded to 8 times the size of memory in the machine.
-
-
-Job file parameters
--------------------
-
-This section describes in detail each parameter associated with a job. Some
-parameters take an option of a given type, such as an integer or a
-string. Anywhere a numeric value is required, an arithmetic expression may be
-used, provided it is surrounded by parentheses. Supported operators are:
-
- - addition (+)
- - subtraction (-)
- - multiplication (*)
- - division (/)
- - modulus (%)
- - exponentiation (^)
-
-For time values in expressions, units are microseconds by default. This is
-different than for time values not in expressions (not enclosed in
-parentheses). The following types are used:
-
-
-Parameter types
-~~~~~~~~~~~~~~~
-
-**str**
- String: A sequence of alphanumeric characters.
-
-**time**
- Integer with possible time suffix. Without a unit value is interpreted as
- seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for
- hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and
- 'us' (or 'usec') for microseconds. For example, use 10m for 10 minutes.
-
-.. _int:
-
-**int**
- Integer. A whole number value, which may contain an integer prefix
- and an integer suffix:
-
- [*integer prefix*] **number** [*integer suffix*]
-
- The optional *integer prefix* specifies the number's base. The default
- is decimal. *0x* specifies hexadecimal.
-
- The optional *integer suffix* specifies the number's units, and includes an
- optional unit prefix and an optional unit. For quantities of data, the
- default unit is bytes. For quantities of time, the default unit is seconds
- unless otherwise specified.
-
- With :option:`kb_base`\=1000, fio follows international standards for unit
- prefixes. To specify power-of-10 decimal values defined in the
- International System of Units (SI):
-
- * *K* -- means kilo (K) or 1000
- * *M* -- means mega (M) or 1000**2
- * *G* -- means giga (G) or 1000**3
- * *T* -- means tera (T) or 1000**4
- * *P* -- means peta (P) or 1000**5
-
- To specify power-of-2 binary values defined in IEC 80000-13:
-
- * *Ki* -- means kibi (Ki) or 1024
- * *Mi* -- means mebi (Mi) or 1024**2
- * *Gi* -- means gibi (Gi) or 1024**3
- * *Ti* -- means tebi (Ti) or 1024**4
- * *Pi* -- means pebi (Pi) or 1024**5
-
- With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
- from those specified in the SI and IEC 80000-13 standards to provide
- compatibility with old scripts. For example, 4k means 4096.
-
- For quantities of data, an optional unit of 'B' may be included
- (e.g., 'kB' is the same as 'k').
-
- The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
- not milli). 'b' and 'B' both mean byte, not bit.
-
- Examples with :option:`kb_base`\=1000:
-
- * *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
- * *1 MiB*: 1048576, 1mi, 1024ki
- * *1 MB*: 1000000, 1m, 1000k
- * *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
- * *1 TB*: 1000000000000, 1t, 1000g, 1000000m
-
- Examples with :option:`kb_base`\=1024 (default):
-
- * *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
- * *1 MiB*: 1048576, 1m, 1024k
- * *1 MB*: 1000000, 1mi, 1000ki
- * *1 TiB*: 1099511627776, 1t, 1024g, 1048576m
- * *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi
-
- To specify times (units are not case sensitive):
-
- * *D* -- means days
- * *H* -- means hours
- * *M* -- means minutes
- * *s* -- or sec means seconds (default)
- * *ms* -- or *msec* means milliseconds
- * *us* -- or *usec* means microseconds
-
- If the option accepts an upper and lower range, use a colon ':' or
- minus '-' to separate such values. See :ref:`irange <irange>`.
- If the lower value specified happens to be larger than the upper value
- the two values are swapped.
-
-.. _bool:
-
-**bool**
- Boolean. Usually parsed as an integer, however only defined for
- true and false (1 and 0).
-
-.. _irange:
-
-**irange**
- Integer range with suffix. Allows value range to be given, such as
- 1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
- option allows two sets of ranges, they can be specified with a ',' or '/'
- delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`.
-
-**float_list**
- A list of floating point numbers, separated by a ':' character.
-
-With the above in mind, here follows the complete list of fio job parameters.
-
-
-Units
-~~~~~
-
-.. option:: kb_base=int
-
- Select the interpretation of unit prefixes in input parameters.
-
- **1000**
- Inputs comply with IEC 80000-13 and the International
- System of Units (SI). Use:
-
- - power-of-2 values with IEC prefixes (e.g., KiB)
- - power-of-10 values with SI prefixes (e.g., kB)
-
- **1024**
- Compatibility mode (default). To avoid breaking old scripts:
-
- - power-of-2 values with SI prefixes
- - power-of-10 values with IEC prefixes
-
- See :option:`bs` for more details on input parameters.
-
- Outputs always use correct prefixes. Most outputs include both
- side-by-side, like::
-
- bw=2383.3kB/s (2327.4KiB/s)
-
- If only one value is reported, then kb_base selects the one to use:
-
- **1000** -- SI prefixes
-
- **1024** -- IEC prefixes
-
-.. option:: unit_base=int
-
- Base unit for reporting. Allowed values are:
-
- **0**
- Use auto-detection (default).
- **8**
- Byte based.
- **1**
- Bit based.
-
-
-Job description
-~~~~~~~~~~~~~~~
-
-.. option:: name=str
-
- ASCII name of the job. This may be used to override the name printed by fio
- for this job. Otherwise the job name is used. On the command line this
- parameter has the special purpose of also signaling the start of a new job.
-
-.. option:: description=str
-
- Text description of the job. Doesn't do anything except dump this text
- description when this job is run. It's not parsed.
-
-.. option:: loops=int
-
- Run the specified number of iterations of this job. Used to repeat the same
- workload a given number of times. Defaults to 1.
-
-.. option:: numjobs=int
-
- Create the specified number of clones of this job. Each clone of the job
- is spawned as an independent thread or process. May be used to setup a
- larger number of threads/processes doing the same thing. Each thread is
- reported separately; to see statistics for all clones as a whole, use
- :option:`group_reporting` in conjunction with :option:`new_group`.
- See :option:`--max-jobs`. Default: 1.
-
-
-Time related parameters
-~~~~~~~~~~~~~~~~~~~~~~~
-
-.. option:: runtime=time
-
- Tell fio to terminate processing after the specified period of time. It
- can be quite hard to determine for how long a specified job will run, so
- this parameter is handy to cap the total runtime to a given time. When
- the unit is omitted, the value is interpreted in seconds.
-
-.. option:: time_based
-
- If set, fio will run for the duration of the :option:`runtime` specified
- even if the file(s) are completely read or written. It will simply loop over
- the same workload as many times as the :option:`runtime` allows.
-
-.. option:: startdelay=irange(time)
-
- Delay the start of job for the specified amount of time. Can be a single
- value or a range. When given as a range, each thread will choose a value
- randomly from within the range. Value is in seconds if a unit is omitted.
-
-.. option:: ramp_time=time
-
- If set, fio will run the specified workload for this amount of time before
- logging any performance numbers. Useful for letting performance settle
- before logging results, thus minimizing the runtime required for stable
- results. Note that the ``ramp_time`` is considered lead in time for a job,
- thus it will increase the total runtime if a special timeout or
- :option:`runtime` is specified. When the unit is omitted, the value is
- given in seconds.
-
-.. option:: clocksource=str
-
- Use the given clocksource as the base of timing. The supported options are:
-
- **gettimeofday**
- :manpage:`gettimeofday(2)`
-
- **clock_gettime**
- :manpage:`clock_gettime(2)`
-
- **cpu**
- Internal CPU clock source
-
- cpu is the preferred clocksource if it is reliable, as it is very fast (and
- fio is heavy on time calls). Fio will automatically use this clocksource if
- it's supported and considered reliable on the system it is running on,
- unless another clocksource is specifically set. For x86/x86-64 CPUs, this
- means supporting TSC Invariant.
-
-.. option:: gtod_reduce=bool
-
- Enable all of the :manpage:`gettimeofday(2)` reducing options
- (:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus
- reduce precision of the timeout somewhat to really shrink the
- :manpage:`gettimeofday(2)` call count. With this option enabled, we only do
- about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all
- time keeping was enabled.
-
-.. option:: gtod_cpu=int
-
- Sometimes it's cheaper to dedicate a single thread of execution to just
- getting the current time. Fio (and databases, for instance) are very
- intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set
- one CPU aside for doing nothing but logging current time to a shared memory
- location. Then the other threads/processes that run I/O workloads need only
- copy that segment, instead of entering the kernel with a
- :manpage:`gettimeofday(2)` call. The CPU set aside for doing these time
- calls will be excluded from other uses. Fio will manually clear it from the
- CPU mask of other jobs.
-
-
-Target file/device
-~~~~~~~~~~~~~~~~~~
-
-.. option:: directory=str
-
- Prefix filenames with this directory. Used to place files in a different
- location than :file:`./`. You can specify a number of directories by
- separating the names with a ':' character. These directories will be
- assigned equally distributed to job clones created by :option:`numjobs` as
- long as they are using generated filenames. If specific `filename(s)` are
- set, fio will use the first listed directory, thereby matching the
- `filename` semantic (which generates a file for each clone if not
- specified, but lets all clones use the same file if set).
-
- See the :option:`filename` option for information on how to escape "``:``"
- characters within the directory path itself.
-
- Note: To control the directory fio will use for internal state files
- use :option:`--aux-path`.
-
-.. option:: filename=str
-
- Fio normally makes up a `filename` based on the job name, thread number, and
- file number (see :option:`filename_format`). If you want to share files
- between threads in a job or several
- jobs with fixed file paths, specify a `filename` for each of them to override
- the default. If the ioengine is file based, you can specify a number of files
- by separating the names with a ':' colon. So if you wanted a job to open
- :file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
- ``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
- specified, :option:`nrfiles` is ignored. The size of regular files specified
- by this option will be :option:`size` divided by number of files unless an
- explicit size is specified by :option:`filesize`.
-
- Each colon in the wanted path must be escaped with a ``\``
- character. For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you
- would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is
- :file:`F:\\filename` then you would use ``filename=F\:\filename``.
-
- On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
- the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
- Note: Windows and FreeBSD prevent write access to areas
- of the disk containing in-use data (e.g. filesystems).
-
- The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. Which
- of the two depends on the read/write direction set.
-
-.. option:: filename_format=str
-
- If sharing multiple files between jobs, it is usually necessary to have fio
- generate the exact names that you want. By default, fio will name a file
- based on the default file format specification of
- :file:`jobname.jobnumber.filenumber`. With this option, that can be
- customized. Fio will recognize and replace the following keywords in this
- string:
-
- **$jobname**
- The name of the worker thread or process.
- **$jobnum**
- The incremental number of the worker thread or process.
- **$filenum**
- The incremental number of the file for that worker thread or
- process.
-
- To have dependent jobs share a set of files, this option can be set to have
- fio generate filenames that are shared between the two. For instance, if
- :file:`testfiles.$filenum` is specified, file number 4 for any job will be
- named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum`
- will be used if no other format specifier is given.
-
- If you specify a path then the directories will be created up to the
- main directory for the file. So for example if you specify
- ``filename_format=a/b/c/$jobnum`` then the directories a/b/c will be
- created before the file setup part of the job. If you specify
- :option:`directory` then the path will be relative to that directory,
- otherwise it is treated as the absolute path.
-
-.. option:: unique_filename=bool
-
- To avoid collisions between networked clients, fio defaults to prefixing any
- generated filenames (with a directory specified) with the source of the
- client connecting. To disable this behavior, set this option to 0.
-
-.. option:: opendir=str
-
- Recursively open any files below directory `str`.
-
-.. option:: lockfile=str
-
- Fio defaults to not locking any files before it does I/O to them. If a file
- or file descriptor is shared, fio can serialize I/O to that file to make the
- end result consistent. This is usual for emulating real workloads that share
- files. The lock modes are:
-
- **none**
- No locking. The default.
- **exclusive**
- Only one thread or process may do I/O at a time, excluding all
- others.
- **readwrite**
- Read-write locking on the file. Many readers may
- access the file at the same time, but writes get exclusive access.
-
-.. option:: nrfiles=int
-
- Number of files to use for this job. Defaults to 1. The size of files
- will be :option:`size` divided by this unless explicit size is specified by
- :option:`filesize`. Files are created for each thread separately, and each
- file will have a file number within its name by default, as explained in
- :option:`filename` section.
-
-
-.. option:: openfiles=int
-
- Number of files to keep open at the same time. Defaults to the same as
- :option:`nrfiles`, can be set smaller to limit the number of simultaneous
- opens.
-
-.. option:: file_service_type=str
-
- Defines how fio decides which file from a job to service next. The following
- types are defined:
-
- **random**
- Choose a file at random.
-
- **roundrobin**
- Round robin over opened files. This is the default.
-
- **sequential**
- Finish one file before moving on to the next. Multiple files can
- still be open depending on :option:`openfiles`.
-
- **zipf**
- Use a *Zipf* distribution to decide what file to access.
-
- **pareto**
- Use a *Pareto* distribution to decide what file to access.
-
- **normal**
- Use a *Gaussian* (normal) distribution to decide what file to
- access.
-
- **gauss**
- Alias for normal.
-
- For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
- tell fio how many I/Os to issue before switching to a new file. For example,
- specifying ``file_service_type=random:8`` would cause fio to issue
- 8 I/Os before selecting a new file at random. For the non-uniform
- distributions, a floating point postfix can be given to influence how the
- distribution is skewed. See :option:`random_distribution` for a description
- of how that would work.
-
-.. option:: ioscheduler=str
-
- Attempt to switch the device hosting the file to the specified I/O scheduler
- before running.
-
-.. option:: create_serialize=bool
-
- If true, serialize the file creation for the jobs. This may be handy to
- avoid interleaving of data files, which may greatly depend on the filesystem
- used and even the number of processors in the system. Default: true.
-
-.. option:: create_fsync=bool
-
- :manpage:`fsync(2)` the data file after creation. This is the default.
-
-.. option:: create_on_open=bool
-
- If true, don't pre-create files but allow the job's open() to create a file
- when it's time to do I/O. Default: false -- pre-create all necessary files
- when the job starts.
-
-.. option:: create_only=bool
-
- If true, fio will only run the setup phase of the job. If files need to be
- laid out or updated on disk, only that will be done -- the actual job contents
- are not executed. Default: false.
-
-.. option:: allow_file_create=bool
-
- If true, fio is permitted to create files as part of its workload. If this
- option is false, then fio will error out if
- the files it needs to use don't already exist. Default: true.
-
-.. option:: allow_mounted_write=bool
-
- If this isn't set, fio will abort jobs that are destructive (e.g. that write)
- to what appears to be a mounted device or partition. This should help catch
- creating inadvertently destructive tests, not realizing that the test will
- destroy data on the mounted file system. Note that some platforms don't allow
- writing against a mounted device regardless of this option. Default: false.
-
-.. option:: pre_read=bool
-
- If this is given, files will be pre-read into memory before starting the
- given I/O operation. This will also clear the :option:`invalidate` flag,
- since it is pointless to pre-read and then drop the cache. This will only
- work for I/O engines that are seek-able, since they allow you to read the
- same data multiple times. Thus it will not work on non-seekable I/O engines
- (e.g. network, splice). Default: false.
-
-.. option:: unlink=bool
-
- Unlink the job files when done. Not the default, as repeated runs of that
- job would then waste time recreating the file set again and again. Default:
- false.
-
-.. option:: unlink_each_loop=bool
-
- Unlink job files after each iteration or loop. Default: false.
-
-.. option:: zonemode=str
-
- Accepted values are:
-
- **none**
- The :option:`zonerange`, :option:`zonesize`,
- :option:`zonecapacity` and :option:`zoneskip`
- parameters are ignored.
- **strided**
- I/O happens in a single zone until
- :option:`zonesize` bytes have been transferred.
- After that number of bytes has been
- transferred processing of the next zone
- starts. :option:`zonecapacity` is ignored.
- **zbd**
- Zoned block device mode. I/O happens
- sequentially in each zone, even if random I/O
- has been selected. Random I/O happens across
- all zones instead of being restricted to a
- single zone. The :option:`zoneskip` parameter
- is ignored. :option:`zonerange` and
- :option:`zonesize` must be identical.
-
-.. option:: zonerange=int
-
- Size of a single zone. See also :option:`zonesize` and
- :option:`zoneskip`.
-
-.. option:: zonesize=int
-
- For :option:`zonemode` =strided, this is the number of bytes to
- transfer before skipping :option:`zoneskip` bytes. If this parameter
- is smaller than :option:`zonerange` then only a fraction of each zone
- with :option:`zonerange` bytes will be accessed. If this parameter is
- larger than :option:`zonerange` then each zone will be accessed
- multiple times before skipping to the next zone.
-
- For :option:`zonemode` =zbd, this is the size of a single zone. The
- :option:`zonerange` parameter is ignored in this mode.
-
-
-.. option:: zonecapacity=int
-
- For :option:`zonemode` =zbd, this defines the capacity of a single zone,
- which is the accessible area starting from the zone start address.
- This parameter only applies when using :option:`zonemode` =zbd in
- combination with regular block devices. If not specified it defaults to
- the zone size. If the target device is a zoned block device, the zone
- capacity is obtained from the device information and this option is
- ignored.
-
-.. option:: zoneskip=int
-
- For :option:`zonemode` =strided, the number of bytes to skip after
- :option:`zonesize` bytes of data have been transferred. This parameter
- must be zero for :option:`zonemode` =zbd.
-
-.. option:: read_beyond_wp=bool
-
- This parameter applies to :option:`zonemode` =zbd only.
-
- Zoned block devices are block devices that consist of multiple zones.
- Each zone has a type, e.g. conventional or sequential. A conventional
- zone can be written at any offset that is a multiple of the block
- size. Sequential zones must be written sequentially. The position at
- which a write must occur is called the write pointer. A zoned block
- device can be either drive managed, host managed or host aware. For
- host managed devices the host must ensure that writes happen
- sequentially. Fio recognizes host managed devices and serializes
- writes to sequential zones for these devices.
-
- If a read occurs in a sequential zone beyond the write pointer then
- the zoned block device will complete the read without reading any data
- from the storage medium. Since such reads lead to unrealistically high
- bandwidth and IOPS numbers fio only reads beyond the write pointer if
- explicitly told to do so. Default: false.
-
-.. option:: max_open_zones=int
-
- When running a random write test across an entire drive many more
- zones will be open than in a typical application workload. This option
- therefore allows limiting the number of open zones. The
- number of open zones is defined as the number of zones to which write
- commands are issued.
-
-.. option:: zone_reset_threshold=float
-
- A number between zero and one that indicates the ratio of logical
- blocks with data to the total number of logical blocks in the test
- above which zones should be reset periodically.
-
-.. option:: zone_reset_frequency=float
-
- A number between zero and one that indicates how often a zone reset
- should be issued if the zone reset threshold has been exceeded. A zone
- reset is submitted after each (1 / zone_reset_frequency) write
- requests. This and the previous parameter can be used to simulate
- garbage collection activity.
-
-
-I/O type
-~~~~~~~~
-
-.. option:: direct=bool
-
- If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
- OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
- ioengines don't support direct I/O. Default: false.
-
-.. option:: atomic=bool
-
- If value is true, attempt to use atomic direct I/O. Atomic writes are
- guaranteed to be stable once acknowledged by the operating system. Only
- Linux supports O_ATOMIC right now.
-
-.. option:: buffered=bool
-
- If value is true, use buffered I/O. This is the opposite of the
- :option:`direct` option. Defaults to true.
-
-.. option:: readwrite=str, rw=str
-
- Type of I/O pattern. Accepted values are:
-
- **read**
- Sequential reads.
- **write**
- Sequential writes.
- **trim**
- Sequential trims (Linux block devices and SCSI
- character devices only).
- **randread**
- Random reads.
- **randwrite**
- Random writes.
- **randtrim**
- Random trims (Linux block devices and SCSI
- character devices only).
- **rw,readwrite**
- Sequential mixed reads and writes.
- **randrw**
- Random mixed reads and writes.
- **trimwrite**
- Sequential trim+write sequences. Blocks will be trimmed first,
- then the same blocks will be written to.
-
- Fio defaults to read if the option is not specified. For the mixed I/O
- types, the default is to split them 50/50. For certain types of I/O the
- result may still be skewed a bit, since the speed may be different.
-
- It is possible to specify the number of I/Os to do before getting a new
- offset by appending ``:<nr>`` to the end of the string given. For a
- random read, it would look like ``rw=randread:8`` for passing in an offset
- modifier with a value of 8. If the suffix is used with a sequential I/O
- pattern, then the *<nr>* value specified will be **added** to the generated
- offset for each I/O turning sequential I/O into sequential I/O with holes.
- For instance, using ``rw=write:4k`` will skip 4k for every write. Also see
- the :option:`rw_sequencer` option.
-
-.. option:: rw_sequencer=str
-
- If an offset modifier is given by appending a number to the ``rw=<str>``
- line, then this option controls how that number modifies the I/O offset
- being generated. Accepted values are:
-
- **sequential**
- Generate sequential offset.
- **identical**
- Generate the same offset.
-
- ``sequential`` is only useful for random I/O, where fio would normally
- generate a new random offset for every I/O. If you append e.g. 8 to randread,
- you would get a new random offset for every 8 I/Os. The result would be a
- seek for only every 8 I/Os, instead of for every I/O. Use ``rw=randread:8``
- to specify that. As sequential I/O is already sequential, setting
- ``sequential`` for that would not result in any differences. ``identical``
- behaves in a similar fashion, except it sends the same offset 8
- times before generating a new offset.
-
-.. option:: unified_rw_reporting=bool
-
- Fio normally reports statistics on a per data direction basis, meaning that
- reads, writes, and trims are accounted and reported separately. If this
- option is set fio sums the results and reports them as "mixed" instead.
-
-.. option:: randrepeat=bool
-
- Seed the random number generator used for random I/O patterns in a
- predictable way so the pattern is repeatable across runs. Default: true.
-
-.. option:: allrandrepeat=bool
-
- Seed all random number generators in a predictable way so results are
- repeatable across runs. Default: false.
-
-.. option:: randseed=int
-
- Seed the random number generators based on this seed value, to be able to
- control what sequence of output is being generated. If not set, the random
- sequence depends on the :option:`randrepeat` setting.
-
-.. option:: fallocate=str
-
- Whether pre-allocation is performed when laying down files.
- Accepted values are:
-
- **none**
- Do not pre-allocate space.
-
- **native**
- Use a platform's native pre-allocation call but fall back to
- **none** behavior if it fails/is not implemented.
-
- **posix**
- Pre-allocate via :manpage:`posix_fallocate(3)`.
-
- **keep**
- Pre-allocate via :manpage:`fallocate(2)` with
- FALLOC_FL_KEEP_SIZE set.
-
- **truncate**
- Extend file to final size via :manpage:`ftruncate(2)`
- instead of allocating.
-
- **0**
- Backward-compatible alias for **none**.
-
- **1**
- Backward-compatible alias for **posix**.
-
- May not be available on all supported platforms. **keep** is only available
- on Linux. If using ZFS on Solaris this cannot be set to **posix**
- because ZFS doesn't support pre-allocation. Default: **native** if any
- pre-allocation methods except **truncate** are available, **none** if not.
-
- Note that using **truncate** on Windows will interact surprisingly
- with non-sequential write patterns. When writing to a file that has
- been extended by setting the end-of-file information, Windows will
- backfill the unwritten portion of the file up to that offset with
- zeroes before issuing the new write. This means that a single small
- write to the end of an extended file will stall until the entire
- file has been filled with zeroes.
-
-.. option:: fadvise_hint=str
-
- Use :manpage:`posix_fadvise(2)` or :manpage:`posix_madvise(3)` to
- advise the kernel on what I/O patterns are likely to be issued.
- Accepted values are:
-
- **0**
- Backwards-compatible hint for "no hint".
-
- **1**
- Backwards-compatible hint for "advise with fio workload type". This
- uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL**
- for a sequential workload.
-
- **sequential**
- Advise using **FADV_SEQUENTIAL**.
-
- **random**
- Advise using **FADV_RANDOM**.
-
-.. option:: write_hint=str
-
- Use :manpage:`fcntl(2)` to advise the kernel what life time to expect
- from a write. Only supported on Linux, as of version 4.13. Accepted
- values are:
-
- **none**
- No particular life time associated with this file.
-
- **short**
- Data written to this file has a short life time.
-
- **medium**
- Data written to this file has a medium life time.
-
- **long**
- Data written to this file has a long life time.
-
- **extreme**
- Data written to this file has a very long life time.
-
- The values are all relative to each other, and no absolute meaning
- should be associated with them.
-
-.. option:: offset=int
-
- Start I/O at the provided offset in the file, given as either a fixed size in
- bytes or a percentage. If a percentage is given, the generated offset will be
- aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if
- provided. Data before the given offset will not be touched. This
- effectively caps the file size at `real_size - offset`. Can be combined with
- :option:`size` to constrain the start and end range of the I/O workload.
- A percentage can be specified by a number between 1 and 100 followed by '%',
- for example, ``offset=20%`` to specify 20%.
-
-.. option:: offset_align=int
-
- If set to non-zero value, the byte offset generated by a percentage ``offset``
- is aligned upwards to this value. Defaults to 0 meaning that a percentage
- offset is aligned to the minimum block size.
-
-.. option:: offset_increment=int
-
- If this is provided, then the real offset becomes `offset + offset_increment
- * thread_number`, where the thread number is a counter that starts at 0 and
- is incremented for each sub-job (i.e. when :option:`numjobs` option is
- specified). This option is useful if there are several jobs which are
- intended to operate on a file in parallel disjoint segments, with even
- spacing between the starting points. Percentages can be used for this option.
- If a percentage is given, the generated offset will be aligned to the minimum
- ``blocksize`` or to the value of ``offset_align`` if provided.
-
-.. option:: number_ios=int
-
- Fio will normally perform I/Os until it has exhausted the size of the region
- set by :option:`size`, or if it exhausts the allocated time (or hits an error
- condition). With this setting, the range/size can be set independently of
- the number of I/Os to perform. When fio reaches this number, it will exit
- normally and report status. Note that this does not extend the amount of I/O
- that will be done, it will only stop fio if this condition is met before
- other end-of-job criteria.
-
-.. option:: fsync=int
-
- If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of
- the dirty data for every number of blocks given. For example, if you give 32
- as a parameter, fio will sync the file after every 32 writes issued. If fio is
- using non-buffered I/O, we may not sync the file. The exception is the sg
- I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
- means fio does not periodically issue and wait for a sync to complete. Also
- see :option:`end_fsync` and :option:`fsync_on_close`.
-
-.. option:: fdatasync=int
-
- Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
- not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
- :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
- Defaults to 0, which means fio does not periodically issue and wait for a
- data-only sync to complete.
-
-.. option:: write_barrier=int
-
- Make every `N-th` write a barrier write.
-
-.. option:: sync_file_range=str:int
-
- Use :manpage:`sync_file_range(2)` for every `int` number of write
- operations. Fio will track range of writes that have happened since the last
- :manpage:`sync_file_range(2)` call. `str` can currently be one or more of:
-
- **wait_before**
- SYNC_FILE_RANGE_WAIT_BEFORE
- **write**
- SYNC_FILE_RANGE_WRITE
- **wait_after**
- SYNC_FILE_RANGE_WAIT_AFTER
-
- So if you do ``sync_file_range=wait_before,write:8``, fio would use
- ``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8
- writes. Also see the :manpage:`sync_file_range(2)` man page. This option is
- Linux specific.
-
-.. option:: overwrite=bool
-
- If true, writes to a file will always overwrite existing data. If the file
- doesn't already exist, it will be created before the write phase begins. If
- the file exists and is large enough for the specified write phase, nothing
- will be done. Default: false.
-
-.. option:: end_fsync=bool
-
- If true, :manpage:`fsync(2)` file contents when a write stage has completed.
- Default: false.
-
-.. option:: fsync_on_close=bool
-
- If true, fio will :manpage:`fsync(2)` a dirty file on close. This differs
- from :option:`end_fsync` in that it will happen on every file close, not
- just at the end of the job. Default: false.
-
-.. option:: rwmixread=int
-
- Percentage of a mixed workload that should be reads. Default: 50.
-
-.. option:: rwmixwrite=int
-
- Percentage of a mixed workload that should be writes. If both
- :option:`rwmixread` and :option:`rwmixwrite` are given and the values do not
- add up to 100%, the latter of the two will be used to override the
- first. This may interfere with a given rate setting, if fio is asked to
- limit reads or writes to a certain rate. If that is the case, then the
- distribution may be skewed. Default: 50.
-
-.. option:: random_distribution=str:float[,str:float][,str:float]
-
- By default, fio will use a completely uniform random distribution when asked
- to perform random I/O. Sometimes it is useful to skew the distribution in
- specific ways, ensuring that some parts of the data are hotter than others.
- fio includes the following distribution models:
-
- **random**
- Uniform random distribution
-
- **zipf**
- Zipf distribution
-
- **pareto**
- Pareto distribution
-
- **normal**
- Normal (Gaussian) distribution
-
- **zoned**
- Zoned random distribution
-
- **zoned_abs**
- Zone absolute random distribution
-
- When using a **zipf** or **pareto** distribution, an input value is also
- needed to define the access pattern. For **zipf**, this is the `Zipf
- theta`. For **pareto**, it's the `Pareto power`. Fio includes a test
- program, :command:`fio-genzipf`, that can be used to visualize what the given input
- values will yield in terms of hit rates. If you wanted to use **zipf** with
- a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the
- option. If a non-uniform model is used, fio will disable use of the random
- map. For the **normal** distribution, a normal (Gaussian) deviation is
- supplied as a value between 0 and 100.
-
- For a **zoned** distribution, fio supports specifying percentages of I/O
- access that should fall within what range of the file or device. For
- example, given a criteria of:
-
- * 60% of accesses should be to the first 10%
- * 30% of accesses should be to the next 20%
- * 8% of accesses should be to the next 30%
- * 2% of accesses should be to the next 40%
-
- we can define that through zoning of the random accesses. For the above
- example, the user would do::
-
- random_distribution=zoned:60/10:30/20:8/30:2/40
-
- A **zoned_abs** distribution works exactly like the **zoned**, except
- that it takes absolute sizes. For example, let's say you wanted to
- define access according to the following criteria:
-
- * 60% of accesses should be to the first 20G
- * 30% of accesses should be to the next 100G
- * 10% of accesses should be to the next 500G
-
- we can define an absolute zoning distribution with::
-
- random_distribution=zoned_abs:60/20G:30/100G:10/500g
-
- For both **zoned** and **zoned_abs**, fio supports defining up to
- 256 separate zones.
-
- This is similar to how :option:`bssplit` works for setting ranges and
- percentages of block sizes. Like :option:`bssplit`, it's possible to
- specify separate zones for reads, writes, and trims. If just one set
- is given, it'll apply to all of them. This goes for both **zoned**
- and **zoned_abs** distributions.
-
-.. option:: percentage_random=int[,int][,int]
-
- For a random workload, set how big a percentage should be random. This
- defaults to 100%, in which case the workload is fully random. It can be set
- anywhere from 0 to 100. Setting it to 0 would make the workload fully
- sequential. Any setting in between will result in a random mix of sequential
- and random I/O, at the given percentages. Comma-separated values may be
- specified for reads, writes, and trims as described in :option:`blocksize`.
-
-.. option:: norandommap
-
- Normally fio will cover every block of the file when doing random I/O. If
- this option is given, fio will just get a new random offset without looking
- at past I/O history. This means that some blocks may not be read or written,
- and that some blocks may be read/written more than once. If this option is
- used with :option:`verify` and multiple blocksizes (via :option:`bsrange`),
- only intact blocks are verified, i.e., partially-overwritten blocks are
- ignored. With an async I/O engine and an I/O depth > 1, it is possible for
- the same block to be overwritten, which can cause verification errors. Either
- do not use norandommap in this case, or also use the lfsr random generator.
-
-.. option:: softrandommap=bool
-
- See :option:`norandommap`. If fio runs with the random block map enabled and
- it fails to allocate the map, if this option is set it will continue without
- a random block map. As coverage will not be as complete as with random maps,
- this option is disabled by default.
-
-.. option:: random_generator=str
-
- Fio supports the following engines for generating I/O offsets for random I/O:
-
- **tausworthe**
- Strong 2^88 cycle random number generator.
- **lfsr**
- Linear feedback shift register generator.
- **tausworthe64**
- Strong 64-bit 2^258 cycle random number generator.
-
- **tausworthe** is a strong random number generator, but it requires tracking
- on the side if we want to ensure that blocks are only read or written
- once. **lfsr** guarantees that we never generate the same offset twice, and
- it's also less computationally expensive. It's not a true random generator,
- however, though for I/O purposes it's typically good enough. **lfsr** only
- works with single block sizes, not with workloads that use multiple block
- sizes. If used with such a workload, fio may read or write some blocks
- multiple times. The default value is **tausworthe**, unless the required
- space exceeds 2^32 blocks. If it does, then **tausworthe64** is
- selected automatically.
-
-
-Block size
-~~~~~~~~~~
-
-.. option:: blocksize=int[,int][,int], bs=int[,int][,int]
-
- The block size in bytes used for I/O units. Default: 4096. A single value
- applies to reads, writes, and trims. Comma-separated values may be
- specified for reads, writes, and trims. A value not terminated in a comma
- applies to subsequent types.
-
- Examples:
-
- **bs=256k**
- means 256k for reads, writes and trims.
-
- **bs=8k,32k**
- means 8k for reads, 32k for writes and trims.
-
- **bs=8k,32k,**
- means 8k for reads, 32k for writes, and default for trims.
-
- **bs=,8k**
- means default for reads, 8k for writes and trims.
-
- **bs=,8k,**
- means default for reads, 8k for writes, and default for trims.
-
-.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]
-
- A range of block sizes in bytes for I/O units. The issued I/O unit will
- always be a multiple of the minimum size, unless
- :option:`blocksize_unaligned` is set.
-
- Comma-separated ranges may be specified for reads, writes, and trims as
- described in :option:`blocksize`.
-
- Example: ``bsrange=1k-4k,2k-8k``.
-
-.. option:: bssplit=str[,str][,str]
-
- Sometimes you want even finer grained control of the block sizes
- issued, not just an even split between them. This option allows you to
- weight various block sizes, so that you are able to define a specific
- amount of block sizes issued. The format for this option is::
-
- bssplit=blocksize/percentage:blocksize/percentage
-
- for as many block sizes as needed. So if you want to define a workload
- that has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would
- write::
-
- bssplit=4k/10:64k/50:32k/40
-
- Ordering does not matter. If the percentage is left blank, fio will
- fill in the remaining values evenly. So a bssplit option like this one::
-
- bssplit=4k/50:1k/:32k/
-
- would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always
- add up to 100; if bssplit is given a range that adds up to more, it
- will error out.
-
- Comma-separated values may be specified for reads, writes, and trims as
- described in :option:`blocksize`.
-
- If you want a workload that has 50% 2k reads and 50% 4k reads, while
- having 90% 4k writes and 10% 8k writes, you would specify::
-
- bssplit=2k/50:4k/50,4k/90:8k/10
-
- Fio supports defining up to 64 different weights for each data
- direction.
-
-.. option:: blocksize_unaligned, bs_unaligned
-
- If set, fio will issue I/O units with any size within
- :option:`blocksize_range`, not just multiples of the minimum size. This
- typically won't work with direct I/O, as that normally requires sector
- alignment.
-
-.. option:: bs_is_seq_rand=bool
-
- If this option is set, fio will use the normal read,write blocksize settings
- as sequential,random blocksize settings instead. Any random read or write
- will use the WRITE blocksize settings, and any sequential read or write will
- use the READ blocksize settings.
-
-.. option:: blockalign=int[,int][,int], ba=int[,int][,int]
-
- Boundary to which fio will align random I/O units. Default:
- :option:`blocksize`. Minimum alignment is typically 512b for using direct
- I/O, though it usually depends on the hardware block size. This option is
- mutually exclusive with using a random map for files, so it will turn off
- that option. Comma-separated values may be specified for reads, writes, and
- trims as described in :option:`blocksize`.
-
-
-Buffers and memory
-~~~~~~~~~~~~~~~~~~
-
-.. option:: zero_buffers
-
- Initialize buffers with all zeros. Default: fill buffers with random data.
-
-.. option:: refill_buffers
-
- If this option is given, fio will refill the I/O buffers on every
- submit. Only makes sense if :option:`zero_buffers` isn't specified,
- naturally. Defaults to being unset i.e., the buffer is only filled at
- init time and the data in it is reused when possible but if any of
- :option:`verify`, :option:`buffer_compress_percentage` or
- :option:`dedupe_percentage` are enabled then `refill_buffers` is also
- automatically enabled.
-
-.. option:: scramble_buffers=bool
-
- If :option:`refill_buffers` is too costly and the target is using data
- deduplication, then setting this option will slightly modify the I/O buffer
- contents to defeat normal de-dupe attempts. This is not enough to defeat
- more clever block compression attempts, but it will stop naive dedupe of
- blocks. Default: true.
-
-.. option:: buffer_compress_percentage=int
-
- If this is set, then fio will attempt to provide I/O buffer content
- (on WRITEs) that compresses to the specified level. Fio does this by
- providing a mix of random data followed by fixed pattern data. The
- fixed pattern is either zeros, or the pattern specified by
- :option:`buffer_pattern`. If the `buffer_pattern` option is used, it
- might skew the compression ratio slightly. Setting
- `buffer_compress_percentage` to a value other than 100 will also
- enable :option:`refill_buffers` in order to reduce the likelihood that
- adjacent blocks are so similar that they over compress when seen
- together. See :option:`buffer_compress_chunk` for how to set a finer or
- coarser granularity for the random/fixed data region. Defaults to unset
- i.e., buffer data will not adhere to any compression level.
-
-.. option:: buffer_compress_chunk=int
-
- This setting allows fio to manage how big the random/fixed data region
- is when using :option:`buffer_compress_percentage`. When
- `buffer_compress_chunk` is set to some non-zero value smaller than the
- block size, fio can repeat the random/fixed region throughout the I/O
- buffer at the specified interval (which is particularly useful when
- bigger block sizes are used for a job). When set to 0, fio will use a
- chunk size that matches the block size resulting in a single
- random/fixed region within the I/O buffer. Defaults to 512. When the
- unit is omitted, the value is interpreted in bytes.
-
-.. option:: buffer_pattern=str
-
- If set, fio will fill the I/O buffers with this pattern or with the contents
- of a file. If not set, the contents of I/O buffers are defined by the other
- options related to buffer contents. The setting can be any pattern of bytes,
- and can be prefixed with 0x for hex values. It may also be a string, where
- the string must then be wrapped with ``""``. Or it may also be a filename,
- where the filename must be wrapped with ``''`` in which case the file is
- opened and read. Note that not all the file contents will be read if that
- would cause the buffers to overflow. So, for example::
-
- buffer_pattern='filename'
-
- or::
-
- buffer_pattern="abcd"
-
- or::
-
- buffer_pattern=-12
-
- or::
-
- buffer_pattern=0xdeadface
-
- Also you can combine everything together in any order::
-
- buffer_pattern=0xdeadface"abcd"-12'filename'
-
-.. option:: dedupe_percentage=int
-
- If set, fio will generate this percentage of identical buffers when
- writing. These buffers will be naturally dedupable. The contents of the
- buffers depend on what other buffer compression settings have been set. It's
- possible to have the individual buffers either fully compressible, or not at
- all -- this option only controls the distribution of unique buffers. Setting
- this option will also enable :option:`refill_buffers` to prevent every buffer
- being identical.
-
-.. option:: invalidate=bool
-
- Invalidate the buffer/page cache parts of the files to be used prior to
- starting I/O if the platform and file type support it. Defaults to true.
- This will be ignored if :option:`pre_read` is also specified for the
- same job.
-
-.. option:: sync=bool
-
- Use synchronous I/O for buffered writes. For the majority of I/O engines,
- this means using O_SYNC. Default: false.
-
-.. option:: iomem=str, mem=str
-
- Fio can use various types of memory as the I/O unit buffer. The allowed
- values are:
-
- **malloc**
- Use memory from :manpage:`malloc(3)` as the buffers. Default memory
- type.
-
- **shm**
- Use shared memory as the buffers. Allocated through
- :manpage:`shmget(2)`.
-
- **shmhuge**
- Same as shm, but use huge pages as backing.
-
- **mmap**
- Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can
- be file backed if a filename is given after the option. The format
- is `mem=mmap:/path/to/file`.
-
- **mmaphuge**
- Use a memory mapped huge file as the buffer backing. Append filename
- after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`.
-
- **mmapshared**
- Same as mmap, but use a MAP_SHARED mapping.
-
- **cudamalloc**
- Use GPU memory as the buffers for GPUDirect RDMA benchmark.
- The :option:`ioengine` must be `rdma`.
-
- The area allocated is a function of the maximum allowed bs size for the job,
- multiplied by the I/O depth given. Note that for **shmhuge** and
- **mmaphuge** to work, the system must have free huge pages allocated. This
- can normally be checked and set by reading/writing
- :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page
- is 4MiB in size. So to calculate the number of huge pages you need for a
- given job file, add up the I/O depth of all jobs (normally one unless
- :option:`iodepth` is used) and multiply by the maximum bs set. Then divide
- that number by the huge page size. You can see the size of the huge pages in
- :file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero
- number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also
- see :option:`hugepage-size`.
-
- **mmaphuge** also needs to have hugetlbfs mounted and the file location
- should point there. So if it's mounted in :file:`/huge`, you would use
- `mem=mmaphuge:/huge/somefile`.
-
-.. option:: iomem_align=int, mem_align=int
-
- This indicates the memory alignment of the I/O memory buffers. Note that
- the given alignment is applied to the first I/O unit buffer, if using
- :option:`iodepth` the alignment of the following buffers is given by the
- :option:`bs` used. In other words, if using a :option:`bs` that is a
- multiple of the page size of the system, all buffers will be aligned to
- this value. If using a :option:`bs` that is not page aligned, the alignment
- of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and
- :option:`bs` used.
-
-.. option:: hugepage-size=int
-
- Defines the size of a huge page. Must at least be equal to the system
- setting, see :file:`/proc/meminfo`. Defaults to 4MiB. Should probably
- always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the
- preferred way to set this to avoid setting a non-pow-2 bad value.
-
-.. option:: lockmem=int
-
- Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to
- simulate a smaller amount of memory. The amount specified is per worker.
-
-
-I/O size
-~~~~~~~~
-
-.. option:: size=int
-
- The total size of file I/O for each thread of this job. Fio will run until
- this many bytes have been transferred, unless runtime is limited by other options
- (such as :option:`runtime`, for instance, or increased/decreased by :option:`io_size`).
- Fio will divide this size between the available files determined by options
- such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is
- specified by the job. If the result of division happens to be 0, the size is
- set to the physical size of the given files or devices if they exist.
- If this option is not specified, fio will use the full size of the given
- files or devices. If the files do not exist, size must be given. It is also
- possible to give size as a percentage between 1 and 100. If ``size=20%`` is
- given, fio will use 20% of the full size of the given files or devices.
- Can be combined with :option:`offset` to constrain the start and end range
- that I/O will be done within.
-
-.. option:: io_size=int, io_limit=int
-
- Normally fio operates within the region set by :option:`size`, which means
- that the :option:`size` option sets both the region and size of I/O to be
- performed. Sometimes that is not what you want. With this option, it is
- possible to define just the amount of I/O that fio should do. For instance,
- if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio
- will perform I/O within the first 20GiB but exit when 5GiB have been
- done. The opposite is also possible -- if :option:`size` is set to 20GiB,
- and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
- the 0..20GiB region.
-
-.. option:: filesize=irange(int)
-
- Individual file sizes. May be a range, in which case fio will select sizes
- for files at random within the given range and limited to :option:`size` in
- total (if that is given). If not given, each created file is the same size.
- This option overrides :option:`size` in terms of file size, which means
- this value is used as a fixed size or possible range of each file.
-
-.. option:: file_append=bool
-
- Perform I/O after the end of the file. Normally fio will operate within the
- size of a file. If this option is set, then fio will append to the file
- instead. This has identical behavior to setting :option:`offset` to the size
- of a file. This option is ignored on non-regular files.
-
-.. option:: fill_device=bool, fill_fs=bool
-
- Sets size to something really large and waits for ENOSPC (no space left on
- device) as the terminating condition. Only makes sense with sequential
- write. For a read workload, the mount point will be filled first then I/O
- started on the result. This option doesn't make sense if operating on a raw
- device node, since the size of that is already known by the file system.
- Additionally, writing beyond end-of-device will not return ENOSPC there.
-
-
-I/O engine
-~~~~~~~~~~
-
-.. option:: ioengine=str
-
- Defines how the job issues I/O to the file. The following types are defined:
-
- **sync**
- Basic :manpage:`read(2)` or :manpage:`write(2)`
- I/O. :manpage:`lseek(2)` is used to position the I/O location.
- See :option:`fsync` and :option:`fdatasync` for syncing write I/Os.
-
- **psync**
- Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O. Default on
- all supported operating systems except for Windows.
-
- **vsync**
- Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O. Will emulate
- queuing by coalescing adjacent I/Os into a single submission.
-
- **pvsync**
- Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O.
-
- **pvsync2**
- Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O.
-
- **io_uring**
- Fast Linux native asynchronous I/O. Supports async IO
- for both direct and buffered IO.
- This engine defines engine specific options.
-
- **libaio**
- Linux native asynchronous I/O. Note that Linux may only support
- queued behavior with non-buffered I/O (set ``direct=1`` or
- ``buffered=0``).
- This engine defines engine specific options.
-
- **posixaio**
- POSIX asynchronous I/O using :manpage:`aio_read(3)` and
- :manpage:`aio_write(3)`.
-
- **solarisaio**
- Solaris native asynchronous I/O.
-
- **windowsaio**
- Windows native asynchronous I/O. Default on Windows.
-
- **mmap**
- File is memory mapped with :manpage:`mmap(2)` and data copied
- to/from using :manpage:`memcpy(3)`.
-
- **splice**
- :manpage:`splice(2)` is used to transfer the data and
- :manpage:`vmsplice(2)` to transfer data from user space to the
- kernel.
-
- **sg**
- SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
- ioctl, or if the target is an sg character device we use
- :manpage:`read(2)` and :manpage:`write(2)` for asynchronous
- I/O. Requires :option:`filename` option to specify either block or
- character devices. This engine supports trim operations.
- The sg engine includes engine specific options.
-
- **null**
- Doesn't transfer any data, just pretends to. This is mainly used to
- exercise fio itself and for debugging/testing purposes.
-
- **net**
- Transfer over the network to given ``host:port``. Depending on the
- :option:`protocol` used, the :option:`hostname`, :option:`port`,
- :option:`listen` and :option:`filename` options are used to specify
- what sort of connection to make, while the :option:`protocol` option
- determines which protocol will be used. This engine defines engine
- specific options.
-
- **netsplice**
- Like **net**, but uses :manpage:`splice(2)` and
- :manpage:`vmsplice(2)` to map data and send/receive.
- This engine defines engine specific options.
-
- **cpuio**
- Doesn't transfer any data, but burns CPU cycles according to the
- :option:`cpuload` and :option:`cpuchunks` options. Setting
- :option:`cpuload`\=85 will cause that job to do nothing but burn 85%
- of the CPU. In case of SMP machines, use :option:`numjobs`\=<nr_of_cpu>
- to get desired CPU usage, as the cpuload only loads a
- single CPU at the desired rate. A job never finishes unless there is
- at least one non-cpuio job.
-
- **rdma**
- The RDMA I/O engine supports both RDMA memory semantics
- (RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
- InfiniBand, RoCE and iWARP protocols. This engine defines engine
- specific options.
-
- **falloc**
- I/O engine that does regular fallocate to simulate data transfer as
- fio ioengine.
-
- DDIR_READ
- does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
-
- DDIR_WRITE
- does fallocate(,mode = 0).
-
- DDIR_TRIM
- does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
-
- **ftruncate**
- I/O engine that sends :manpage:`ftruncate(2)` operations in response
- to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
- size to the current block offset. :option:`blocksize` is ignored.
-
- **e4defrag**
- I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
- defragment activity in response to DDIR_WRITE events.
-
- **rados**
- I/O engine supporting direct access to Ceph Reliable Autonomic
- Distributed Object Store (RADOS) via librados. This ioengine
- defines engine specific options.
-
- **rbd**
- I/O engine supporting direct access to Ceph Rados Block Devices
- (RBD) via librbd without the need to use the kernel rbd driver. This
- ioengine defines engine specific options.
-
- **http**
- I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to
- a WebDAV or S3 endpoint. This ioengine defines engine specific options.
-
- This engine only supports direct IO of iodepth=1; you need to scale this
- via numjobs. blocksize defines the size of the objects to be created.
-
- TRIM is translated to object deletion.
-
- **gfapi**
- Using GlusterFS libgfapi sync interface for direct access to
- GlusterFS volumes without having to go through FUSE. This ioengine
- defines engine specific options.
-
- **gfapi_async**
- Using GlusterFS libgfapi async interface for direct access to
- GlusterFS volumes without having to go through FUSE. This ioengine
- defines engine specific options.
-
- **libhdfs**
- Read and write through Hadoop (HDFS). The :option:`filename` option
- is used to specify host,port of the hdfs name-node to connect. This
- engine interprets offsets a little differently. In HDFS, files once
- created cannot be modified so random writes are not possible. To
- imitate this the libhdfs engine expects a bunch of small files to be
- created over HDFS and will randomly pick a file from them
- based on the offset generated by fio backend (see the example
- job file to create such files, use ``rw=write`` option). Please
- note, it may be necessary to set environment variables to work
- with HDFS/libhdfs properly. Each job uses its own connection to
- HDFS.
-
- **mtd**
- Read, write and erase an MTD character device (e.g.,
- :file:`/dev/mtd0`). Discards are treated as erases. Depending on the
- underlying device type, the I/O may have to go in a certain pattern,
- e.g., on NAND, writing sequentially to erase blocks and discarding
- before overwriting. The `trimwrite` mode works well for this
- constraint.
-
- **pmemblk**
- Read and write using filesystem DAX to a file on a filesystem
- mounted with DAX on a persistent memory device through the PMDK
- libpmemblk library.
-
- **dev-dax**
- Read and write using device DAX to a persistent memory device (e.g.,
- /dev/dax0.0) through the PMDK libpmem library.
-
- **external**
- Prefix to specify loading an external I/O engine object file. Append
- the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
- ioengine :file:`foo.o` in :file:`/tmp`. The path can be either
- absolute or relative. See :file:`engines/skeleton_external.c` for
- details of writing an external I/O engine.
-
- **filecreate**
- Simply create the files and do no I/O to them. You still need to
- set `filesize` so that all the accounting still occurs, but no
- actual I/O will be done other than creating the file.
-
- **filestat**
- Simply do stat() and do no I/O to the file. You need to set 'filesize'
- and 'nrfiles', so that files will be created.
- This engine is to measure file lookup and meta data access.
-
- **libpmem**
- Read and write using mmap I/O to a file on a filesystem
- mounted with DAX on a persistent memory device through the PMDK
- libpmem library.
-
- **ime_psync**
- Synchronous read and write using DDN's Infinite Memory Engine (IME).
- This engine is very basic and issues calls to IME whenever an IO is
- queued.
-
- **ime_psyncv**
- Synchronous read and write using DDN's Infinite Memory Engine (IME).
- This engine uses iovecs and will try to stack as many IOs as possible
- (if the IOs are "contiguous" and the IO depth is not exceeded)
- before issuing a call to IME.
-
- **ime_aio**
- Asynchronous read and write using DDN's Infinite Memory Engine (IME).
- This engine will try to stack as many IOs as possible by creating
- requests for IME. FIO will then decide when to commit these requests.
- **libiscsi**
- Read and write iscsi lun with libiscsi.
- **nbd**
- Read and write a Network Block Device (NBD).
-
-I/O engine specific parameters
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In addition, there are some parameters which are only valid when a specific
-:option:`ioengine` is in use. These are used identically to normal parameters,
-with the caveat that when used on the command line, they must come after the
-:option:`ioengine` that defines them is selected.
-
-.. option:: cmdprio_percentage=int : [io_uring] [libaio]
-
- Set the percentage of I/O that will be issued with higher priority by setting
- the priority bit. Non-read I/O is likely unaffected by ``cmdprio_percentage``.
- This option cannot be used with the `prio` or `prioclass` options. For this
- option to set the priority bit properly, NCQ priority must be supported and
- enabled and :option:`direct`\=1 option must be used. fio must also be run as
- the root user.
-
-.. option:: fixedbufs : [io_uring]
-
- If fio is asked to do direct IO, then Linux will map pages for each
- IO call, and release them when IO is done. If this option is set, the
- pages are pre-mapped before IO is started. This eliminates the need to
- map and release for each IO. This is more efficient, and reduces the
- IO latency as well.
-
-.. option:: hipri : [io_uring]
-
- If this option is set, fio will attempt to use polled IO completions.
- Normal IO completions generate interrupts to signal the completion of
- IO, polled completions do not. Hence they require active reaping
- by the application. The benefits are more efficient IO for high IOPS
- scenarios, and lower latencies for low queue depth IO.
-
-.. option:: registerfiles : [io_uring]
-
- With this option, fio registers the set of files being used with the
- kernel. This avoids the overhead of managing file counts in the kernel,
- making the submission and completion part more lightweight. Required
- for the below :option:`sqthread_poll` option.
-
-.. option:: sqthread_poll : [io_uring]
-
- Normally fio will submit IO by issuing a system call to notify the
- kernel of available items in the SQ ring. If this option is set, the
- act of submitting IO will be done by a polling thread in the kernel.
- This frees up cycles for fio, at the cost of using more CPU in the
- system.
-
-.. option:: sqthread_poll_cpu : [io_uring]
-
- When :option:`sqthread_poll` is set, this option provides a way to
- define which CPU should be used for the polling thread.
-
-.. option:: userspace_reap : [libaio]
-
- Normally, with the libaio engine in use, fio will use the
- :manpage:`io_getevents(2)` system call to reap newly returned events. With
- this flag turned on, the AIO ring will be read directly from user-space to
- reap events. The reaping mode is only enabled when polling for a minimum of
- 0 events (e.g. when :option:`iodepth_batch_complete` `=0`).
-
-.. option:: hipri : [pvsync2]
-
- Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
- than normal.
-
-.. option:: hipri_percentage : [pvsync2]
-
- When hipri is set this determines the probability of a pvsync2 I/O being high
- priority. The default is 100%.
-
-.. option:: nowait : [pvsync2] [libaio] [io_uring]
-
- By default if a request cannot be executed immediately (e.g. resource starvation,
- waiting on locks) it is queued and the initiating process will be blocked until
- the required resource becomes free.
-
- This option sets the RWF_NOWAIT flag (supported from the 4.14 Linux kernel) and
- the call will return instantly with EAGAIN or a partial result rather than waiting.
-
- It is useful to also use ignore_error=EAGAIN when using this option.
-
- Note: glibc 2.27, 2.28 have a bug in syscall wrappers preadv2, pwritev2.
- They return EOPNOTSUPP instead of EAGAIN.
-
- For cached I/O, using this option usually means a request operates only with
- cached data. Currently the RWF_NOWAIT flag is not supported for cached writes.
-
- For direct I/O, requests will only succeed if cache invalidation isn't required,
- file blocks are fully allocated and the disk request could be issued immediately.
-
-.. option:: cpuload=int : [cpuio]
-
- Attempt to use the specified percentage of CPU cycles. This is a mandatory
- option when using cpuio I/O engine.
-
-.. option:: cpuchunks=int : [cpuio]
-
- Split the load into cycles of the given time. In microseconds.
-
-.. option:: exit_on_io_done=bool : [cpuio]
-
- Detect when I/O threads are done, then exit.
-
-.. option:: namenode=str : [libhdfs]
-
- The hostname or IP address of a HDFS cluster namenode to contact.
-
-.. option:: port=int
-
- [libhdfs]
-
- The listening port of the HDFS cluster namenode.
-
- [netsplice], [net]
-
- The TCP or UDP port to bind to or connect to. If this is used with
- :option:`numjobs` to spawn multiple instances of the same job type, then
- this will be the starting port number since fio will use a range of
- ports.
-
- [rdma]
-
- The port to use for RDMA-CM communication. This should be the same value
- on the client and the server side.
-
-.. option:: hostname=str : [netsplice] [net] [rdma]
-
- The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. If the job
- is a TCP listener or UDP reader, the hostname is not used and must be omitted
- unless it is a valid UDP multicast address.
-
-.. option:: interface=str : [netsplice] [net]
-
- The IP address of the network interface used to send or receive UDP
- multicast.
-
-.. option:: ttl=int : [netsplice] [net]
-
- Time-to-live value for outgoing UDP multicast packets. Default: 1.
-
-.. option:: nodelay=bool : [netsplice] [net]
-
- Set TCP_NODELAY on TCP connections.
-
-.. option:: protocol=str, proto=str : [netsplice] [net]
-
- The network protocol to use. Accepted values are:
-
- **tcp**
- Transmission control protocol.
- **tcpv6**
- Transmission control protocol V6.
- **udp**
- User datagram protocol.
- **udpv6**
- User datagram protocol V6.
- **unix**
- UNIX domain socket.
-
- When the protocol is TCP or UDP, the port must also be given, as well as the
- hostname unless the job is a TCP listener or UDP reader. For unix sockets, the
- normal :option:`filename` option should be used and the port is invalid.
-
-.. option:: listen : [netsplice] [net]
-
- For TCP network connections, tell fio to listen for incoming connections
- rather than initiating an outgoing connection. The :option:`hostname` must
- be omitted if this option is used.
-
-.. option:: pingpong : [netsplice] [net]
-
- Normally a network writer will just continue writing data, and a network
- reader will just consume packets. If ``pingpong=1`` is set, a writer will
- send its normal payload to the reader, then wait for the reader to send the
- same payload back. This allows fio to measure network latencies. The
- submission and completion latencies then measure local time spent sending or
- receiving, and the completion latency measures how long it took for the
- other end to receive and send back. For UDP multicast traffic
- ``pingpong=1`` should only be set for a single reader when multiple readers
- are listening to the same address.
-
-.. option:: window_size : [netsplice] [net]
-
- Set the desired socket buffer size for the connection.
-
-.. option:: mss : [netsplice] [net]
-
- Set the TCP maximum segment size (TCP_MAXSEG).
-
-.. option:: donorname=str : [e4defrag]
-
- File will be used as a block donor (swap extents between files).
-
-.. option:: inplace=int : [e4defrag]
-
- Configure donor file blocks allocation strategy:
-
- **0**
- Default. Preallocate donor's file on init.
- **1**
- Allocate space immediately inside defragment event, and free right
- after event.
-
-.. option:: clustername=str : [rbd,rados]
-
- Specifies the name of the Ceph cluster.
-
-.. option:: rbdname=str : [rbd]
-
- Specifies the name of the RBD.
-
-.. option:: pool=str : [rbd,rados]
-
- Specifies the name of the Ceph pool containing RBD or RADOS data.
-
-.. option:: clientname=str : [rbd,rados]
-
- Specifies the username (without the 'client.' prefix) used to access the
- Ceph cluster. If the *clustername* is specified, the *clientname* shall be
- the full *type.id* string. If no type. prefix is given, fio will add
- 'client.' by default.
-
-.. option:: busy_poll=bool : [rbd,rados]
-
- Poll store instead of waiting for completion. Usually this provides better
- throughput at the cost of higher (up to 100%) CPU utilization.
-
-.. option:: skip_bad=bool : [mtd]
-
- Skip operations against known bad blocks.
-
-.. option:: hdfsdirectory : [libhdfs]
-
- libhdfs will create chunks in this HDFS directory.
-
-.. option:: chunk_size : [libhdfs]
-
- The size of the chunk to use for each file.
-
-.. option:: verb=str : [rdma]
-
- The RDMA verb to use on this side of the RDMA ioengine connection. Valid
- values are write, read, send and recv. These correspond to the equivalent
- RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be
- specified on the client side of the connection. See the examples folder.
-
-.. option:: bindname=str : [rdma]
-
- The name to use to bind the local RDMA-CM connection to a local RDMA device.
- This could be a hostname or an IPv4 or IPv6 address. On the server side this
- will be passed into the rdma_bind_addr() function and on the client side it
- will be used in the rdma_resolve_addr() function. This can be useful when
- multiple paths exist between the client and the server or in certain loopback
- configurations.
-
-.. option:: stat_type=str : [filestat]
-
- Specify stat system call type to measure lookup/getattr performance.
- Default is **stat** for :manpage:`stat(2)`.
-
-.. option:: readfua=bool : [sg]
-
- With readfua option set to 1, read operations include
- the force unit access (fua) flag. Default is 0.
-
-.. option:: writefua=bool : [sg]
-
- With writefua option set to 1, write operations include
- the force unit access (fua) flag. Default is 0.
-
-.. option:: sg_write_mode=str : [sg]
-
- Specify the type of write commands to issue. This option can take three values:
-
- **write**
- This is the default where write opcodes are issued as usual.
- **verify**
- Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
- directs the device to carry out a medium verification with no data
- comparison. The writefua option is ignored with this selection.
- **same**
- Issue WRITE SAME commands. This transfers a single block to the device
- and writes this same block of data to a contiguous sequence of LBAs
- beginning at the specified offset. fio's block size parameter specifies
- the amount of data written with each command. However, the amount of data
- actually transferred to the device is equal to the device's block
- (sector) size. For a device with 512 byte sectors, blocksize=8k will
- write 16 sectors with each command. fio will still generate 8k of data
- for each command but only the first 512 bytes will be used and
- transferred to the device. The writefua option is ignored with this
- selection.
-
-.. option:: http_host=str : [http]
-
- Hostname to connect to. For S3, this could be the bucket hostname.
- Default is **localhost**
-
-.. option:: http_user=str : [http]
-
- Username for HTTP authentication.
-
-.. option:: http_pass=str : [http]
-
- Password for HTTP authentication.
-
-.. option:: https=str : [http]
-
- Enable HTTPS instead of http. *on* enables HTTPS; *insecure*
- will enable HTTPS, but disable SSL peer verification (use with
- caution!). Default is **off**
-
-.. option:: http_mode=str : [http]
-
- Which HTTP access mode to use: *webdav*, *swift*, or *s3*.
- Default is **webdav**
-
-.. option:: http_s3_region=str : [http]
-
- The S3 region/zone string.
- Default is **us-east-1**
-
-.. option:: http_s3_key=str : [http]
-
- The S3 secret key.
-
-.. option:: http_s3_keyid=str : [http]
-
- The S3 key/access id.
-
-.. option:: http_swift_auth_token=str : [http]
-
- The Swift auth token. See the example configuration file on how
- to retrieve this.
-
-.. option:: http_verbose=int : [http]
-
- Enable verbose requests from libcurl. Useful for debugging. 1
- turns on verbose logging from libcurl, 2 additionally enables
- HTTP IO tracing. Default is **0**
-
-.. option:: uri=str : [nbd]
-
- Specify the NBD URI of the server to test. The string
- is a standard NBD URI
- (see https://github.com/NetworkBlockDevice/nbd/tree/master/doc).
- Example URIs: nbd://localhost:10809
- nbd+unix:///?socket=/tmp/socket
- nbds://tlshost/exportname
-
-I/O depth
-~~~~~~~~~
-
-.. option:: iodepth=int
-
- Number of I/O units to keep in flight against the file. Note that
- increasing *iodepth* beyond 1 will not affect synchronous ioengines (except
- for small degrees when :option:`verify_async` is in use). Even async
- engines may impose OS restrictions causing the desired depth not to be
- achieved. This may happen on Linux when using libaio and not setting
- :option:`direct`\=1, since buffered I/O is not async on that OS. Keep an
- eye on the I/O depth distribution in the fio output to verify that the
- achieved depth is as expected. Default: 1.
-
-.. option:: iodepth_batch_submit=int, iodepth_batch=int
-
- This defines how many pieces of I/O to submit at once. It defaults to 1
- which means that we submit each I/O as soon as it is available, but can be
- raised to submit bigger batches of I/O at the time. If it is set to 0 the
- :option:`iodepth` value will be used.
-
-.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int
-
- This defines how many pieces of I/O to retrieve at once. It defaults to 1
- which means that we'll ask for a minimum of 1 I/O in the retrieval process
- from the kernel. The I/O retrieval will go on until we hit the limit set by
- :option:`iodepth_low`. If this variable is set to 0, then fio will always
- check for completed events before queuing more I/O. This helps reduce I/O
- latency, at the cost of more retrieval system calls.
-
-.. option:: iodepth_batch_complete_max=int
-
- This defines maximum pieces of I/O to retrieve at once. This variable should
- be used along with :option:`iodepth_batch_complete_min`\=int variable,
- specifying the range of min and max amount of I/O which should be
- retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
- value.
-
- Example #1::
-
- iodepth_batch_complete_min=1
- iodepth_batch_complete_max=<iodepth>
-
- which means that we will retrieve at least 1 I/O and up to the whole
- submitted queue depth. If none of I/O has been completed yet, we will wait.
-
- Example #2::
-
- iodepth_batch_complete_min=0
- iodepth_batch_complete_max=<iodepth>
-
- which means that we can retrieve up to the whole submitted queue depth, but
- if none of I/O has been completed yet, we will NOT wait and immediately exit
- the system call. In this example we simply do polling.
-
-.. option:: iodepth_low=int
-
- The low water mark indicating when to start filling the queue
- again. Defaults to the same as :option:`iodepth`, meaning that fio will
- attempt to keep the queue full at all times. If :option:`iodepth` is set to
- e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of
- 16 requests, it will let the depth drain down to 4 before starting to fill
- it again.
-
-.. option:: serialize_overlap=bool
-
- Serialize in-flight I/Os that might otherwise cause or suffer from data races.
- When two or more I/Os are submitted simultaneously, there is no guarantee that
- the I/Os will be processed or completed in the submitted order. Further, if
- two or more of those I/Os are writes, any overlapping region between them can
- become indeterminate/undefined on certain storage. These issues can cause
- verification to fail erratically when at least one of the racing I/Os is
- changing data and the overlapping region has a non-zero size. Setting
- ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
- serializing in-flight I/Os that have a non-zero overlap. Note that setting
- this option can reduce both performance and the :option:`iodepth` achieved.
-
- This option only applies to I/Os issued for a single job except when it is
- enabled along with :option:`io_submit_mode`\=offload. In offload mode, fio
- will check for overlap among all I/Os submitted by offload jobs with :option:`serialize_overlap`
- enabled.
-
- Default: false.
-
-.. option:: io_submit_mode=str
-
- This option controls how fio submits the I/O to the I/O engine. The default
- is `inline`, which means that the fio job threads submit and reap I/O
- directly. If set to `offload`, the job threads will offload I/O submission
- to a dedicated pool of I/O threads. This requires some coordination and thus
- has a bit of extra overhead, especially for lower queue depth I/O where it
- can increase latencies. The benefit is that fio can manage submission rates
- independently of the device completion rates. This avoids skewed latency
- reporting if I/O gets backed up on the device side (the coordinated omission
- problem). Note that this option cannot reliably be used with async IO
- engines.
-
-
-I/O rate
-~~~~~~~~
-
-.. option:: thinktime=time
-
- Stall the job for the specified period of time after an I/O has completed before issuing the
- next. May be used to simulate processing being done by an application.
- When the unit is omitted, the value is interpreted in microseconds. See
- :option:`thinktime_blocks` and :option:`thinktime_spin`.
-
-.. option:: thinktime_spin=time
-
- Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
- something with the data received, before falling back to sleeping for the
- rest of the period specified by :option:`thinktime`. When the unit is
- omitted, the value is interpreted in microseconds.
-
-.. option:: thinktime_blocks=int
-
- Only valid if :option:`thinktime` is set - control how many blocks to issue,
- before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make
- fio wait :option:`thinktime` usecs after every block. This effectively makes any
- queue depth setting redundant, since no more than 1 I/O will be queued
- before we have to complete it and do our :option:`thinktime`. In other words, this
- setting effectively caps the queue depth if the latter is larger.
-
-.. option:: rate=int[,int][,int]
-
- Cap the bandwidth used by this job. The number is in bytes/sec, the normal
- suffix rules apply. Comma-separated values may be specified for reads,
- writes, and trims as described in :option:`blocksize`.
-
- For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to
- 500KiB/sec. Capping only reads or writes can be done with `rate=,500k` or
- `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the
- latter will only limit reads.
-
-.. option:: rate_min=int[,int][,int]
-
- Tell fio to do whatever it can to maintain at least this bandwidth. Failing
- to meet this requirement will cause the job to exit. Comma-separated values
- may be specified for reads, writes, and trims as described in
- :option:`blocksize`.
-
-.. option:: rate_iops=int[,int][,int]
-
- Cap the bandwidth to this number of IOPS. Basically the same as
- :option:`rate`, just specified independently of bandwidth. If the job is
- given a block size range instead of a fixed value, the smallest block size
- is used as the metric. Comma-separated values may be specified for reads,
- writes, and trims as described in :option:`blocksize`.
-
-.. option:: rate_iops_min=int[,int][,int]
-
- If fio doesn't meet this rate of I/O, it will cause the job to exit.
- Comma-separated values may be specified for reads, writes, and trims as
- described in :option:`blocksize`.
-
-.. option:: rate_process=str
-
- This option controls how fio manages rated I/O submissions. The default is
- `linear`, which submits I/O in a linear fashion with fixed delays between
- I/Os that gets adjusted based on I/O completion rates. If this is set to
- `poisson`, fio will submit I/O based on a more real world random request
- flow, known as the Poisson process
- (https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be
- 10^6 / IOPS for the given workload.
-
-.. option:: rate_ignore_thinktime=bool
-
- By default, fio will attempt to catch up to the specified rate setting,
- if any kind of thinktime setting was used. If this option is set, then
- fio will ignore the thinktime and continue doing IO at the specified
- rate, instead of entering a catch-up mode after thinktime is done.
-
-
-I/O latency
-~~~~~~~~~~~
-
-.. option:: latency_target=time
-
- If set, fio will attempt to find the max performance point that the given
- workload will run at while maintaining a latency below this target. When
- the unit is omitted, the value is interpreted in microseconds. See
- :option:`latency_window` and :option:`latency_percentile`.
-
-.. option:: latency_window=time
-
- Used with :option:`latency_target` to specify the sample window that the job
- is run at varying queue depths to test the performance. When the unit is
- omitted, the value is interpreted in microseconds.
-
-.. option:: latency_percentile=float
-
- The percentage of I/Os that must fall within the criteria specified by
- :option:`latency_target` and :option:`latency_window`. If not set, this
- defaults to 100.0, meaning that all I/Os must be equal to or below the value
- set by :option:`latency_target`.
-
-.. option:: latency_run=bool
-
- Used with :option:`latency_target`. If false (default), fio will find
- the highest queue depth that meets :option:`latency_target` and exit. If
- true, fio will continue running and try to meet :option:`latency_target`
- by adjusting queue depth.
-
-.. option:: max_latency=time
-
- If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
- maximum latency. When the unit is omitted, the value is interpreted in
- microseconds.
-
-.. option:: rate_cycle=int
-
- Average bandwidth for :option:`rate` and :option:`rate_min` over this number
- of milliseconds. Defaults to 1000.
-
-
-I/O replay
-~~~~~~~~~~
-
-.. option:: write_iolog=str
-
- Write the issued I/O patterns to the specified file. See
- :option:`read_iolog`. Specify a separate file for each job, otherwise the
- iologs will be interspersed and the file may be corrupt.
-
-.. option:: read_iolog=str
-
- Open an iolog with the specified filename and replay the I/O patterns it
- contains. This can be used to store a workload and replay it sometime
- later. The iolog given may also be a blktrace binary file, which allows fio
- to replay a workload captured by :command:`blktrace`. See
- :manpage:`blktrace(8)` for how to capture such logging data. For blktrace
- replay, the file needs to be turned into a blkparse binary data file first
- (``blkparse <device> -o /dev/null -d file_for_fio.bin``).
- You can specify a number of files by separating the names with a ':'
- character. See the :option:`filename` option for information on how to
- escape ':' characters within the file names. These files will
- be sequentially assigned to job clones created by :option:`numjobs`.
- '-' is a reserved name, meaning read from stdin, notably if
- :option:`filename` is set to '-' which means stdin as well, then
- this flag can't be set to '-'.
-
-.. option:: read_iolog_chunked=bool
-
- Determines how iolog is read. If false (default), the entire :option:`read_iolog`
- will be read at once. If selected true, input from iolog will be read
- gradually. Useful when iolog is very large, or it is generated.
-
-.. option:: merge_blktrace_file=str
-
- When specified, rather than replaying the logs passed to :option:`read_iolog`,
- the logs go through a merge phase which aggregates them into a single
- blktrace. The resulting file is then passed on as the :option:`read_iolog`
- parameter. The intention here is to make the order of events consistent.
- This limits the influence of the scheduler compared to replaying multiple
- blktraces via concurrent jobs.
-
-.. option:: merge_blktrace_scalars=float_list
-
- This is a percentage based option that is index paired with the list of
- files passed to :option:`read_iolog`. When merging is performed, scale
- the time of each event by the corresponding amount. For example,
- ``--merge_blktrace_scalars="50:100"`` runs the first trace in halftime
- and the second trace in realtime. This knob is separately tunable from
- :option:`replay_time_scale` which scales the trace during runtime and
- does not change the output of the merge unlike this option.
-
-.. option:: merge_blktrace_iters=float_list
-
- This is a whole number option that is index paired with the list of files
- passed to :option:`read_iolog`. When merging is performed, run each trace
- for the specified number of iterations. For example,
- ``--merge_blktrace_iters="2:1"`` runs the first trace for two iterations
- and the second trace for one iteration.
-
-.. option:: replay_no_stall=bool
-
- When replaying I/O with :option:`read_iolog` the default behavior is to
- attempt to respect the timestamps within the log and replay them with the
- appropriate delay between IOPS. By setting this variable fio will not
- respect the timestamps and attempt to replay them as fast as possible while
- still respecting ordering. The result is the same I/O pattern to a given
- device, but different timings.
-
-.. option:: replay_time_scale=int
-
- When replaying I/O with :option:`read_iolog`, fio will honor the
- original timing in the trace. With this option, it's possible to scale
- the time. It's a percentage option, if set to 50 it means run at 50%
- the original IO rate in the trace. If set to 200, run at twice the
- original IO rate. Defaults to 100.
-
-.. option:: replay_redirect=str
-
- While replaying I/O patterns using :option:`read_iolog` the default behavior
- is to replay the IOPS onto the major/minor device that each IOP was recorded
- from. This is sometimes undesirable because on a different machine those
- major/minor numbers can map to a different device. Changing hardware on the
- same system can also result in a different major/minor mapping.
- ``replay_redirect`` causes all I/Os to be replayed onto the single specified
- device regardless of the device it was recorded
- from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O
- in the blktrace or iolog to be replayed onto :file:`/dev/sdc`. This means
- multiple devices will be replayed onto a single device, if the trace
- contains multiple devices. If you want multiple devices to be replayed
- concurrently to multiple redirected devices you must blkparse your trace
- into separate traces and replay them with independent fio invocations.
- Unfortunately this also breaks the strict time ordering between multiple
- device accesses.
-
-.. option:: replay_align=int
-
- Force alignment of the byte offsets in a trace to this value. The value
- must be a power of 2.
-
-.. option:: replay_scale=int
-
- Scale byte offsets down by this factor when replaying traces. Should most
- likely use :option:`replay_align` as well.
-
-.. option:: replay_skip=str
-
- Sometimes it's useful to skip certain IO types in a replay trace.
- This could be, for instance, eliminating the writes in the trace.
- Or not replaying the trims/discards, if you are redirecting to
- a device that doesn't support them. This option takes a comma
- separated list of read, write, trim, sync.
-
-
-Threads, processes and job synchronization
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. option:: thread
-
- Fio defaults to creating jobs by using fork, however if this option is
- given, fio will create jobs by using POSIX Threads' function
- :manpage:`pthread_create(3)` to create threads instead.
-
-.. option:: wait_for=str
-
- If set, the current job won't be started until all workers of the specified
- waitee job are done.
-
- ``wait_for`` operates on the job name basis, so there are a few
- limitations. First, the waitee must be defined prior to the waiter job
- (meaning no forward references). Second, if a job is being referenced as a
- waitee, it must have a unique name (no duplicate waitees).
-
-.. option:: nice=int
-
- Run the job with the given nice value. See man :manpage:`nice(2)`.
-
- On Windows, values less than -15 set the process class to "High"; -1 through
- -15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
- priority class.
-
-.. option:: prio=int
-
- Set the I/O priority value of this job. Linux limits us to a positive value
- between 0 and 7, with 0 being the highest. See man
- :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating
- systems since meaning of priority may differ. For per-command priority
- setting, see I/O engine specific `cmdprio_percentage` and `hipri_percentage`
- options.
-
-.. option:: prioclass=int
-
- Set the I/O priority class. See man :manpage:`ionice(1)`. For per-command
- priority setting, see I/O engine specific `cmdprio_percentage` and
- `hipri_percentage` options.
-
-.. option:: cpus_allowed=str
-
- Controls the same options as :option:`cpumask`, but accepts a textual
- specification of the permitted CPUs instead and CPUs are indexed from 0. So
- to use CPUs 0 and 5 you would specify ``cpus_allowed=0,5``. This option also
- allows a range of CPUs to be specified -- say you wanted a binding to CPUs
- 0, 5, and 8 to 15, you would set ``cpus_allowed=0,5,8-15``.
-
- On Windows, when ``cpus_allowed`` is unset only CPUs from fio's current
- processor group will be used and affinity settings are inherited from the
- system. An fio build configured to target Windows 7 makes options that set
- CPUs processor group aware and values will set both the processor group
- and a CPU from within that group. For example, on a system where processor
- group 0 has 40 CPUs and processor group 1 has 32 CPUs, ``cpus_allowed``
- values between 0 and 39 will bind CPUs from processor group 0 and
- ``cpus_allowed`` values between 40 and 71 will bind CPUs from processor
- group 1. When using ``cpus_allowed_policy=shared`` all CPUs specified by a
- single ``cpus_allowed`` option must be from the same processor group. For
- Windows fio builds not built for Windows 7, CPUs will only be selected from
- (and be relative to) whatever processor group fio happens to be running in
- and CPUs from other processor groups cannot be used.
-
-.. option:: cpus_allowed_policy=str
-
- Set the policy of how fio distributes the CPUs specified by
- :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported:
-
- **shared**
- All jobs will share the CPU set specified.
- **split**
- Each job will get a unique CPU from the CPU set.
-
- **shared** is the default behavior, if the option isn't specified. If
- **split** is specified, then fio will assign one cpu per job. If not
- enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
- in the set.
-
-.. option:: cpumask=int
-
- Set the CPU affinity of this job. The parameter given is a bit mask of
- allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
- and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
- :manpage:`sched_setaffinity(2)`. This may not work on all supported
- operating systems or kernel versions. This option doesn't work well for a
- higher CPU count than what you can store in an integer mask, so it can only
- control cpus 1-32. For boxes with larger CPU counts, use
- :option:`cpus_allowed`.
-
-.. option:: numa_cpu_nodes=str
-
- Set this job running on specified NUMA nodes' CPUs. The arguments allow
- comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable
- NUMA options support, fio must be built on a system with libnuma-dev(el)
- installed.
-
-.. option:: numa_mem_policy=str
-
- Set this job's memory policy and corresponding NUMA nodes. Format of the
- arguments::
-
- <mode>[:<nodelist>]
-
- ``mode`` is one of the following memory policies: ``default``, ``prefer``,
- ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory
- policies, no node needs to be specified. For ``prefer``, only one node is
- allowed. For ``bind`` and ``interleave`` the ``nodelist`` may be as
- follows: a comma delimited list of numbers, A-B ranges, or `all`.
-
-.. option:: cgroup=str
-
- Add job to this control group. If it doesn't exist, it will be created. The
- system must have a mounted cgroup blkio mount point for this to work. If
- your system doesn't have it mounted, you can do so with::
-
- # mount -t cgroup -o blkio none /cgroup
-
-.. option:: cgroup_weight=int
-
- Set the weight of the cgroup to this value. See the documentation that comes
- with the kernel, allowed values are in the range of 100..1000.
-
-.. option:: cgroup_nodelete=bool
-
- Normally fio will delete the cgroups it has created after the job
- completion. To override this behavior and to leave cgroups around after the
- job completion, set ``cgroup_nodelete=1``. This can be useful if one wants
- to inspect various cgroup files after job completion. Default: false.
-
-.. option:: flow_id=int
-
- The ID of the flow. If not specified, it defaults to being a global
- flow. See :option:`flow`.
-
-.. option:: flow=int
-
- Weight in token-based flow control. If this value is used, then there is a
- 'flow counter' which is used to regulate the proportion of activity between
- two or more jobs. Fio attempts to keep this flow counter near zero. The
- ``flow`` parameter stands for how much should be added or subtracted to the
- flow counter on each iteration of the main I/O loop. That is, if one job has
- ``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8
- ratio in how much one runs vs the other.
-
-.. option:: flow_sleep=int
-
- The period of time, in microseconds, to wait after the flow counter
- has exceeded its proportion before retrying operations.
-
-.. option:: stonewall, wait_for_previous
-
- Wait for preceding jobs in the job file to exit, before starting this
- one. Can be used to insert serialization points in the job file. A stone
- wall also implies starting a new reporting group, see
- :option:`group_reporting`.
-
-.. option:: exitall
-
- By default, fio will continue running all other jobs when one job finishes.
- Sometimes this is not the desired action. Setting ``exitall`` will instead
- make fio terminate all jobs in the same group, as soon as one job of that
- group finishes.
-
-.. option:: exit_what
-
- By default, fio will continue running all other jobs when one job finishes.
- Sometimes this is not the desired action. Setting ``exitall`` will
- instead make fio terminate all jobs in the same group. The option
- ``exit_what`` allows you to control which jobs get terminated when ``exitall`` is
- enabled. The default is ``group`` and does not change the behaviour of
- ``exitall``. The setting ``all`` terminates all jobs. The setting ``stonewall``
- terminates all currently running jobs across all groups and continues execution
- with the next stonewalled group.
-
-.. option:: exec_prerun=str
-
- Before running this job, issue the command specified through
- :manpage:`system(3)`. Output is redirected in a file called
- :file:`jobname.prerun.txt`.
-
-.. option:: exec_postrun=str
-
- After the job completes, issue the command specified through
- :manpage:`system(3)`. Output is redirected in a file called
- :file:`jobname.postrun.txt`.
-
-.. option:: uid=int
-
- Instead of running as the invoking user, set the user ID to this value
- before the thread/process does any work.
-
-.. option:: gid=int
-
- Set group ID, see :option:`uid`.
-
-
-Verification
-~~~~~~~~~~~~
-
-.. option:: verify_only
-
- Do not perform specified workload, only verify data still matches previous
- invocation of this workload. This option allows one to check data multiple
- times at a later date without overwriting it. This option makes sense only
- for workloads that write data, and does not support workloads with the
- :option:`time_based` option set.
-
-.. option:: do_verify=bool
-
- Run the verify phase after a write phase. Only valid if :option:`verify` is
- set. Default: true.
-
-.. option:: verify=str
-
- If writing to a file, fio can verify the file contents after each iteration
- of the job. Each verification method also implies verification of special
- header, which is written to the beginning of each block. This header also
- includes meta information, like offset of the block, block number, timestamp
- when block was written, etc. :option:`verify` can be combined with
- :option:`verify_pattern` option. The allowed values are:
-
- **md5**
- Use an md5 sum of the data area and store it in the header of
- each block.
-
- **crc64**
- Use an experimental crc64 sum of the data area and store it in the
- header of each block.
-
- **crc32c**
- Use a crc32c sum of the data area and store it in the header of
- each block. This will automatically use hardware acceleration
- (e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
- fall back to software crc32c if none is found. Generally the
- fastest checksum fio supports when hardware accelerated.
-
- **crc32c-intel**
- Synonym for crc32c.
-
- **crc32**
- Use a crc32 sum of the data area and store it in the header of each
- block.
-
- **crc16**
- Use a crc16 sum of the data area and store it in the header of each
- block.
-
- **crc7**
- Use a crc7 sum of the data area and store it in the header of each
- block.
-
- **xxhash**
- Use xxhash as the checksum function. Generally the fastest software
- checksum that fio supports.
-
- **sha512**
- Use sha512 as the checksum function.
-
- **sha256**
- Use sha256 as the checksum function.
-
- **sha1**
- Use optimized sha1 as the checksum function.
-
- **sha3-224**
- Use optimized sha3-224 as the checksum function.
-
- **sha3-256**
- Use optimized sha3-256 as the checksum function.
-
- **sha3-384**
- Use optimized sha3-384 as the checksum function.
-
- **sha3-512**
- Use optimized sha3-512 as the checksum function.
-
- **meta**
- This option is deprecated, since now meta information is included in
- generic verification header and meta verification happens by
- default. For detailed information see the description of the
- :option:`verify` setting. This option is kept because of
- compatibility's sake with old configurations. Do not use it.
-
- **pattern**
- Verify a strict pattern. Normally fio includes a header with some
- basic information and checksumming, but if this option is set, only
- the specific pattern set with :option:`verify_pattern` is verified.
-
- **null**
- Only pretend to verify. Useful for testing internals with
- :option:`ioengine`\=null, not for much else.
-
- This option can be used for repeated burn-in tests of a system to make sure
- that the written data is also correctly read back. If the data direction
- given is a read or random read, fio will assume that it should verify a
- previously written file. If the data direction includes any form of write,
- the verify will be of the newly written data.
-
- To avoid false verification errors, do not use the norandommap option when
- verifying data with async I/O engines and I/O depths > 1. Or use the
- norandommap and the lfsr random generator together to avoid writing to the
- same offset with multiple outstanding I/Os.
-
-.. option:: verify_offset=int
-
- Swap the verification header with data somewhere else in the block before
- writing. It is swapped back before verifying.
-
-.. option:: verify_interval=int
-
- Write the verification header at a finer granularity than the
- :option:`blocksize`. It will be written for chunks the size of
- ``verify_interval``. :option:`blocksize` should be a multiple of this value.
-
-.. option:: verify_pattern=str
-
- If set, fio will fill the I/O buffers with this pattern. Fio defaults to
- filling with totally random bytes, but sometimes it's interesting to fill
- with a known pattern for I/O verification purposes. Depending on the width
- of the pattern, fio will fill 1/2/3/4 bytes of the buffer at the time (it can
- be either a decimal or a hex number). The ``verify_pattern`` if larger than
- a 32-bit quantity has to be a hex number that starts with either "0x" or
- "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o
- format, which means that for each block offset will be written and then
- verified back, e.g.::
-
- verify_pattern=%o
-
- Or use combination of everything::
-
- verify_pattern=0xff%o"abcd"-12
-
-.. option:: verify_fatal=bool
-
- Normally fio will keep checking the entire contents before quitting on a
- block verification failure. If this option is set, fio will exit the job on
- the first observed failure. Default: false.
-
-.. option:: verify_dump=bool
-
- If set, dump the contents of both the original data block and the data block
- we read off disk to files. This allows later analysis to inspect just what
- kind of data corruption occurred. Off by default.
-
-.. option:: verify_async=int
-
- Fio will normally verify I/O inline from the submitting thread. This option
- takes an integer describing how many async offload threads to create for I/O
- verification instead, causing fio to offload the duty of verifying I/O
- contents to one or more separate threads. If using this offload option, even
- sync I/O engines can benefit from using an :option:`iodepth` setting higher
- than 1, as it allows them to have I/O in flight while verifies are running.
- Defaults to 0 async threads, i.e. verification is not asynchronous.
-
-.. option:: verify_async_cpus=str
-
- Tell fio to set the given CPU affinity on the async I/O verification
- threads. See :option:`cpus_allowed` for the format used.
-
-.. option:: verify_backlog=int
-
- Fio will normally verify the written contents of a job that utilizes verify
- once that job has completed. In other words, everything is written then
- everything is read back and verified. You may want to verify continually
- instead for a variety of reasons. Fio stores the meta data associated with
- an I/O block in memory, so for large verify workloads, quite a bit of memory
- would be used up holding this meta data. If this option is enabled, fio will
- write only N blocks before verifying these blocks.
-
-.. option:: verify_backlog_batch=int
-
- Control how many blocks fio will verify if :option:`verify_backlog` is
- set. If not set, will default to the value of :option:`verify_backlog`
- (meaning the entire queue is read back and verified). If
- ``verify_backlog_batch`` is less than :option:`verify_backlog` then not all
- blocks will be verified, if ``verify_backlog_batch`` is larger than
- :option:`verify_backlog`, some blocks will be verified more than once.
-
-.. option:: verify_state_save=bool
-
- When a job exits during the write phase of a verify workload, save its
- current state. This allows fio to replay up until that point, if the verify
- state is loaded for the verify read phase. The format of the filename is,
- roughly::
-
- <type>-<jobname>-<jobindex>-verify.state.
-
- <type> is "local" for a local run, "sock" for a client/server socket
- connection, and "ip" (192.168.0.1, for instance) for a networked
- client/server connection. Defaults to true.
-
-.. option:: verify_state_load=bool
-
- If a verify termination trigger was used, fio stores the current write state
- of each thread. This can be used at verification time so that fio knows how
- far it should verify. Without this information, fio will run a full
- verification pass, according to the settings in the job file used. Default
- false.
-
-.. option:: trim_percentage=int
-
- Number of verify blocks to discard/trim.
-
-.. option:: trim_verify_zero=bool
-
- Verify that trim/discarded blocks are returned as zeros.
-
-.. option:: trim_backlog=int
-
- Trim after this number of blocks are written.
-
-.. option:: trim_backlog_batch=int
-
- Trim this number of I/O blocks.
-
-.. option:: experimental_verify=bool
-
- Enable experimental verification.
-
-Steady state
-~~~~~~~~~~~~
-
-.. option:: steadystate=str:float, ss=str:float
-
- Define the criterion and limit for assessing steady state performance. The
- first parameter designates the criterion whereas the second parameter sets
- the threshold. When the criterion falls below the threshold for the
- specified duration, the job will stop. For example, `iops_slope:0.1%` will
- direct fio to terminate the job when the least squares regression slope
- falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled
- this will apply to all jobs in the group. Below is the list of available
- steady state assessment criteria. All assessments are carried out using only
- data from the rolling collection window. Threshold limits can be expressed
- as a fixed value or as a percentage of the mean in the collection window.
-
- When using this feature, most jobs should include the :option:`time_based`
- and :option:`runtime` options or the :option:`loops` option so that fio does not
- stop running after it has covered the full size of the specified file(s) or device(s).
-
- **iops**
- Collect IOPS data. Stop the job if all individual IOPS measurements
- are within the specified limit of the mean IOPS (e.g., ``iops:2``
- means that all individual IOPS values must be within 2 of the mean,
- whereas ``iops:0.2%`` means that all individual IOPS values must be
- within 0.2% of the mean IOPS to terminate the job).
-
- **iops_slope**
- Collect IOPS data and calculate the least squares regression
- slope. Stop the job if the slope falls below the specified limit.
-
- **bw**
- Collect bandwidth data. Stop the job if all individual bandwidth
- measurements are within the specified limit of the mean bandwidth.
-
- **bw_slope**
- Collect bandwidth data and calculate the least squares regression
- slope. Stop the job if the slope falls below the specified limit.
-
-.. option:: steadystate_duration=time, ss_dur=time
-
- A rolling window of this duration will be used to judge whether steady state
- has been reached. Data will be collected once per second. The default is 0
- which disables steady state detection. When the unit is omitted, the
- value is interpreted in seconds.
-
-.. option:: steadystate_ramp_time=time, ss_ramp=time
-
- Allow the job to run for the specified duration before beginning data
- collection for checking the steady state job termination criterion. The
- default is 0. When the unit is omitted, the value is interpreted in seconds.
-
-
-Measurements and reporting
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. option:: per_job_logs=bool
-
- If set, this generates bw/clat/iops logs with per-job private filenames. If
- not set, jobs with identical names will share the log filename. Default:
- true.
-
-.. option:: group_reporting
-
- It may sometimes be interesting to display statistics for groups of jobs as
- a whole instead of for each individual job. This is especially true if
- :option:`numjobs` is used; looking at individual thread/process output
- quickly becomes unwieldy. To see the final report per-group instead of
- per-job, use :option:`group_reporting`. Jobs in a file will be part of the
- same reporting group, unless separated by a :option:`stonewall`, or by
- using :option:`new_group`.
-
-.. option:: new_group
-
- Start a new reporting group. See: :option:`group_reporting`. If not given,
- all jobs in a file will be part of the same reporting group, unless
- separated by a :option:`stonewall`.
-
-.. option:: stats=bool
-
- By default, fio collects and shows final output results for all jobs
- that run. If this option is set to 0, then fio will ignore this job in
- the final stat output.
-
-.. option:: write_bw_log=str
-
- If given, write a bandwidth log for this job. Can be used to store data of
- the bandwidth of the jobs in their lifetime.
-
- If no str argument is given, the default filename of
- :file:`jobname_type.x.log` is used. Even when the argument is given, fio
- will still append the type of log. So if one specifies::
-
- write_bw_log=foo
-
- The actual log name will be :file:`foo_bw.x.log` where `x` is the index
- of the job (`1..N`, where `N` is the number of jobs). If
- :option:`per_job_logs` is false, then the filename will not include the
- `.x` job index.
-
- The included :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
- text files into nice graphs. See `Log File Formats`_ for how data is
- structured within the file.
-
-.. option:: write_lat_log=str
-
- Same as :option:`write_bw_log`, except this option creates I/O
- submission (e.g., :file:`name_slat.x.log`), completion (e.g.,
- :file:`name_clat.x.log`), and total (e.g., :file:`name_lat.x.log`)
- latency files instead. See :option:`write_bw_log` for details about
- the filename format and `Log File Formats`_ for how data is structured
- within the files.
-
-.. option:: write_hist_log=str
-
- Same as :option:`write_bw_log` but writes an I/O completion latency
- histogram file (e.g., :file:`name_hist.x.log`) instead. Note that this
- file will be empty unless :option:`log_hist_msec` has also been set.
- See :option:`write_bw_log` for details about the filename format and
- `Log File Formats`_ for how data is structured within the file.
-
-.. option:: write_iops_log=str
-
- Same as :option:`write_bw_log`, but writes an IOPS file (e.g.
- :file:`name_iops.x.log`) instead. Because fio defaults to individual
- I/O logging, the value entry in the IOPS log will be 1 unless windowed
- logging (see :option:`log_avg_msec`) has been enabled. See
- :option:`write_bw_log` for details about the filename format and `Log
- File Formats`_ for how data is structured within the file.
-
-.. option:: log_avg_msec=int
-
- By default, fio will log an entry in the iops, latency, or bw log for every
- I/O that completes. When writing to the disk log, that can quickly grow to a
- very large size. Setting this option makes fio average each log entry
- over the specified period of time, reducing the resolution of the log. See
- :option:`log_max_value` as well. Defaults to 0, logging all entries.
- Also see `Log File Formats`_.
-
-.. option:: log_hist_msec=int
-
- Same as :option:`log_avg_msec`, but logs entries for completion latency
- histograms. Computing latency percentiles from averages of intervals using
- :option:`log_avg_msec` is inaccurate. Setting this option makes fio log
- histogram entries over the specified period of time, reducing log sizes for
- high IOPS devices while retaining percentile accuracy. See
- :option:`log_hist_coarseness` and :option:`write_hist_log` as well.
- Defaults to 0, meaning histogram logging is disabled.
-
-.. option:: log_hist_coarseness=int
-
- Integer ranging from 0 to 6, defining the coarseness of the resolution of
- the histogram logs enabled with :option:`log_hist_msec`. For each increment
- in coarseness, fio outputs half as many bins. Defaults to 0, for which
- histogram logs contain 1216 latency bins. See :option:`write_hist_log`
- and `Log File Formats`_.
-
-.. option:: log_max_value=bool
-
- If :option:`log_avg_msec` is set, fio logs the average over that window. If
- you instead want to log the maximum value, set this option to 1. Defaults to
- 0, meaning that averaged values are logged.
-
-.. option:: log_offset=bool
-
- If this is set, the iolog options will include the byte offset for the I/O
- entry as well as the other data values. Defaults to 0 meaning that
- offsets are not present in logs. Also see `Log File Formats`_.
-
-.. option:: log_compression=int
-
- If this is set, fio will compress the I/O logs as it goes, to keep the
- memory footprint lower. When a log reaches the specified size, that chunk is
- removed and compressed in the background. Given that I/O logs are fairly
- highly compressible, this yields a nice memory savings for longer runs. The
- downside is that the compression will consume some background CPU cycles, so
- it may impact the run. This, however, is also true if the logging ends up
- consuming most of the system memory. So pick your poison. The I/O logs are
- saved normally at the end of a run, by decompressing the chunks and storing
- them in the specified log file. This feature depends on the availability of
- zlib.
-
-.. option:: log_compression_cpus=str
-
- Define the set of CPUs that are allowed to handle online log compression for
- the I/O jobs. This can provide better isolation between performance
- sensitive jobs, and background compression work. See
- :option:`cpus_allowed` for the format used.
-
-.. option:: log_store_compressed=bool
-
- If set, fio will store the log files in a compressed format. They can be
- decompressed with fio, using the :option:`--inflate-log` command line
- parameter. The files will be stored with a :file:`.fz` suffix.
-
-.. option:: log_unix_epoch=bool
-
- If set, fio will log Unix timestamps to the log files produced by enabling
- write_type_log for each log type, instead of the default zero-based
- timestamps.
-
-.. option:: block_error_percentiles=bool
-
- If set, record errors in trim block-sized units from writes and trims and
- output a histogram of how many trims it took to get to errors, and what kind
- of error was encountered.
-
-.. option:: bwavgtime=int
-
- Average the calculated bandwidth over the given time. Value is specified in
- milliseconds. If the job also does bandwidth logging through
- :option:`write_bw_log`, then the minimum of this option and
- :option:`log_avg_msec` will be used. Default: 500ms.
-
-.. option:: iopsavgtime=int
-
- Average the calculated IOPS over the given time. Value is specified in
- milliseconds. If the job also does IOPS logging through
- :option:`write_iops_log`, then the minimum of this option and
- :option:`log_avg_msec` will be used. Default: 500ms.
-
-.. option:: disk_util=bool
-
- Generate disk utilization statistics, if the platform supports it.
- Default: true.
-
-.. option:: disable_lat=bool
-
- Disable measurements of total latency numbers. Useful only for cutting back
- the number of calls to :manpage:`gettimeofday(2)`, as that does impact
- performance at really high IOPS rates. Note that to really get rid of a
- large amount of these calls, this option must be used with
- :option:`disable_slat` and :option:`disable_bw_measurement` as well.
-
-.. option:: disable_clat=bool
-
- Disable measurements of completion latency numbers. See
- :option:`disable_lat`.
-
-.. option:: disable_slat=bool
-
- Disable measurements of submission latency numbers. See
- :option:`disable_lat`.
-
-.. option:: disable_bw_measurement=bool, disable_bw=bool
-
- Disable measurements of throughput/bandwidth numbers. See
- :option:`disable_lat`.
-
-.. option:: slat_percentiles=bool
-
- Report submission latency percentiles. Submission latency is not recorded
- for synchronous ioengines.
-
-.. option:: clat_percentiles=bool
-
- Report completion latency percentiles.
-
-.. option:: lat_percentiles=bool
-
- Report total latency percentiles. Total latency is the sum of submission
- latency and completion latency.
-
-.. option:: percentile_list=float_list
-
- Overwrite the default list of percentiles for latencies and the block error
- histogram. Each number is a floating point number in the range (0,100], and
- the maximum length of the list is 20. Use ``:`` to separate the numbers. For
- example, ``--percentile_list=99.5:99.9`` will cause fio to report the
- latency durations below which 99.5% and 99.9% of the observed latencies fell,
- respectively.
-
-.. option:: significant_figures=int
-
- If using :option:`--output-format` of `normal`, set the significant
- figures to this value. Higher values will yield more precise IOPS and
- throughput units, while lower values will round. Requires a minimum
- value of 1 and a maximum value of 10. Defaults to 4.
-
-
-Error handling
-~~~~~~~~~~~~~~
-
-.. option:: exitall_on_error
-
- When one job finishes in error, terminate the rest. The default is to wait
- for each job to finish.
-
-.. option:: continue_on_error=str
-
- Normally fio will exit the job on the first observed failure. If this option
- is set, fio will continue the job when there is a 'non-fatal error' (EIO or
- EILSEQ) until the runtime is exceeded or the I/O size specified is
- completed. If this option is used, there are two more stats that are
- appended, the total error count and the first error. The error field given
- in the stats is the first error that was hit during the run.
-
- The allowed values are:
-
- **none**
- Exit on any I/O or verify errors.
-
- **read**
- Continue on read errors, exit on all others.
-
- **write**
- Continue on write errors, exit on all others.
-
- **io**
- Continue on any I/O error, exit on all others.
-
- **verify**
- Continue on verify errors, exit on all others.
-
- **all**
- Continue on all errors.
-
- **0**
- Backward-compatible alias for 'none'.
-
- **1**
- Backward-compatible alias for 'all'.
-
-.. option:: ignore_error=str
-
- Sometimes you want to ignore some errors during a test. In that case you
- can specify an error list for each error type, instead of only being able
- to ignore the default 'non-fatal error' using :option:`continue_on_error`.
- The format is ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``,
- where the errors for a given error type are separated with ':'. An error
- may be a symbol ('ENOSPC', 'ENOMEM') or an integer. Example::
-
- ignore_error=EAGAIN,ENOSPC:122
-
- This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
- WRITE. This option works by overriding :option:`continue_on_error` with
- the list of errors for each error type if any.
-
-.. option:: error_dump=bool
-
- If set, dump every error even if it is non-fatal. If disabled, only fatal
- errors will be dumped. Default: true.
-
-Running predefined workloads
-----------------------------
-
-Fio includes predefined profiles that mimic the I/O workloads generated by
-other tools.
-
-.. option:: profile=str
-
- The predefined workload to run. Current profiles are:
-
- **tiobench**
- Threaded I/O bench (tiotest/tiobench) like workload.
-
- **act**
- Aerospike Certification Tool (ACT) like workload.
-
-To view a profile's additional options use :option:`--cmdhelp` after specifying
-the profile. For example::
-
- $ fio --profile=act --cmdhelp
-
-Act profile options
-~~~~~~~~~~~~~~~~~~~
-
-.. option:: device-names=str
- :noindex:
-
- Devices to use.
-
-.. option:: load=int
- :noindex:
-
- ACT load multiplier. Default: 1.
-
-.. option:: test-duration=time
- :noindex:
-
- How long the entire test takes to run. When the unit is omitted, the value
- is given in seconds. Default: 24h.
-
-.. option:: threads-per-queue=int
- :noindex:
-
- Number of read I/O threads per device. Default: 8.
-
-.. option:: read-req-num-512-blocks=int
- :noindex:
-
- Number of 512B blocks to read at a time. Default: 3.
-
-.. option:: large-block-op-kbytes=int
- :noindex:
-
- Size of large block ops in KiB (writes). Default: 131072.
-
-.. option:: prep
- :noindex:
-
- Set to run ACT prep phase.
-
-Tiobench profile options
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. option:: size=str
- :noindex:
-
- Size in MiB.
-
-.. option:: block=int
- :noindex:
-
- Block size in bytes. Default: 4096.
-
-.. option:: numruns=int
- :noindex:
-
- Number of runs.
-
-.. option:: dir=str
- :noindex:
-
- Test directory.
-
-.. option:: threads=int
- :noindex:
-
- Number of threads.
-
-Interpreting the output
------------------------
-
-..
- Example output was based on the following:
- TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \
- --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \
- --runtime=2m --rw=rw
-
-Fio spits out a lot of output. While running, fio will display the status of the
-jobs created. An example of that would be::
-
- Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
-
-The characters inside the first set of square brackets denote the current status of
-each thread. The first character is the first job defined in the job file, and so
-forth. The possible values (in typical life cycle order) are:
-
-+------+-----+-----------------------------------------------------------+
-| Idle | Run | |
-+======+=====+===========================================================+
-| P | | Thread setup, but not started. |
-+------+-----+-----------------------------------------------------------+
-| C | | Thread created. |
-+------+-----+-----------------------------------------------------------+
-| I | | Thread initialized, waiting or generating necessary data. |
-+------+-----+-----------------------------------------------------------+
-| | p | Thread running pre-reading file(s). |
-+------+-----+-----------------------------------------------------------+
-| | / | Thread is in ramp period. |
-+------+-----+-----------------------------------------------------------+
-| | R | Running, doing sequential reads. |
-+------+-----+-----------------------------------------------------------+
-| | r | Running, doing random reads. |
-+------+-----+-----------------------------------------------------------+
-| | W | Running, doing sequential writes. |
-+------+-----+-----------------------------------------------------------+
-| | w | Running, doing random writes. |
-+------+-----+-----------------------------------------------------------+
-| | M | Running, doing mixed sequential reads/writes. |
-+------+-----+-----------------------------------------------------------+
-| | m | Running, doing mixed random reads/writes. |
-+------+-----+-----------------------------------------------------------+
-| | D | Running, doing sequential trims. |
-+------+-----+-----------------------------------------------------------+
-| | d | Running, doing random trims. |
-+------+-----+-----------------------------------------------------------+
-| | F | Running, currently waiting for :manpage:`fsync(2)`. |
-+------+-----+-----------------------------------------------------------+
-| | V | Running, doing verification of written data. |
-+------+-----+-----------------------------------------------------------+
-| f | | Thread finishing. |
-+------+-----+-----------------------------------------------------------+
-| E | | Thread exited, not reaped by main thread yet. |
-+------+-----+-----------------------------------------------------------+
-| _ | | Thread reaped. |
-+------+-----+-----------------------------------------------------------+
-| X | | Thread reaped, exited with an error. |
-+------+-----+-----------------------------------------------------------+
-| K | | Thread reaped, exited due to signal. |
-+------+-----+-----------------------------------------------------------+
-
-..
- Example output was based on the following:
- TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \
- --time_based --rate=2512k --bs=256K --numjobs=10 \
- --name=readers --rw=read --name=writers --rw=write
-
-Fio will condense the thread string so as not to take up more space on the command
-line than needed. For instance, if you have 10 readers and 10 writers running,
-the output would look like this::
-
- Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
-
-Note that the status string is displayed in order, so it's possible to tell which of
-the jobs are currently doing what. In the example above this means that jobs 1--10
-are readers and 11--20 are writers.
-
-The other values are fairly self explanatory -- number of threads currently
-running and doing I/O, the number of currently open files (f=), the estimated
-completion percentage, the rate of I/O since last check (read speed listed first,
-then write speed and optionally trim speed) in terms of bandwidth and IOPS,
-and time to completion for the current running group. It's impossible to estimate
-runtime of the following groups (if any).
-
-..
- Example output was based on the following:
- TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \
- --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \
- --bs=7K --name=Client1 --rw=write
-
-When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for
-each thread, group of threads, and disks in that order. For each overall thread (or
-group) the output looks like::
-
- Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
- write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
- slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
- clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
- lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
- clat percentiles (usec):
- | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363],
- | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445],
- | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
- | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
- | 99.99th=[78119]
- bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100
- iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100
- lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
- lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
- lat (msec) : 100=0.65%
- cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
- IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
- submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
- complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
- issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
- latency : target=0, window=0, percentile=100.00%, depth=8
-
-The job name (or first job's name when using :option:`group_reporting`) is printed,
-along with the group id, count of jobs being aggregated, last error id seen (which
-is 0 when there are no errors), pid/tid of that thread and the time the job/group
-completed. Below are the I/O statistics for each data direction performed (showing
-writes in the example above). In the order listed, they denote:
-
-**read/write/trim**
- The string before the colon shows the I/O direction the statistics
- are for. **IOPS** is the average I/Os performed per second. **BW**
- is the average bandwidth rate shown as: value in power of 2 format
- (value in power of 10 format). The last two values show: (**total
- I/O performed** in power of 2 format / **runtime** of that thread).
-
-**slat**
- Submission latency (**min** being the minimum, **max** being the
- maximum, **avg** being the average, **stdev** being the standard
- deviation). This is the time it took to submit the I/O. For
- sync I/O this row is not displayed as the slat is really the
- completion latency (since queue/complete is one operation there).
- This value can be in nanoseconds, microseconds or milliseconds ---
- fio will choose the most appropriate base and print that (in the
- example above nanoseconds was the best scale). Note: in :option:`--minimal` mode
- latencies are always expressed in microseconds.
-
-**clat**
- Completion latency. Same names as slat, this denotes the time from
- submission to completion of the I/O pieces. For sync I/O, clat will
- usually be equal (or very close) to 0, as the time from submit to
- complete is basically just CPU time (I/O has already been done, see slat
- explanation).
-
-**lat**
- Total latency. Same names as slat and clat, this denotes the time from
- when fio created the I/O unit to completion of the I/O operation.
-
-**bw**
- Bandwidth statistics based on samples. Same names as the xlat stats,
- but also includes the number of samples taken (**samples**) and an
- approximate percentage of total aggregate bandwidth this thread
- received in its group (**per**). This last value is only really
- useful if the threads in this group are on the same disk, since they
- are then competing for disk access.
-
-**iops**
- IOPS statistics based on samples. Same names as bw.
-
-**lat (nsec/usec/msec)**
- The distribution of I/O completion latencies. This is the time from when
- I/O leaves fio to when it gets completed. Unlike the separate
- read/write/trim sections above, the data here and in the remaining
- sections apply to all I/Os for the reporting group. 250=0.04% means that
- 0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
- of the I/Os required 250 to 499us for completion.
-
-**cpu**
- CPU usage. User and system time percentages, along with the number of
- context switches this thread went through, and
- finally the number of major and minor page faults. The CPU utilization
- numbers are averages for the jobs in that reporting group, while the
- context and fault counters are summed.
-
-**IO depths**
- The distribution of I/O depths over the job lifetime. The numbers are
- divided into powers of 2 and each entry covers depths from that value
- up to those that are lower than the next entry -- e.g., 16= covers
- depths from 16 to 31. Note that the range covered by a depth
- distribution entry can be different to the range covered by the
- equivalent submit/complete distribution entry.
-
-**IO submit**
- How many pieces of I/O were submitted in a single submit call. Each
- entry denotes that amount and below, until the previous entry -- e.g.,
- 16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
- call. Note that the range covered by a submit distribution entry can
- be different to the range covered by the equivalent depth distribution
- entry.
-
-**IO complete**
- Like the above submit number, but for completions instead.
-
-**IO issued rwt**
- The number of read/write/trim requests issued, and how many of them were
- short or dropped.
-
-**IO latency**
- These values are for :option:`latency_target` and related options. When
- these options are engaged, this section describes the I/O depth required
- to meet the specified latency target.
-
-..
- Example output was based on the following:
- TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \
- --rate_process=poisson --io_limit=32M --name=read --bs=128k \
- --rate=11M --name=write --rw=write --bs=2k --rate=700k
-
-After each client has been listed, the group statistics are printed. They
-will look like this::
-
- Run status group 0 (all jobs):
- READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec
- WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec
-
-For each data direction it prints:
-
-**bw**
- Aggregate bandwidth of threads in this group followed by the
- minimum and maximum bandwidth of all the threads in this group.
- Values outside of brackets are power-of-2 format and those
- within are the equivalent value in a power-of-10 format.
-**io**
- Aggregate I/O performed of all threads in this group. The
- format is the same as bw.
-**run**
- The smallest and longest runtimes of the threads in this group.
-
-And finally, the disk statistics are printed. This is Linux specific. They will look like this::
-
- Disk stats (read/write):
- sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
-
-Each value is printed for both reads and writes, with reads first. The
-numbers denote:
-
-**ios**
- Number of I/Os performed by all groups.
-**merge**
- Number of merges performed by the I/O scheduler.
-**ticks**
- Number of ticks we kept the disk busy.
-**in_queue**
- Total time spent in the disk queue.
-**util**
- The disk utilization. A value of 100% means we kept the disk
- busy constantly, 50% would be a disk idling half of the time.
-
-It is also possible to get fio to dump the current output while it is running,
-without terminating the job. To do that, send fio the **USR1** signal. You can
-also get regularly timed dumps by using the :option:`--status-interval`
-parameter, or by creating a file in :file:`/tmp` named
-:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the
-current output status.
-
-
-Terse output
-------------
-
-For scripted usage where you typically want to generate tables or graphs of the
-results, fio can output the results in a semicolon separated format. The format
-is one long line of values, such as::
-
- 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
- A description of this job goes here.
-
-The job description (if provided) follows on a second line for terse v2.
-It appears on the same line for other terse versions.
-
-To enable terse output, use the :option:`--minimal` or
-:option:`--output-format`\=terse command line options. The
-first value is the version of the terse output format. If the output has to be
-changed for some reason, this number will be incremented by 1 to signify that
-change.
-
-Split up, the format is as follows (comments in brackets denote when a
-field was introduced or whether it's specific to some terse version):
-
- ::
-
- terse version, fio version [v3], jobname, groupid, error
-
- READ status::
-
- Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
- Submission latency: min, max, mean, stdev (usec)
- Completion latency: min, max, mean, stdev (usec)
- Completion latency percentiles: 20 fields (see below)
- Total latency: min, max, mean, stdev (usec)
- Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
- IOPS [v5]: min, max, mean, stdev, number of samples
-
- WRITE status:
-
- ::
-
- Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
- Submission latency: min, max, mean, stdev (usec)
- Completion latency: min, max, mean, stdev (usec)
- Completion latency percentiles: 20 fields (see below)
- Total latency: min, max, mean, stdev (usec)
- Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
- IOPS [v5]: min, max, mean, stdev, number of samples
-
- TRIM status [all but version 3]:
-
- Fields are similar to READ/WRITE status.
-
- CPU usage::
-
- user, system, context switches, major faults, minor faults
-
- I/O depths::
-
- <=1, 2, 4, 8, 16, 32, >=64
-
- I/O latencies microseconds::
-
- <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
-
- I/O latencies milliseconds::
-
- <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
-
- Disk utilization [v3]::
-
- disk name, read ios, write ios, read merges, write merges, read ticks, write ticks,
- time spent in queue, disk utilization percentage
-
- Additional Info (dependent on continue_on_error, default off)::
-
- total # errors, first error code
-
- Additional Info (dependent on description being set)::
-
- Text description
-
-Completion latency percentiles can be a grouping of up to 20 sets, so for the
-terse output fio writes all of them. Each field will look like this::
-
- 1.00%=6112
-
-which is the Xth percentile, and the `usec` latency associated with it.
-
-For `Disk utilization`, all disks used by fio are shown. So for each disk there
-will be a disk utilization section.
-
-Below is a single line containing short names for each of the fields in the
-minimal output v3, separated by semicolons::
-
- terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
-
-In client/server mode terse output differs from what appears when jobs are run
-locally. Disk utilization data is omitted from the standard terse output and
-for v3 and later appears on its own separate line at the end of each terse
-reporting cycle.
-
-
-JSON output
-------------
-
-The `json` output format is intended to be both human readable and convenient
-for automated parsing. For the most part its sections mirror those of the
-`normal` output. The `runtime` value is reported in msec and the `bw` value is
-reported in 1024 bytes per second units.
-
-
-JSON+ output
-------------
-
-The `json+` output format is identical to the `json` output format except that it
-adds a full dump of the completion latency bins. Each `bins` object contains a
-set of (key, value) pairs where keys are latency durations and values count how
-many I/Os had completion latencies of the corresponding duration. For example,
-consider:
-
- "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
-
-This data indicates that one I/O required 87,552ns to complete, two I/Os required
-100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
-
-Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes
-json+ output and generates CSV-formatted latency data suitable for plotting.
-
-The latency durations actually represent the midpoints of latency intervals.
-For details refer to :file:`stat.h`.
-
-
-Trace file format
------------------
-
-There are two trace file formats that you can encounter. The older (v1) format is
-unsupported since version 1.20-rc3 (March 2008). It will still be described
-below in case that you get an old trace and want to understand it.
-
-In any case the trace is a simple text file with a single action per line.
-
-
-Trace file format v1
-~~~~~~~~~~~~~~~~~~~~
-
-Each line represents a single I/O action in the following format::
-
- rw, offset, length
-
-where `rw` is 0 for a read or 1 for a write, and the `offset` and `length` entries are in bytes.
-
-This format is not supported in fio versions >= 1.20-rc3.
-
-
-Trace file format v2
-~~~~~~~~~~~~~~~~~~~~
-
-The second version of the trace file format was added in fio version 1.17. It
-allows access to more than one file per trace and has a bigger set of possible
-file actions.
-
-The first line of the trace file has to be::
-
- fio version 2 iolog
-
-Following this can be lines in two different formats, which are described below.
-
-The file management format::
-
- filename action
-
-The `filename` is given as an absolute path. The `action` can be one of these:
-
-**add**
- Add the given `filename` to the trace.
-**open**
- Open the file with the given `filename`. The `filename` has to have
- been added with the **add** action before.
-**close**
- Close the file with the given `filename`. The file has to have been
- opened before.
-
-
-The file I/O action format::
-
- filename action offset length
-
-The `filename` is given as an absolute path, and has to have been added and
-opened before it can be used with this format. The `offset` and `length` are
-given in bytes. The `action` can be one of these:
-
-**wait**
- Wait for `offset` microseconds. Everything below 100 is discarded.
- The time is relative to the previous `wait` statement.
-**read**
- Read `length` bytes beginning from `offset`.
-**write**
- Write `length` bytes beginning from `offset`.
-**sync**
- :manpage:`fsync(2)` the file.
-**datasync**
- :manpage:`fdatasync(2)` the file.
-**trim**
- Trim the given file from the given `offset` for `length` bytes.
-
-
-I/O Replay - Merging Traces
----------------------------
-
-Colocation is a common practice used to get the most out of a machine.
-Knowing which workloads play nicely with each other and which ones don't is
-a much harder task. While fio can replay workloads concurrently via multiple
-jobs, it leaves some variability up to the scheduler making results harder to
-reproduce. Merging is a way to make the order of events consistent.
-
-Merging is integrated into I/O replay and done when a
-:option:`merge_blktrace_file` is specified. The list of files passed to
-:option:`read_iolog` goes through the merge process, producing a single file
-stored to the specified file. The output file is passed on as if it were the
-only file passed to :option:`read_iolog`. An example would look like::
-
- $ fio --read_iolog="<file1>:<file2>" --merge_blktrace_file="<output_file>"
-
-Creating only the merged file can be done by passing the command line argument
-:option:`--merge-blktrace-only`.
-
-Scaling traces can be done to see the relative impact of any particular trace
-being slowed down or sped up. :option:`merge_blktrace_scalars` takes in a colon
-separated list of percentage scalars. It is index paired with the files passed
-to :option:`read_iolog`.
-
-With scaling, it may be desirable to match the running time of all traces.
-This can be done with :option:`merge_blktrace_iters`. It is index paired with
-:option:`read_iolog` just like :option:`merge_blktrace_scalars`.
-
-In an example, given two traces, A and B, each 60s long. If we want to see
-the impact of trace A issuing IOs twice as fast and repeat trace A over the
-runtime of trace B, the following can be done::
-
- $ fio --read_iolog="<trace_a>:<trace_b>" --merge_blktrace_file="<output_file>" --merge_blktrace_scalars="50:100" --merge_blktrace_iters="2:1"
-
-This runs trace A at 2x the speed twice for approximately the same runtime as
-a single run of trace B.
-
-
-CPU idleness profiling
-----------------------
-
-In some cases, we want to understand CPU overhead in a test. For example, we
-test patches for the specific goodness of whether they reduce CPU usage.
-Fio implements a balloon approach to create a thread per CPU that runs at idle
-priority, meaning that it only runs when nobody else needs the cpu.
-By measuring the amount of work completed by the thread, idleness of each CPU
-can be derived accordingly.
-
-A unit of work is defined as touching a full page of unsigned characters. The mean and
-standard deviation of the time to complete a unit of work are reported in the "unit work"
-section. Options can be chosen to report detailed percpu idleness or overall
-system idleness by aggregating percpu stats.
-
-
-Verification and triggers
--------------------------
-
-Fio is usually run in one of two ways, when data verification is done. The first
-is a normal write job of some sort with verify enabled. When the write phase has
-completed, fio switches to reads and verifies everything it wrote. The second
-model is running just the write phase, and then later on running the same job
-(but with reads instead of writes) to repeat the same I/O patterns and verify
-the contents. Both of these methods depend on the write phase being completed,
-as fio otherwise has no idea how much data was written.
-
-With verification triggers, fio supports dumping the current write state to
-local files. Then a subsequent read verify workload can load this state and know
-exactly where to stop. This is useful for testing cases where power is cut to a
-server in a managed fashion, for instance.
-
-A verification trigger consists of two things:
-
-1) Storing the write state of each job.
-2) Executing a trigger command.
-
-The write state is relatively small, on the order of hundreds of bytes to single
-kilobytes. It contains information on the number of completions done, the last X
-completions, etc.
-
-A trigger is invoked either through creation ('touch') of a specified file in
-the system, or through a timeout setting. If fio is run with
-:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually
-check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
-will fire off the trigger (thus saving state, and executing the trigger
-command).
-
-For client/server runs, there's both a local and remote trigger. If fio is
-running as a server backend, it will send the job states back to the client for
-safe storage, then execute the remote trigger, if specified. If a local trigger
-is specified, the server will still send back the write state, but the client
-will then execute the trigger.
-
-Verification trigger example
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Let's say we want to run a powercut test on the remote Linux machine 'server'.
-Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
-some point during the run, and we'll run this test from the safety of our local
-machine, 'localbox'. On the server, we'll start the fio backend normally::
-
- server# fio --server
-
-and on the client, we'll fire off the workload::
-
- localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
-
-We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute::
-
- echo b > /proc/sysrq-trigger
-
-on the server once it has received the trigger and sent us the write state. This
-will work, but it's not **really** cutting power to the server, it's merely
-abruptly rebooting it. If we have a remote way of cutting power to the server
-through IPMI or similar, we could do that through a local trigger command
-instead. Let's assume we have a script that does IPMI reboot of a given hostname,
-ipmi-reboot. On localbox, we could then have run fio with a local trigger
-instead::
-
- localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
-
-For this case, fio would wait for the server to send us the write state, then
-execute ``ipmi-reboot server`` when that happened.
-
-Loading verify state
-~~~~~~~~~~~~~~~~~~~~
-
-To load stored write state, a read verification job file must contain the
-:option:`verify_state_load` option. If that is set, fio will load the previously
-stored state. For a local fio run this is done by loading the files directly,
-and on a client/server run, the server backend will ask the client to send the
-files over and load them from there.
-
-
-Log File Formats
-----------------
-
-Fio supports a variety of log file formats, for logging latencies, bandwidth,
-and IOPS. The logs share a common format, which looks like this:
-
- *time* (`msec`), *value*, *data direction*, *block size* (`bytes`),
- *offset* (`bytes`), *command priority*
-
-*Time* for the log entry is always in milliseconds. The *value* logged depends
-on the type of log, it will be one of the following:
-
- **Latency log**
- Value is latency in nsecs
- **Bandwidth log**
- Value is in KiB/sec
- **IOPS log**
- Value is IOPS
-
-*Data direction* is one of the following:
-
- **0**
- I/O is a READ
- **1**
- I/O is a WRITE
- **2**
- I/O is a TRIM
-
-The entry's *block size* is always in bytes. The *offset* is the position in bytes
-from the start of the file for that particular I/O. The logging of the offset can be
-toggled with :option:`log_offset`.
-
-*Command priority* is 0 for normal priority and 1 for high priority. This is controlled
-by the ioengine specific :option:`cmdprio_percentage`.
-
-Fio defaults to logging every individual I/O but when windowed logging is set
-through :option:`log_avg_msec`, either the average (by default) or the maximum
-(:option:`log_max_value` is set) *value* seen over the specified period of time
-is recorded. Each *data direction* seen within the window period will aggregate
-its values in a separate row. Further, when using windowed logging the *block
-size* and *offset* entries will always contain 0.
-
-
-Client/Server
--------------
-
-Normally fio is invoked as a stand-alone application on the machine where the
-I/O workload should be generated. However, the backend and frontend of fio can
-be run separately i.e., the fio server can generate an I/O workload on the "Device
-Under Test" while being controlled by a client on another machine.
-
-Start the server on the machine which has access to the storage DUT::
-
- $ fio --server=args
-
-where `args` defines what fio listens to. The arguments are of the form
-``type:hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
-v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
-*hostname* is either a hostname or IP address, and *port* is the port to listen
-to (only valid for TCP/IP, not a local socket). Some examples:
-
-1) ``fio --server``
-
- Start a fio server, listening on all interfaces on the default port (8765).
-
-2) ``fio --server=ip:hostname,4444``
-
- Start a fio server, listening on IP belonging to hostname and on port 4444.
-
-3) ``fio --server=ip6:::1,4444``
-
- Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
-
-4) ``fio --server=,4444``
-
- Start a fio server, listening on all interfaces on port 4444.
-
-5) ``fio --server=1.2.3.4``
-
- Start a fio server, listening on IP 1.2.3.4 on the default port.
-
-6) ``fio --server=sock:/tmp/fio.sock``
-
- Start a fio server, listening on the local socket :file:`/tmp/fio.sock`.
-
-Once a server is running, a "client" can connect to the fio server with::
-
- fio <local-args> --client=<server> <remote-args> <job file(s)>
-
-where `local-args` are arguments for the client where it is running, `server`
-is the connect string, and `remote-args` and `job file(s)` are sent to the
-server. The `server` string follows the same format as it does on the server
-side, to allow IP/hostname/socket and port strings.
-
-Fio can connect to multiple servers this way::
-
- fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
-
-If the job file is located on the fio server, then you can tell the server to
-load a local file as well. This is done by using :option:`--remote-config` ::
-
- fio --client=server --remote-config /path/to/file.fio
-
-Then fio will open this local (to the server) job file instead of being passed
-one from the client.
-
-If you have many servers (example: 100 VMs/containers), you can input a pathname
-of a file containing host IPs/names as the parameter value for the
-:option:`--client` option. For example, here is an example :file:`host.list`
-file containing 2 hostnames::
-
- host1.your.dns.domain
- host2.your.dns.domain
-
-The fio command would then be::
-
- fio --client=host.list <job file(s)>
-
-In this mode, you cannot input server-specific parameters or job files -- all
-servers receive the same job file.
-
-In order to let ``fio --client`` runs use a shared filesystem from multiple
-hosts, ``fio --client`` now prepends the IP address of the server to the
-filename. For example, if fio is using the directory :file:`/mnt/nfs/fio` and is
-writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile`
-containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
-192.168.10.121, then fio will create two files::
-
- /mnt/nfs/fio/192.168.10.120.fileio.tmp
- /mnt/nfs/fio/192.168.10.121.fileio.tmp
-
-Terse output in client/server mode will differ slightly from what is produced
-when fio is run in stand-alone mode. See the terse output section for details.
--- /dev/null
+How fio works
+-------------
+
+The first step in getting fio to simulate a desired I/O workload, is writing a
+job file describing that specific setup. A job file may contain any number of
+threads and/or files -- the typical contents of the job file is a *global*
+section defining shared parameters, and one or more job sections describing the
+jobs involved. When run, fio parses this file and sets everything up as
+described. If we break down a job from top to bottom, it contains the following
+basic parameters:
+
+`I/O type`_
+
+ Defines the I/O pattern issued to the file(s). We may only be reading
+ sequentially from this file(s), or we may be writing randomly. Or even
+ mixing reads and writes, sequentially or randomly.
+ Should we be doing buffered I/O, or direct/raw I/O?
+
+`Block size`_
+
+ In how large chunks are we issuing I/O? This may be a single value,
+ or it may describe a range of block sizes.
+
+`I/O size`_
+
+ How much data are we going to be reading/writing?
+
+`I/O engine`_
+
+ How do we issue I/O? We could be memory mapping the file, we could be
+ using regular read/write, we could be using splice, async I/O, or even
+ SG (SCSI generic sg).
+
+`I/O depth`_
+
+ If the I/O engine is async, how large a queuing depth do we want to
+ maintain?
+
+
+`Target file/device`_
+
+ How many files are we spreading the workload over?
+
+`Threads, processes and job synchronization`_
+
+ How many threads or processes should we spread this workload over?
+
+The above are the basic parameters defined for a workload; in addition, there's a
+multitude of parameters that modify other aspects of how this job behaves.
+
+
+Command line options
+--------------------
+
+.. option:: --debug=type
+
+ Enable verbose tracing `type` of various fio actions. May be ``all`` for all types
+ or individual types separated by a comma (e.g. ``--debug=file,mem`` will
+ enable file and memory debugging). Currently, additional logging is
+ available for:
+
+ *process*
+ Dump info related to processes.
+ *file*
+ Dump info related to file actions.
+ *io*
+ Dump info related to I/O queuing.
+ *mem*
+ Dump info related to memory allocations.
+ *blktrace*
+ Dump info related to blktrace setup.
+ *verify*
+ Dump info related to I/O verification.
+ *all*
+ Enable all debug options.
+ *random*
+ Dump info related to random offset generation.
+ *parse*
+ Dump info related to option matching and parsing.
+ *diskutil*
+ Dump info related to disk utilization updates.
+ *job:x*
+ Dump info only related to job number x.
+ *mutex*
+ Dump info only related to mutex up/down ops.
+ *profile*
+ Dump info related to profile extensions.
+ *time*
+ Dump info related to internal time keeping.
+ *net*
+ Dump info related to networking connections.
+ *rate*
+ Dump info related to I/O rate switching.
+ *compress*
+ Dump info related to log compress/decompress.
+ *steadystate*
+ Dump info related to steadystate detection.
+ *helperthread*
+ Dump info related to the helper thread.
+ *zbd*
+ Dump info related to support for zoned block devices.
+ *?* or *help*
+ Show available debug options.
+
+.. option:: --parse-only
+
+ Parse options only, don't start any I/O.
+
+.. option:: --merge-blktrace-only
+
+ Merge blktraces only, don't start any I/O.
+
+.. option:: --output=filename
+
+ Write output to file `filename`.
+
+.. option:: --output-format=format
+
+ Set the reporting `format` to `normal`, `terse`, `json`, or `json+`. Multiple
+ formats can be selected, separated by a comma. `terse` is a CSV based
+ format. `json+` is like `json`, except it adds a full dump of the latency
+ buckets.
+
+.. option:: --bandwidth-log
+
+ Generate aggregate bandwidth logs.
+
+.. option:: --minimal
+
+ Print statistics in a terse, semicolon-delimited format.
+
+.. option:: --append-terse
+
+ Print statistics in selected mode AND terse, semicolon-delimited format.
+ **Deprecated**, use :option:`--output-format` instead to select multiple
+ formats.
+
+.. option:: --terse-version=version
+
+ Set terse `version` output format (default 3, or 2 or 4 or 5).
+
+.. option:: --version
+
+ Print version information and exit.
+
+.. option:: --help
+
+ Print a summary of the command line options and exit.
+
+.. option:: --cpuclock-test
+
+ Perform test and validation of internal CPU clock.
+
+.. option:: --crctest=[test]
+
+ Test the speed of the built-in checksumming functions. If no argument is
+ given, all of them are tested. Alternatively, a comma separated list can
+ be passed, in which case the given ones are tested.
+
+.. option:: --cmdhelp=command
+
+ Print help information for `command`. May be ``all`` for all commands.
+
+.. option:: --enghelp=[ioengine[,command]]
+
+ List all commands defined by `ioengine`, or print help for `command`
+ defined by `ioengine`. If no `ioengine` is given, list all
+ available ioengines.
+
+.. option:: --showcmd
+
+ Convert given job files to a set of command-line options.
+
+.. option:: --readonly
+
+ Turn on safety read-only checks, preventing writes and trims. The
+ ``--readonly`` option is an extra safety guard to prevent users from
+ accidentally starting a write or trim workload when that is not desired.
+ Fio will only modify the device under test if
+ `rw=write/randwrite/rw/randrw/trim/randtrim/trimwrite` is given. This
+ safety net can be used as an extra precaution.
+
+.. option:: --eta=when
+
+ Specifies when real-time ETA estimate should be printed. `when` may be
+ `always`, `never` or `auto`. `auto` is the default, it prints ETA
+ when requested if the output is a TTY. `always` disregards the output
+ type, and prints ETA when requested. `never` never prints ETA.
+
+.. option:: --eta-interval=time
+
+ By default, fio requests client ETA status roughly every second. With
+ this option, the interval is configurable. Fio imposes a minimum
+ allowed time to avoid flooding the console, less than 250 msec is
+ not supported.
+
+.. option:: --eta-newline=time
+
+ Force a new line for every `time` period passed. When the unit is omitted,
+ the value is interpreted in seconds.
+
+.. option:: --status-interval=time
+
+ Force a full status dump of cumulative (from job start) values at `time`
+ intervals. This option does *not* provide per-period measurements. So
+ values such as bandwidth are running averages. When the time unit is omitted,
+ `time` is interpreted in seconds. Note that using this option with
+ ``--output-format=json`` will yield output that technically isn't valid
+ json, since the output will be collated sets of valid json. It will need
+ to be split into valid sets of json after the run.
+
+.. option:: --section=name
+
+ Only run specified section `name` in job file. Multiple sections can be specified.
+ The ``--section`` option allows one to combine related jobs into one file.
+ E.g. one job file could define light, moderate, and heavy sections. Tell
+ fio to run only the "heavy" section by giving ``--section=heavy``
+ command line option. One can also specify the "write" operations in one
+ section and "verify" operation in another section. The ``--section`` option
+ only applies to job sections. The reserved *global* section is always
+ parsed and used.
+
+.. option:: --alloc-size=kb
+
+ Allocate additional internal smalloc pools of size `kb` in KiB. The
+ ``--alloc-size`` option increases shared memory set aside for use by fio.
+ If running large jobs with randommap enabled, fio can run out of memory.
+ Smalloc is an internal allocator for shared structures from a fixed size
+ memory pool and can grow to 16 pools. The pool size defaults to 16MiB.
+
+ NOTE: While running :file:`.fio_smalloc.*` backing store files are visible
+ in :file:`/tmp`.
+
+.. option:: --warnings-fatal
+
+ All fio parser warnings are fatal, causing fio to exit with an
+ error.
+
+.. option:: --max-jobs=nr
+
+ Set the maximum number of threads/processes to support to `nr`.
+ NOTE: On Linux, it may be necessary to increase the shared-memory
+ limit (:file:`/proc/sys/kernel/shmmax`) if fio runs into errors while
+ creating jobs.
+
+.. option:: --server=args
+
+ Start a backend server, with `args` specifying what to listen to.
+ See `Client/Server`_ section.
+
+.. option:: --daemonize=pidfile
+
+ Background a fio server, writing the pid to the given `pidfile` file.
+
+.. option:: --client=hostname
+
+ Instead of running the jobs locally, send and run them on the given `hostname`
+ or set of `hostname`\s. See `Client/Server`_ section.
+
+.. option:: --remote-config=file
+
+ Tell fio server to load this local `file`.
+
+.. option:: --idle-prof=option
+
+ Report CPU idleness. `option` is one of the following:
+
+ **calibrate**
+ Run unit work calibration only and exit.
+
+ **system**
+ Show aggregate system idleness and unit work.
+
+ **percpu**
+ As **system** but also show per CPU idleness.
+
+.. option:: --inflate-log=log
+
+ Inflate and output compressed `log`.
+
+.. option:: --trigger-file=file
+
+ Execute trigger command when `file` exists.
+
+.. option:: --trigger-timeout=time
+
+ Execute trigger at this `time`.
+
+.. option:: --trigger=command
+
+ Set this `command` as local trigger.
+
+.. option:: --trigger-remote=command
+
+ Set this `command` as remote trigger.
+
+.. option:: --aux-path=path
+
+ Use the directory specified by `path` for generated state files instead
+ of the current working directory.
+
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will :option:`stonewall`
+execution between each group.
+
+
+Job file format
+---------------
+
+As previously described, fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global* which has special meaning. Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
+
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
+
+The :option:`--cmdhelp` option also lists all options. If used with a `command`
+argument, :option:`--cmdhelp` will detail the given `command`.
+
+See the `examples/` directory for inspiration on how to write job files. Note
+the copyright and license requirements currently apply to `examples/` files.
+
+So let's look at a really simple job file that defines two processes, each
+randomly reading from a 128MiB file:
+
+.. code-block:: ini
+
+ ; -- start job file --
+ [global]
+ rw=randread
+ size=128m
+
+ [job1]
+
+ [job2]
+
+ ; -- end job file --
+
+As you can see, the job file sections themselves are empty as all the described
+parameters are shared. As no :option:`filename` option is given, fio makes up a
+`filename` for each of the jobs as it sees fit. On the command line, this job
+would look as follows::
+
+    $ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
+
+
+Let's look at an example that has a number of processes writing randomly to
+files:
+
+.. code-block:: ini
+
+ ; -- start job file --
+ [random-writers]
+ ioengine=libaio
+ iodepth=4
+ rw=randwrite
+ bs=32k
+ direct=0
+ size=64m
+ numjobs=4
+ ; -- end job file --
+
+Here we have no *global* section, as we only have one job defined anyway. We
+want to use async I/O here, with a depth of 4 for each file. We also increased
+the buffer size used to 32KiB and define numjobs to 4 to fork 4 identical
+jobs. The result is 4 processes each randomly writing to their own 64MiB
+file. Instead of using the above job file, you could have given the parameters
+on the command line. For this case, you would specify::
+
+    $ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
+
+When fio is utilized as a basis of any reasonably large test suite, it might be
+desirable to share a set of standardized settings across multiple job files.
+Instead of copy/pasting such settings, any section may pull in an external
+:file:`filename.fio` file with *include filename* directive, as in the following
+example::
+
+ ; -- start job file including.fio --
+ [global]
+ filename=/tmp/test
+ filesize=1m
+ include glob-include.fio
+
+ [test]
+ rw=randread
+ bs=4k
+ time_based=1
+ runtime=10
+ include test-include.fio
+ ; -- end job file including.fio --
+
+.. code-block:: ini
+
+ ; -- start job file glob-include.fio --
+ thread=1
+ group_reporting=1
+ ; -- end job file glob-include.fio --
+
+.. code-block:: ini
+
+ ; -- start job file test-include.fio --
+ ioengine=libaio
+ iodepth=4
+ ; -- end job file test-include.fio --
+
+Settings pulled into a section apply to that section only (except *global*
+section). Include directives may be nested in that any included file may contain
+further include directive(s). Include files may not contain [] sections.
+
+
+Environment variables
+~~~~~~~~~~~~~~~~~~~~~
+
+Fio also supports environment variable expansion in job files. Any sub-string of
+the form ``${VARNAME}`` as part of an option value (in other words, on the right
+of the '='), will be expanded to the value of the environment variable called
+`VARNAME`. If no such environment variable is defined, or `VARNAME` is the
+empty string, the empty string will be substituted.
+
+As an example, let's look at a sample fio invocation and job file::
+
+ $ SIZE=64m NUMJOBS=4 fio jobfile.fio
+
+.. code-block:: ini
+
+ ; -- start job file --
+ [random-writers]
+ rw=randwrite
+ size=${SIZE}
+ numjobs=${NUMJOBS}
+ ; -- end job file --
+
+This will expand to the following equivalent job file at runtime:
+
+.. code-block:: ini
+
+ ; -- start job file --
+ [random-writers]
+ rw=randwrite
+ size=64m
+ numjobs=4
+ ; -- end job file --
+
+Fio ships with a few example job files; you can also look there for inspiration.
+
+Reserved keywords
+~~~~~~~~~~~~~~~~~
+
+Additionally, fio has a set of reserved keywords that will be replaced
+internally with the appropriate value. Those keywords are:
+
+**$pagesize**
+
+ The architecture page size of the running system.
+
+**$mb_memory**
+
+ Megabytes of total memory in the system.
+
+**$ncpus**
+
+ Number of online available CPUs.
+
+These can be used on the command line or in the job file, and will be
+automatically substituted with the current system values when the job is
+run. Simple math is also supported on these keywords, so you can perform actions
+like::
+
+ size=8*$mb_memory
+
+and get that properly expanded to 8 times the size of memory in the machine.
+
+
+Job file parameters
+-------------------
+
+This section describes in detail each parameter associated with a job. Some
+parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
+
+ - addition (+)
+ - subtraction (-)
+ - multiplication (*)
+ - division (/)
+ - modulus (%)
+ - exponentiation (^)
+
+For time values in expressions, units are microseconds by default. This is
+different than for time values not in expressions (not enclosed in
+parentheses). The following types are used:
+
+
+Parameter types
+~~~~~~~~~~~~~~~
+
+**str**
+ String: A sequence of alphanumeric characters.
+
+**time**
+ Integer with possible time suffix. Without a unit value is interpreted as
+ seconds unless otherwise specified. Accepts a suffix of 'd' for days, 'h' for
+ hours, 'm' for minutes, 's' for seconds, 'ms' (or 'msec') for milliseconds and
+ 'us' (or 'usec') for microseconds. For example, use 10m for 10 minutes.
+
+.. _int:
+
+**int**
+ Integer. A whole number value, which may contain an integer prefix
+ and an integer suffix:
+
+ [*integer prefix*] **number** [*integer suffix*]
+
+ The optional *integer prefix* specifies the number's base. The default
+ is decimal. *0x* specifies hexadecimal.
+
+ The optional *integer suffix* specifies the number's units, and includes an
+ optional unit prefix and an optional unit. For quantities of data, the
+ default unit is bytes. For quantities of time, the default unit is seconds
+ unless otherwise specified.
+
+ With :option:`kb_base`\=1000, fio follows international standards for unit
+ prefixes. To specify power-of-10 decimal values defined in the
+ International System of Units (SI):
+
+ * *K* -- means kilo (K) or 1000
+ * *M* -- means mega (M) or 1000**2
+ * *G* -- means giga (G) or 1000**3
+ * *T* -- means tera (T) or 1000**4
+ * *P* -- means peta (P) or 1000**5
+
+ To specify power-of-2 binary values defined in IEC 80000-13:
+
+ * *Ki* -- means kibi (Ki) or 1024
+ * *Mi* -- means mebi (Mi) or 1024**2
+ * *Gi* -- means gibi (Gi) or 1024**3
+ * *Ti* -- means tebi (Ti) or 1024**4
+ * *Pi* -- means pebi (Pi) or 1024**5
+
+ For Zone Block Device Mode:
+
+ * *z* -- means Zone
+
+ With :option:`kb_base`\=1024 (the default), the unit prefixes are opposite
+ from those specified in the SI and IEC 80000-13 standards to provide
+ compatibility with old scripts. For example, 4k means 4096.
+
+ For quantities of data, an optional unit of 'B' may be included
+ (e.g., 'kB' is the same as 'k').
+
+ The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+ not milli). 'b' and 'B' both mean byte, not bit.
+
+ Examples with :option:`kb_base`\=1000:
+
+ * *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+ * *1 MiB*: 1048576, 1mi, 1024ki
+ * *1 MB*: 1000000, 1m, 1000k
+ * *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
+ * *1 TB*: 1000000000000, 1t, 1000g, 1000000m
+
+ Examples with :option:`kb_base`\=1024 (default):
+
+ * *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+ * *1 MiB*: 1048576, 1m, 1024k
+ * *1 MB*: 1000000, 1mi, 1000ki
+ * *1 TiB*: 1099511627776, 1t, 1024g, 1048576m
+ * *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi
+
+ To specify times (units are not case sensitive):
+
+ * *D* -- means days
+ * *H* -- means hours
+ * *M* -- means minutes
+ * *s* -- or *sec* means seconds (default)
+ * *ms* -- or *msec* means milliseconds
+ * *us* -- or *usec* means microseconds
+
+ If the option accepts an upper and lower range, use a colon ':' or
+ minus '-' to separate such values. See :ref:`irange <irange>`.
+ If the lower value specified happens to be larger than the upper value
+ the two values are swapped.
+
+.. _bool:
+
+**bool**
+ Boolean. Usually parsed as an integer, however only defined for
+ true and false (1 and 0).
+
+.. _irange:
+
+**irange**
+ Integer range with suffix. Allows value range to be given, such as
+ 1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+ option allows two sets of ranges, they can be specified with a ',' or '/'
+ delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`.
+
+**float_list**
+ A list of floating point numbers, separated by a ':' character.
+
+With the above in mind, here follows the complete list of fio job parameters.
+
+
+Units
+~~~~~
+
+.. option:: kb_base=int
+
+ Select the interpretation of unit prefixes in input parameters.
+
+ **1000**
+ Inputs comply with IEC 80000-13 and the International
+ System of Units (SI). Use:
+
+ - power-of-2 values with IEC prefixes (e.g., KiB)
+ - power-of-10 values with SI prefixes (e.g., kB)
+
+ **1024**
+ Compatibility mode (default). To avoid breaking old scripts:
+
+ - power-of-2 values with SI prefixes
+ - power-of-10 values with IEC prefixes
+
+ See :option:`bs` for more details on input parameters.
+
+ Outputs always use correct prefixes. Most outputs include both
+ side-by-side, like::
+
+ bw=2383.3kB/s (2327.4KiB/s)
+
+ If only one value is reported, then kb_base selects the one to use:
+
+ **1000** -- SI prefixes
+
+ **1024** -- IEC prefixes
+
+.. option:: unit_base=int
+
+ Base unit for reporting. Allowed values are:
+
+ **0**
+ Use auto-detection (default).
+ **8**
+ Byte based.
+ **1**
+ Bit based.
+
+
+Job description
+~~~~~~~~~~~~~~~
+
+.. option:: name=str
+
+ ASCII name of the job. This may be used to override the name printed by fio
+ for this job. Otherwise the job name is used. On the command line this
+ parameter has the special purpose of also signaling the start of a new job.
+
+.. option:: description=str
+
+ Text description of the job. Doesn't do anything except dump this text
+ description when this job is run. It's not parsed.
+
+.. option:: loops=int
+
+ Run the specified number of iterations of this job. Used to repeat the same
+ workload a given number of times. Defaults to 1.
+
+.. option:: numjobs=int
+
+ Create the specified number of clones of this job. Each clone of the job
+ is spawned as an independent thread or process. May be used to set up a
+ larger number of threads/processes doing the same thing. Each thread is
+ reported separately; to see statistics for all clones as a whole, use
+ :option:`group_reporting` in conjunction with :option:`new_group`.
+ See :option:`--max-jobs`. Default: 1.
+
+
+Time related parameters
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: runtime=time
+
+ Limit runtime. The test will run until it completes the configured I/O
+ workload or until it has run for this specified amount of time, whichever
+ occurs first. It can be quite hard to determine for how long a specified
+ job will run, so this parameter is handy to cap the total runtime to a
+ given time. When the unit is omitted, the value is interpreted in
+ seconds.
+
+.. option:: time_based
+
+ If set, fio will run for the duration of the :option:`runtime` specified
+ even if the file(s) are completely read or written. It will simply loop over
+ the same workload as many times as the :option:`runtime` allows.
+
+.. option:: startdelay=irange(time)
+
+ Delay the start of the job for the specified amount of time. Can be a single
+ value or a range. When given as a range, each thread will choose a value
+ randomly from within the range. Value is in seconds if a unit is omitted.
+
+.. option:: ramp_time=time
+
+ If set, fio will run the specified workload for this amount of time before
+ logging any performance numbers. Useful for letting performance settle
+ before logging results, thus minimizing the runtime required for stable
+ results. Note that the ``ramp_time`` is considered lead in time for a job,
+ thus it will increase the total runtime if a special timeout or
+ :option:`runtime` is specified. When the unit is omitted, the value is
+ given in seconds.
+
+.. option:: clocksource=str
+
+ Use the given clocksource as the base of timing. The supported options are:
+
+ **gettimeofday**
+ :manpage:`gettimeofday(2)`
+
+ **clock_gettime**
+ :manpage:`clock_gettime(2)`
+
+ **cpu**
+ Internal CPU clock source
+
+ cpu is the preferred clocksource if it is reliable, as it is very fast (and
+ fio is heavy on time calls). Fio will automatically use this clocksource if
+ it's supported and considered reliable on the system it is running on,
+ unless another clocksource is specifically set. For x86/x86-64 CPUs, this
+ means supporting TSC Invariant.
+
+.. option:: gtod_reduce=bool
+
+ Enable all of the :manpage:`gettimeofday(2)` reducing options
+ (:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus
+ reduce precision of the timeout somewhat to really shrink the
+ :manpage:`gettimeofday(2)` call count. With this option enabled, we only do
+ about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all
+ time keeping was enabled.
+
+.. option:: gtod_cpu=int
+
+ Sometimes it's cheaper to dedicate a single thread of execution to just
+ getting the current time. Fio (and databases, for instance) are very
+ intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set
+ one CPU aside for doing nothing but logging current time to a shared memory
+ location. Then the other threads/processes that run I/O workloads need only
+ copy that segment, instead of entering the kernel with a
+ :manpage:`gettimeofday(2)` call. The CPU set aside for doing these time
+ calls will be excluded from other uses. Fio will manually clear it from the
+ CPU mask of other jobs.
+
+.. option:: job_start_clock_id=int
+
+ The clock_id passed to the call to `clock_gettime` used to record
+ job_start in the `json` output format. Default is 0, or CLOCK_REALTIME.
+
+
+Target file/device
+~~~~~~~~~~~~~~~~~~
+
+.. option:: directory=str
+
+ Prefix filenames with this directory. Used to place files in a different
+ location than :file:`./`. You can specify a number of directories by
+ separating the names with a ':' character. These directories will be
+ distributed equally among the job clones created by :option:`numjobs` as
+ long as they are using generated filenames. If specific `filename(s)` are
+ set, fio will use the first listed directory, thereby matching the
+ `filename` semantic (which generates a file for each clone if not
+ specified, but lets all clones use the same file if set).
+
+ See the :option:`filename` option for information on how to escape "``:``"
+ characters within the directory path itself.
+
+ Note: To control the directory fio will use for internal state files
+ use :option:`--aux-path`.
+
+.. option:: filename=str
+
+ Fio normally makes up a `filename` based on the job name, thread number, and
+ file number (see :option:`filename_format`). If you want to share files
+ between threads in a job or several
+ jobs with fixed file paths, specify a `filename` for each of them to override
+ the default. If the ioengine is file based, you can specify a number of files
+ by separating the names with a ':' colon. So if you wanted a job to open
+ :file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
+ ``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
+ specified, :option:`nrfiles` is ignored. The size of regular files specified
+ by this option will be :option:`size` divided by number of files unless an
+ explicit size is specified by :option:`filesize`.
+
+ Each colon in the wanted path must be escaped with a ``\``
+ character. For instance, if the path is :file:`/dev/dsk/foo@3,0:c` then you
+ would use ``filename=/dev/dsk/foo@3,0\:c`` and if the path is
+ :file:`F:\\filename` then you would use ``filename=F\:\filename``.
+
+ On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
+ the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
+ Note: Windows and FreeBSD (refer to geom(4)) prevent write access to areas
+ of the disk containing in-use data (e.g. filesystems).
+
+ The filename "`-`" is a reserved name, meaning *stdin* or *stdout*. Which
+ of the two depends on the read/write direction set.
+
+.. option:: filename_format=str
+
+ If sharing multiple files between jobs, it is usually necessary to have fio
+ generate the exact names that you want. By default, fio will name a file
+ based on the default file format specification of
+ :file:`jobname.jobnumber.filenumber`. With this option, that can be
+ customized. Fio will recognize and replace the following keywords in this
+ string:
+
+ **$jobname**
+ The name of the worker thread or process.
+ **$clientuid**
+ IP of the fio process when using client/server mode.
+ **$jobnum**
+ The incremental number of the worker thread or process.
+ **$filenum**
+ The incremental number of the file for that worker thread or
+ process.
+
+ To have dependent jobs share a set of files, this option can be set to have
+ fio generate filenames that are shared between the two. For instance, if
+ :file:`testfiles.$filenum` is specified, file number 4 for any job will be
+ named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum`
+ will be used if no other format specifier is given.
+
+ If you specify a path then the directories will be created up to the
+ main directory for the file. So for example if you specify
+ ``filename_format=a/b/c/$jobnum`` then the directories a/b/c will be
+ created before the file setup part of the job. If you specify
+ :option:`directory` then the path will be relative to that directory,
+ otherwise it is treated as the absolute path.
+
+.. option:: unique_filename=bool
+
+ To avoid collisions between networked clients, fio defaults to prefixing any
+ generated filenames (with a directory specified) with the source of the
+ client connecting. To disable this behavior, set this option to 0.
+
+.. option:: opendir=str
+
+ Recursively open any files below directory `str`. This accepts only a
+ single directory and unlike related options, colons appearing in the
+ path must not be escaped.
+
+.. option:: lockfile=str
+
+ Fio defaults to not locking any files before it does I/O to them. If a file
+ or file descriptor is shared, fio can serialize I/O to that file to make the
+ end result consistent. This is useful for emulating real workloads that share
+ files. The lock modes are:
+
+ **none**
+ No locking. The default.
+ **exclusive**
+ Only one thread or process may do I/O at a time, excluding all
+ others.
+ **readwrite**
+ Read-write locking on the file. Many readers may
+ access the file at the same time, but writes get exclusive access.
+
+.. option:: nrfiles=int
+
+ Number of files to use for this job. Defaults to 1. The size of files
+ will be :option:`size` divided by this unless explicit size is specified by
+ :option:`filesize`. Files are created for each thread separately, and each
+ file will have a file number within its name by default, as explained in
+ the :option:`filename` section.
+
+
+.. option:: openfiles=int
+
+ Number of files to keep open at the same time. Defaults to the same as
+ :option:`nrfiles`, can be set smaller to limit the number of simultaneous
+ opens.
+
+.. option:: file_service_type=str
+
+ Defines how fio decides which file from a job to service next. The following
+ types are defined:
+
+ **random**
+ Choose a file at random.
+
+ **roundrobin**
+ Round robin over opened files. This is the default.
+
+ **sequential**
+ Finish one file before moving on to the next. Multiple files can
+ still be open depending on :option:`openfiles`.
+
+ **zipf**
+ Use a *Zipf* distribution to decide what file to access.
+
+ **pareto**
+ Use a *Pareto* distribution to decide what file to access.
+
+ **normal**
+ Use a *Gaussian* (normal) distribution to decide what file to
+ access.
+
+ **gauss**
+ Alias for normal.
+
+ For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
+ tell fio how many I/Os to issue before switching to a new file. For example,
+ specifying ``file_service_type=random:8`` would cause fio to issue
+ 8 I/Os before selecting a new file at random. For the non-uniform
+ distributions, a floating point postfix can be given to influence how the
+ distribution is skewed. See :option:`random_distribution` for a description
+ of how that would work.
+
+.. option:: ioscheduler=str
+
+ Attempt to switch the device hosting the file to the specified I/O scheduler
+ before running.
+
+.. option:: create_serialize=bool
+
+ If true, serialize the file creation for the jobs. This may be handy to
+ avoid interleaving of data files, which may greatly depend on the filesystem
+ used and even the number of processors in the system. Default: true.
+
+.. option:: create_fsync=bool
+
+ :manpage:`fsync(2)` the data file after creation. This is the default.
+
+.. option:: create_on_open=bool
+
+ If true, don't pre-create files but allow the job's open() to create a file
+ when it's time to do I/O. Default: false -- pre-create all necessary files
+ when the job starts.
+
+.. option:: create_only=bool
+
+ If true, fio will only run the setup phase of the job. If files need to be
+ laid out or updated on disk, only that will be done -- the actual job contents
+ are not executed. Default: false.
+
+.. option:: allow_file_create=bool
+
+ If true, fio is permitted to create files as part of its workload. If this
+ option is false, then fio will error out if
+ the files it needs to use don't already exist. Default: true.
+
+.. option:: allow_mounted_write=bool
+
+ If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+ to what appears to be a mounted device or partition. This should help catch
+ creating inadvertently destructive tests, not realizing that the test will
+ destroy data on the mounted file system. Note that some platforms don't allow
+ writing against a mounted device regardless of this option. Default: false.
+
+.. option:: pre_read=bool
+
+ If this is given, files will be pre-read into memory before starting the
+ given I/O operation. This will also clear the :option:`invalidate` flag,
+ since it is pointless to pre-read and then drop the cache. This will only
+ work for I/O engines that are seek-able, since they allow you to read the
+ same data multiple times. Thus it will not work on non-seekable I/O engines
+ (e.g. network, splice). Default: false.
+
+.. option:: unlink=bool
+
+ Unlink (delete) the job files when done. Not the default, as repeated runs of that
+ job would then waste time recreating the file set again and again. Default:
+ false.
+
+.. option:: unlink_each_loop=bool
+
+ Unlink (delete) job files after each iteration or loop. Default: false.
+
+.. option:: zonemode=str
+
+ Accepted values are:
+
+ **none**
+ The :option:`zonerange`, :option:`zonesize`,
+ :option:`zonecapacity` and :option:`zoneskip`
+ parameters are ignored.
+ **strided**
+ I/O happens in a single zone until
+ :option:`zonesize` bytes have been transferred.
+ After that number of bytes has been
+ transferred processing of the next zone
+ starts. :option:`zonecapacity` is ignored.
+ **zbd**
+ Zoned block device mode. I/O happens
+ sequentially in each zone, even if random I/O
+ has been selected. Random I/O happens across
+ all zones instead of being restricted to a
+ single zone. The :option:`zoneskip` parameter
+ is ignored. :option:`zonerange` and
+ :option:`zonesize` must be identical.
+ Trim is handled using a zone reset operation.
+ Trim only considers non-empty sequential write
+ required and sequential write preferred zones.
+
+.. option:: zonerange=int
+
+ Size of a single zone. See also :option:`zonesize` and
+ :option:`zoneskip`.
+
+.. option:: zonesize=int
+
+ For :option:`zonemode` =strided, this is the number of bytes to
+ transfer before skipping :option:`zoneskip` bytes. If this parameter
+ is smaller than :option:`zonerange` then only a fraction of each zone
+ with :option:`zonerange` bytes will be accessed. If this parameter is
+ larger than :option:`zonerange` then each zone will be accessed
+ multiple times before skipping to the next zone.
+
+ For :option:`zonemode` =zbd, this is the size of a single zone. The
+ :option:`zonerange` parameter is ignored in this mode.
+
+
+.. option:: zonecapacity=int
+
+ For :option:`zonemode` =zbd, this defines the capacity of a single zone,
+ which is the accessible area starting from the zone start address.
+ This parameter only applies when using :option:`zonemode` =zbd in
+ combination with regular block devices. If not specified it defaults to
+ the zone size. If the target device is a zoned block device, the zone
+ capacity is obtained from the device information and this option is
+ ignored.
+
+.. option:: zoneskip=int
+
+ For :option:`zonemode` =strided, the number of bytes to skip after
+ :option:`zonesize` bytes of data have been transferred. This parameter
+ must be zero for :option:`zonemode` =zbd.
+
+.. option:: read_beyond_wp=bool
+
+ This parameter applies to :option:`zonemode` =zbd only.
+
+ Zoned block devices are block devices that consist of multiple zones.
+ Each zone has a type, e.g. conventional or sequential. A conventional
+ zone can be written at any offset that is a multiple of the block
+ size. Sequential zones must be written sequentially. The position at
+ which a write must occur is called the write pointer. A zoned block
+ device can be either drive managed, host managed or host aware. For
+ host managed devices the host must ensure that writes happen
+ sequentially. Fio recognizes host managed devices and serializes
+ writes to sequential zones for these devices.
+
+ If a read occurs in a sequential zone beyond the write pointer then
+ the zoned block device will complete the read without reading any data
+ from the storage medium. Since such reads lead to unrealistically high
+ bandwidth and IOPS numbers fio only reads beyond the write pointer if
+ explicitly told to do so. Default: false.
+
+.. option:: max_open_zones=int
+
+ When a zone of a zoned block device is partially written (i.e. not all
+ sectors of the zone have been written), the zone is in one of three
+ conditions: 'implicit open', 'explicit open' or 'closed'. Zoned block
+ devices may have a limit called 'max_open_zones' (same name as the
+ parameter) on the total number of zones that can simultaneously be in
+ the 'implicit open' or 'explicit open' conditions. Zoned block devices
+ may have another limit called 'max_active_zones', on the total number of
+ zones that can simultaneously be in the three conditions. The
+ :option:`max_open_zones` parameter limits the number of zones to which
+ write commands are issued by all fio jobs, that is, limits the number of
+ zones that will be in the conditions. When the device has the
+ max_open_zones limit and does not have the max_active_zones limit, the
+ :option:`max_open_zones` parameter limits the number of zones in the two
+ open conditions up to the limit. In this case, fio includes zones in the
+ two open conditions to the write target zones at fio start. When the
+ device has both the max_open_zones and the max_active_zones limits, the
+ :option:`max_open_zones` parameter limits the number of zones in the
+ three conditions up to the limit. In this case, fio includes zones in
+ the three conditions to the write target zones at fio start.
+
+ This parameter is relevant only if the :option:`zonemode` =zbd is used.
+ The default value is always equal to the max_open_zones limit of the
+ target zoned block device and a value higher than this limit cannot be
+ specified by users unless the option :option:`ignore_zone_limits` is
+ specified. When :option:`ignore_zone_limits` is specified or the target
+ device does not have the max_open_zones limit, :option:`max_open_zones`
+ can specify 0 to disable any limit on the number of zones that can be
+ simultaneously written to by all jobs.
+
+.. option:: job_max_open_zones=int
+
+ In the same manner as :option:`max_open_zones`, limit the number of open
+ zones per fio job, that is, the number of zones that a single job can
+ simultaneously write to. A value of zero indicates no limit.
+ Default: zero.
+
+.. option:: ignore_zone_limits=bool
+
+ If this option is used, fio will ignore the maximum number of open
+ zones limit of the zoned block device in use, thus allowing the
+ option :option:`max_open_zones` value to be larger than the device
+ reported limit. Default: false.
+
+.. option:: zone_reset_threshold=float
+
+ A number between zero and one that indicates the ratio of written bytes
+ in the zones with write pointers in the IO range to the size of the IO
+ range. When current ratio is above this ratio, zones are reset
+ periodically as :option:`zone_reset_frequency` specifies. If there are
+ multiple jobs when using this option, the IO range for all write jobs
+ has to be the same.
+
+.. option:: zone_reset_frequency=float
+
+ A number between zero and one that indicates how often a zone reset
+ should be issued if the zone reset threshold has been exceeded. A zone
+ reset is submitted after each (1 / zone_reset_frequency) write
+ requests. This and the previous parameter can be used to simulate
+ garbage collection activity.
+
+
+I/O type
+~~~~~~~~
+
+.. option:: direct=bool
+
+ If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
+ OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
+ ioengines don't support direct I/O. Default: false.
+
+.. option:: buffered=bool
+
+ If value is true, use buffered I/O. This is the opposite of the
+ :option:`direct` option. Defaults to true.
+
+.. option:: readwrite=str, rw=str
+
+ Type of I/O pattern. Accepted values are:
+
+ **read**
+ Sequential reads.
+ **write**
+ Sequential writes.
+ **trim**
+ Sequential trims (Linux block devices and SCSI
+ character devices only).
+ **randread**
+ Random reads.
+ **randwrite**
+ Random writes.
+ **randtrim**
+ Random trims (Linux block devices and SCSI
+ character devices only).
+ **rw,readwrite**
+ Sequential mixed reads and writes.
+ **randrw**
+ Random mixed reads and writes.
+ **trimwrite**
+ Sequential trim+write sequences. Blocks will be trimmed first,
+ then the same blocks will be written to. So if ``io_size=64K``
+ is specified, Fio will trim a total of 64K bytes and also
+ write 64K bytes on the same trimmed blocks. This behaviour
+ will be consistent with ``number_ios`` or other Fio options
+ limiting the total bytes or number of I/O's.
+ **randtrimwrite**
+ Like trimwrite, but uses random offsets rather
+ than sequential writes.
+
+ Fio defaults to read if the option is not specified. For the mixed I/O
+ types, the default is to split them 50/50. For certain types of I/O the
+ result may still be skewed a bit, since the speed may be different.
+
+ It is possible to specify the number of I/Os to do before getting a new
+ offset by appending ``:<nr>`` to the end of the string given. For a
+ random read, it would look like ``rw=randread:8`` for passing in an offset
+ modifier with a value of 8. If the suffix is used with a sequential I/O
+ pattern, then the *<nr>* value specified will be **added** to the generated
+ offset for each I/O turning sequential I/O into sequential I/O with holes.
+ For instance, using ``rw=write:4k`` will skip 4k for every write. Also see
+ the :option:`rw_sequencer` option.
+
+.. option:: rw_sequencer=str
+
+ If an offset modifier is given by appending a number to the ``rw=<str>``
+ line, then this option controls how that number modifies the I/O offset
+ being generated. Accepted values are:
+
+ **sequential**
+ Generate sequential offset.
+ **identical**
+ Generate the same offset.
+
+ ``sequential`` is only useful for random I/O, where fio would normally
+ generate a new random offset for every I/O. If you append e.g. 8 to
+ randread, i.e. ``rw=randread:8`` you would get a new random offset for
+ every 8 I/Os. The result would be a sequence of 8 sequential offsets
+ with a random starting point. However this behavior may change if a
+ sequential I/O reaches end of the file. As sequential I/O is already
+ sequential, setting ``sequential`` for that would not result in any
+ difference. ``identical`` behaves in a similar fashion, except it sends
+ the same offset 8 times before generating a new offset.
+
+ Example #1::
+
+ rw=randread:8
+ rw_sequencer=sequential
+ bs=4k
+
+ The generated sequence of offsets will look like this:
+ 4k, 8k, 12k, 16k, 20k, 24k, 28k, 32k, 92k, 96k, 100k, 104k, 108k,
+ 112k, 116k, 120k, 48k, 52k ...
+
+ Example #2::
+
+ rw=randread:8
+ rw_sequencer=identical
+ bs=4k
+
+ The generated sequence of offsets will look like this:
+ 4k, 4k, 4k, 4k, 4k, 4k, 4k, 4k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 92k,
+ 48k, 48k, 48k ...
+
+.. option:: unified_rw_reporting=str
+
+ Fio normally reports statistics on a per data direction basis, meaning that
+ reads, writes, and trims are accounted and reported separately. This option
+ determines whether fio reports the results normally, summed together, or as
+ both options.
+ Accepted values are:
+
+ **none**
+ Normal statistics reporting.
+
+ **mixed**
+ Statistics are summed per data direction and reported together.
+
+ **both**
+ Statistics are reported normally, followed by the mixed statistics.
+
+ **0**
+ Backward-compatible alias for **none**.
+
+ **1**
+ Backward-compatible alias for **mixed**.
+
+ **2**
+ Alias for **both**.
+
+.. option:: randrepeat=bool
+
+ Seed all random number generators in a predictable way so the pattern
+ is repeatable across runs. Default: true.
+
+.. option:: allrandrepeat=bool
+
+ Alias for :option:`randrepeat`. Default: true.
+
+.. option:: randseed=int
+
+ Seed the random number generators based on this seed value, to be able to
+ control what sequence of output is being generated. If not set, the random
+ sequence depends on the :option:`randrepeat` setting.
+
+.. option:: fallocate=str
+
+ Whether pre-allocation is performed when laying down files.
+ Accepted values are:
+
+ **none**
+ Do not pre-allocate space.
+
+ **native**
+ Use a platform's native pre-allocation call but fall back to
+ **none** behavior if it fails/is not implemented.
+
+ **posix**
+ Pre-allocate via :manpage:`posix_fallocate(3)`.
+
+ **keep**
+ Pre-allocate via :manpage:`fallocate(2)` with
+ FALLOC_FL_KEEP_SIZE set.
+
+ **truncate**
+ Extend file to final size via :manpage:`ftruncate(2)`
+ instead of allocating.
+
+ **0**
+ Backward-compatible alias for **none**.
+
+ **1**
+ Backward-compatible alias for **posix**.
+
+ May not be available on all supported platforms. **keep** is only available
+ on Linux. If using ZFS on Solaris this cannot be set to **posix**
+ because ZFS doesn't support pre-allocation. Default: **native** if any
+ pre-allocation methods except **truncate** are available, **none** if not.
+
+ Note that using **truncate** on Windows will interact surprisingly
+ with non-sequential write patterns. When writing to a file that has
+ been extended by setting the end-of-file information, Windows will
+ backfill the unwritten portion of the file up to that offset with
+ zeroes before issuing the new write. This means that a single small
+ write to the end of an extended file will stall until the entire
+ file has been filled with zeroes.
+
+.. option:: fadvise_hint=str
+
+ Use :manpage:`posix_fadvise(2)` or :manpage:`posix_madvise(2)` to
+ advise the kernel on what I/O patterns are likely to be issued.
+ Accepted values are:
+
+ **0**
+ Backwards-compatible hint for "no hint".
+
+ **1**
+ Backwards-compatible hint for "advise with fio workload type". This
+ uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL**
+ for a sequential workload.
+
+ **sequential**
+ Advise using **FADV_SEQUENTIAL**.
+
+ **random**
+ Advise using **FADV_RANDOM**.
+
+ **noreuse**
+ Advise using **FADV_NOREUSE**. This may be a no-op on older Linux
+ kernels. Since Linux 6.3, it provides a hint to the LRU algorithm.
+ See the :manpage:`posix_fadvise(2)` man page.
+
+.. option:: write_hint=str
+
+ Use :manpage:`fcntl(2)` to advise the kernel what life time to expect
+ from a write. Only supported on Linux, as of version 4.13. Accepted
+ values are:
+
+ **none**
+ No particular life time associated with this file.
+
+ **short**
+ Data written to this file has a short life time.
+
+ **medium**
+ Data written to this file has a medium life time.
+
+ **long**
+ Data written to this file has a long life time.
+
+ **extreme**
+ Data written to this file has a very long life time.
+
+ The values are all relative to each other, and no absolute meaning
+ should be associated with them.
+
+.. option:: offset=int
+
+ Start I/O at the provided offset in the file, given as either a fixed size in
+ bytes, zones or a percentage. If a percentage is given, the generated offset will be
+ aligned to the minimum ``blocksize`` or to the value of ``offset_align`` if
+ provided. Data before the given offset will not be touched. This
+ effectively caps the file size at `real_size - offset`. Can be combined with
+ :option:`size` to constrain the start and end range of the I/O workload.
+ A percentage can be specified by a number between 1 and 100 followed by '%',
+ for example, ``offset=20%`` to specify 20%. In ZBD mode, value can be set as
+ number of zones using 'z'.
+
+.. option:: offset_align=int
+
+ If set to non-zero value, the byte offset generated by a percentage ``offset``
+ is aligned upwards to this value. Defaults to 0 meaning that a percentage
+ offset is aligned to the minimum block size.
+
+.. option:: offset_increment=int
+
+ If this is provided, then the real offset becomes `offset + offset_increment
+ * thread_number`, where the thread number is a counter that starts at 0 and
+ is incremented for each sub-job (i.e. when :option:`numjobs` option is
+ specified). This option is useful if there are several jobs which are
+ intended to operate on a file in parallel disjoint segments, with even
+ spacing between the starting points. Percentages can be used for this option.
+ If a percentage is given, the generated offset will be aligned to the minimum
+ ``blocksize`` or to the value of ``offset_align`` if provided. In ZBD mode, value can
+ also be set as number of zones using 'z'.
+
+.. option:: number_ios=int
+
+ Fio will normally perform I/Os until it has exhausted the size of the region
+ set by :option:`size`, or if it exhausts the allocated time (or hits an error
+ condition). With this setting, the range/size can be set independently of
+ the number of I/Os to perform. When fio reaches this number, it will exit
+ normally and report status. Note that this does not extend the amount of I/O
+ that will be done, it will only stop fio if this condition is met before
+ other end-of-job criteria.
+
+.. option:: fsync=int
+
+ If writing to a file, issue an :manpage:`fsync(2)` (or its equivalent) of
+ the dirty data for every number of blocks given. For example, if you give 32
+ as a parameter, fio will sync the file after every 32 writes issued. If fio is
+ using non-buffered I/O, we may not sync the file. The exception is the sg
+ I/O engine, which synchronizes the disk cache anyway. Defaults to 0, which
+ means fio does not periodically issue and wait for a sync to complete. Also
+ see :option:`end_fsync` and :option:`fsync_on_close`.
+
+.. option:: fdatasync=int
+
+ Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
+ not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
+ :manpage:`fdatasync(2)` so this falls back to using :manpage:`fsync(2)`.
+ Defaults to 0, which means fio does not periodically issue and wait for a
+ data-only sync to complete.
+
+.. option:: write_barrier=int
+
+ Make every `N-th` write a barrier write.
+
+.. option:: sync_file_range=str:int
+
+ Use :manpage:`sync_file_range(2)` for every `int` number of write
+ operations. Fio will track range of writes that have happened since the last
+ :manpage:`sync_file_range(2)` call. `str` can currently be one or more of:
+
+ **wait_before**
+ SYNC_FILE_RANGE_WAIT_BEFORE
+ **write**
+ SYNC_FILE_RANGE_WRITE
+ **wait_after**
+ SYNC_FILE_RANGE_WAIT_AFTER
+
+ So if you do ``sync_file_range=wait_before,write:8``, fio would use
+ ``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8
+ writes. Also see the :manpage:`sync_file_range(2)` man page. This option is
+ Linux specific.
+
+.. option:: overwrite=bool
+
+ If true, writes to a file will always overwrite existing data. If the file
+ doesn't already exist, it will be created before the write phase begins. If
+ the file exists and is large enough for the specified write phase, nothing
+ will be done. Default: false.
+
+.. option:: end_fsync=bool
+
+ If true, :manpage:`fsync(2)` file contents when a write stage has completed.
+ Default: false.
+
+.. option:: fsync_on_close=bool
+
+ If true, fio will :manpage:`fsync(2)` a dirty file on close. This differs
+ from :option:`end_fsync` in that it will happen on every file close, not
+ just at the end of the job. Default: false.
+
+.. option:: rwmixread=int
+
+ Percentage of a mixed workload that should be reads. Default: 50.
+
+.. option:: rwmixwrite=int
+
+ Percentage of a mixed workload that should be writes. If both
+ :option:`rwmixread` and :option:`rwmixwrite` are given and the values do not
+ add up to 100%, the latter of the two will be used to override the
+ first. This may interfere with a given rate setting, if fio is asked to
+ limit reads or writes to a certain rate. If that is the case, then the
+ distribution may be skewed. Default: 50.
+
+.. option:: random_distribution=str:float[:float][,str:float][,str:float]
+
+ By default, fio will use a completely uniform random distribution when asked
+ to perform random I/O. Sometimes it is useful to skew the distribution in
+ specific ways, ensuring that some parts of the data are hotter than others.
+ fio includes the following distribution models:
+
+ **random**
+ Uniform random distribution
+
+ **zipf**
+ Zipf distribution
+
+ **pareto**
+ Pareto distribution
+
+ **normal**
+ Normal (Gaussian) distribution
+
+ **zoned**
+ Zoned random distribution
+
+ **zoned_abs**
+ Zone absolute random distribution
+
+ When using a **zipf** or **pareto** distribution, an input value is also
+ needed to define the access pattern. For **zipf**, this is the `Zipf
+ theta`. For **pareto**, it's the `Pareto power`. Fio includes a test
+ program, :command:`fio-genzipf`, that can be used to visualize what the given input
+ values will yield in terms of hit rates. If you wanted to use **zipf** with
+ a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the
+ option. If a non-uniform model is used, fio will disable use of the random
+ map. For the **normal** distribution, a normal (Gaussian) deviation is
+ supplied as a value between 0 and 100.
+
+ The second, optional float is allowed for **pareto**, **zipf** and **normal** distributions.
+ It allows one to set base of distribution in non-default place, giving more control
+ over most probable outcome. This value is in range [0-1] which maps linearly to
+ range of possible random values.
+ Defaults are: random for **pareto** and **zipf**, and 0.5 for **normal**.
+ If you wanted to use **zipf** with a `theta` of 1.2 centered on 1/4 of allowed value range,
+ you would use ``random_distribution=zipf:1.2:0.25``.
+
+ For a **zoned** distribution, fio supports specifying percentages of I/O
+ access that should fall within what range of the file or device. For
+ example, given a criteria of:
+
+ * 60% of accesses should be to the first 10%
+ * 30% of accesses should be to the next 20%
+ * 8% of accesses should be to the next 30%
+ * 2% of accesses should be to the next 40%
+
+ we can define that through zoning of the random accesses. For the above
+ example, the user would do::
+
+ random_distribution=zoned:60/10:30/20:8/30:2/40
+
+ A **zoned_abs** distribution works exactly like the **zoned**, except
+ that it takes absolute sizes. For example, let's say you wanted to
+ define access according to the following criteria:
+
+ * 60% of accesses should be to the first 20G
+ * 30% of accesses should be to the next 100G
+ * 10% of accesses should be to the next 500G
+
+ we can define an absolute zoning distribution with::
+
+ random_distribution=zoned_abs:60/20G:30/100G:10/500g
+
+ For both **zoned** and **zoned_abs**, fio supports defining up to
+ 256 separate zones.
+
+ This works similarly to how :option:`bssplit` sets ranges and
+ percentages of block sizes. Like :option:`bssplit`, it's possible to
+ specify separate zones for reads, writes, and trims. If just one set
+ is given, it'll apply to all of them. This goes for both **zoned** and
+ **zoned_abs** distributions.
+
+.. option:: percentage_random=int[,int][,int]
+
+ For a random workload, set how big a percentage should be random. This
+ defaults to 100%, in which case the workload is fully random. It can be set
+ anywhere from 0 to 100. Setting it to 0 would make the workload fully
+ sequential. Any setting in between will result in a random mix of sequential
+ and random I/O, at the given percentages. Comma-separated values may be
+ specified for reads, writes, and trims as described in :option:`blocksize`.
+
+.. option:: norandommap
+
+ Normally fio will cover every block of the file when doing random I/O. If
+ this option is given, fio will just get a new random offset without looking
+ at past I/O history. This means that some blocks may not be read or written,
+ and that some blocks may be read/written more than once. If this option is
+ used with :option:`verify` and multiple blocksizes (via :option:`bsrange`),
+ only intact blocks are verified, i.e., partially-overwritten blocks are
+ ignored. With an async I/O engine and an I/O depth > 1, it is possible for
+ the same block to be overwritten, which can cause verification errors. Either
+ do not use norandommap in this case, or also use the lfsr random generator.
+
+.. option:: softrandommap=bool
+
+ See :option:`norandommap`. If fio runs with the random block map enabled and
+ it fails to allocate the map, if this option is set it will continue without
+ a random block map. As coverage will not be as complete as with random maps,
+ this option is disabled by default.
+
+.. option:: random_generator=str
+
+ Fio supports the following engines for generating I/O offsets for random I/O:
+
+ **tausworthe**
+ Strong 2^88 cycle random number generator.
+ **lfsr**
+ Linear feedback shift register generator.
+ **tausworthe64**
+ Strong 64-bit 2^258 cycle random number generator.
+
+ **tausworthe** is a strong random number generator, but it requires tracking
+ on the side if we want to ensure that blocks are only read or written
+ once. **lfsr** guarantees that we never generate the same offset twice, and
+ it's also less computationally expensive. It's not a true random generator,
+ however, though for I/O purposes it's typically good enough. **lfsr** only
+ works with single block sizes, not with workloads that use multiple block
+ sizes. If used with such a workload, fio may read or write some blocks
+ multiple times. The default value is **tausworthe**, unless the required
+ space exceeds 2^32 blocks. If it does, then **tausworthe64** is
+ selected automatically.
+
+
+Block size
+~~~~~~~~~~
+
+.. option:: blocksize=int[,int][,int], bs=int[,int][,int]
+
+ The block size in bytes used for I/O units. Default: 4096. A single value
+ applies to reads, writes, and trims. Comma-separated values may be
+ specified for reads, writes, and trims. A value not terminated in a comma
+ applies to subsequent types.
+
+ Examples:
+
+ **bs=256k**
+ means 256k for reads, writes and trims.
+
+ **bs=8k,32k**
+ means 8k for reads, 32k for writes and trims.
+
+ **bs=8k,32k,**
+ means 8k for reads, 32k for writes, and default for trims.
+
+ **bs=,8k**
+ means default for reads, 8k for writes and trims.
+
+ **bs=,8k,**
+ means default for reads, 8k for writes, and default for trims.
+
+.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]
+
+ A range of block sizes in bytes for I/O units. The issued I/O unit will
+ always be a multiple of the minimum size, unless
+ :option:`blocksize_unaligned` is set.
+
+ Comma-separated ranges may be specified for reads, writes, and trims as
+ described in :option:`blocksize`.
+
+ Example: ``bsrange=1k-4k,2k-8k``. The ':' delimiter is also accepted: ``bsrange=1k:4k,2k:8k``.
+
+.. option:: bssplit=str[,str][,str]
+
+ Sometimes you want even finer grained control of the block sizes
+ issued, not just an even split between them. This option allows you to
+ weight various block sizes, so that you are able to define a specific
+ amount of block sizes issued. The format for this option is::
+
+ bssplit=blocksize/percentage:blocksize/percentage
+
+ for as many block sizes as needed. So if you want to define a workload
+ that has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would
+ write::
+
+ bssplit=4k/10:64k/50:32k/40
+
+ Ordering does not matter. If the percentage is left blank, fio will
+ fill in the remaining values evenly. So a bssplit option like this one::
+
+ bssplit=4k/50:1k/:32k/
+
+ would have 50% 4k ios, and 25% 1k and 32k ios. The percentages always
+ add up to 100; if bssplit is given a range that adds up to more, it
+ will error out.
+
+ Comma-separated values may be specified for reads, writes, and trims as
+ described in :option:`blocksize`.
+
+ If you want a workload that has 50% 2k reads and 50% 4k reads, while
+ having 90% 4k writes and 10% 8k writes, you would specify::
+
+ bssplit=2k/50:4k/50,4k/90:8k/10
+
+ Fio supports defining up to 64 different weights for each data
+ direction.
+
+.. option:: blocksize_unaligned, bs_unaligned
+
+ If set, fio will issue I/O units with any size within
+ :option:`blocksize_range`, not just multiples of the minimum size. This
+ typically won't work with direct I/O, as that normally requires sector
+ alignment.
+
+.. option:: bs_is_seq_rand=bool
+
+ If this option is set, fio will use the normal read,write blocksize settings
+ as sequential,random blocksize settings instead. Any random read or write
+ will use the WRITE blocksize settings, and any sequential read or write will
+ use the READ blocksize settings.
+
+.. option:: blockalign=int[,int][,int], ba=int[,int][,int]
+
+ Boundary to which fio will align random I/O units. Default:
+ :option:`blocksize`. Minimum alignment is typically 512b for using direct
+ I/O, though it usually depends on the hardware block size. This option is
+ mutually exclusive with using a random map for files, so it will turn off
+ that option. Comma-separated values may be specified for reads, writes, and
+ trims as described in :option:`blocksize`.
+
+
+Buffers and memory
+~~~~~~~~~~~~~~~~~~
+
+.. option:: zero_buffers
+
+ Initialize buffers with all zeros. Default: fill buffers with random data.
+
+.. option:: refill_buffers
+
+ If this option is given, fio will refill the I/O buffers on every
+ submit. Only makes sense if :option:`zero_buffers` isn't specified,
+ naturally. Defaults to being unset i.e., the buffer is only filled at
+ init time and the data in it is reused when possible but if any of
+ :option:`verify`, :option:`buffer_compress_percentage` or
+ :option:`dedupe_percentage` are enabled then `refill_buffers` is also
+ automatically enabled.
+
+.. option:: scramble_buffers=bool
+
+ If :option:`refill_buffers` is too costly and the target is using data
+ deduplication, then setting this option will slightly modify the I/O buffer
+ contents to defeat normal de-dupe attempts. This is not enough to defeat
+ more clever block compression attempts, but it will stop naive dedupe of
+ blocks. Default: true.
+
+.. option:: buffer_compress_percentage=int
+
+ If this is set, then fio will attempt to provide I/O buffer content
+ (on WRITEs) that compresses to the specified level. Fio does this by
+ providing a mix of random data followed by fixed pattern data. The
+ fixed pattern is either zeros, or the pattern specified by
+ :option:`buffer_pattern`. If the `buffer_pattern` option is used, it
+ might skew the compression ratio slightly. Setting
+ `buffer_compress_percentage` to a value other than 100 will also
+ enable :option:`refill_buffers` in order to reduce the likelihood that
+ adjacent blocks are so similar that they over compress when seen
+ together. See :option:`buffer_compress_chunk` for how to set a finer or
+ coarser granularity for the random/fixed data region. Defaults to unset
+ i.e., buffer data will not adhere to any compression level.
+
+.. option:: buffer_compress_chunk=int
+
+ This setting allows fio to manage how big the random/fixed data region
+ is when using :option:`buffer_compress_percentage`. When
+ `buffer_compress_chunk` is set to some non-zero value smaller than the
+ block size, fio can repeat the random/fixed region throughout the I/O
+ buffer at the specified interval (which is particularly useful when
+ bigger block sizes are used for a job). When set to 0, fio will use a
+ chunk size that matches the block size resulting in a single
+ random/fixed region within the I/O buffer. Defaults to 512. When the
+ unit is omitted, the value is interpreted in bytes.
+
+.. option:: buffer_pattern=str
+
+ If set, fio will fill the I/O buffers with this pattern or with the contents
+ of a file. If not set, the contents of I/O buffers are defined by the other
+ options related to buffer contents. The setting can be any pattern of bytes,
+ and can be prefixed with 0x for hex values. It may also be a string, where
+ the string must then be wrapped with ``""``. Or it may also be a filename,
+ where the filename must be wrapped with ``''`` in which case the file is
+ opened and read. Note that not all the file contents will be read if that
+ would cause the buffers to overflow. So, for example::
+
+ buffer_pattern='filename'
+
+ or::
+
+ buffer_pattern="abcd"
+
+ or::
+
+ buffer_pattern=-12
+
+ or::
+
+ buffer_pattern=0xdeadface
+
+ Also you can combine everything together in any order::
+
+ buffer_pattern=0xdeadface"abcd"-12'filename'
+
+.. option:: dedupe_percentage=int
+
+ If set, fio will generate this percentage of identical buffers when
+ writing. These buffers will be naturally dedupable. The contents of the
+ buffers depend on what other buffer compression settings have been set. It's
+ possible to have the individual buffers either fully compressible, or not at
+ all -- this option only controls the distribution of unique buffers. Setting
+ this option will also enable :option:`refill_buffers` to prevent every buffer
+ being identical.
+
+.. option:: dedupe_mode=str
+
+ If ``dedupe_percentage=<int>`` is given, then this option controls how fio
+ generates the dedupe buffers.
+
+ **repeat**
+ Generate dedupe buffers by repeating previous writes
+ **working_set**
+ Generate dedupe buffers from working set
+
+ ``repeat`` is the default option for fio. Dedupe buffers are generated
+ by repeating previous unique writes.
+
+ ``working_set`` is a more realistic workload.
+ With ``working_set``, ``dedupe_working_set_percentage=<int>`` should be provided.
+ Given that, fio will use the initial unique write buffers as its working set.
+ Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+ Note that by using ``working_set`` the dedupe percentage will converge
+ to the desired value over time while ``repeat`` maintains the desired
+ percentage throughout the job.
+
+.. option:: dedupe_working_set_percentage=int
+
+ If ``dedupe_mode=<str>`` is set to ``working_set``, then this controls
+ the percentage of size of the file or device used as the buffers
+ fio will choose to generate the dedupe buffers from.
+
+ Note that size needs to be explicitly provided and only 1 file per
+ job is supported.
+
+.. option:: dedupe_global=bool
+
+ This controls whether the deduplication buffers will be shared amongst
+ all jobs that have this option set. The buffers are spread evenly between
+ participating jobs.
+
+.. option:: invalidate=bool
+
+ Invalidate the buffer/page cache parts of the files to be used prior to
+ starting I/O if the platform and file type support it. Defaults to true.
+ This will be ignored if :option:`pre_read` is also specified for the
+ same job.
+
+.. option:: sync=str
+
+ Whether, and what type, of synchronous I/O to use for writes. The allowed
+ values are:
+
+ **none**
+ Do not use synchronous IO, the default.
+
+ **0**
+ Same as **none**.
+
+ **sync**
+ Use synchronous file IO. For the majority of I/O engines,
+ this means using O_SYNC.
+
+ **1**
+ Same as **sync**.
+
+ **dsync**
+ Use synchronous data IO. For the majority of I/O engines,
+ this means using O_DSYNC.
+
+
+.. option:: iomem=str, mem=str
+
+ Fio can use various types of memory as the I/O unit buffer. The allowed
+ values are:
+
+ **malloc**
+ Use memory from :manpage:`malloc(3)` as the buffers. Default memory
+ type.
+
+ **shm**
+ Use shared memory as the buffers. Allocated through
+ :manpage:`shmget(2)`.
+
+ **shmhuge**
+ Same as shm, but use huge pages as backing.
+
+ **mmap**
+ Use :manpage:`mmap(2)` to allocate buffers. May either be anonymous memory, or can
+ be file backed if a filename is given after the option. The format
+ is `mem=mmap:/path/to/file`.
+
+ **mmaphuge**
+ Use a memory mapped huge file as the buffer backing. Append filename
+ after mmaphuge, ala `mem=mmaphuge:/hugetlbfs/file`.
+
+ **mmapshared**
+ Same as mmap, but use a MAP_SHARED mapping.
+
+ **cudamalloc**
+ Use GPU memory as the buffers for GPUDirect RDMA benchmark.
+ The :option:`ioengine` must be `rdma`.
+
+ The area allocated is a function of the maximum allowed bs size for the job,
+ multiplied by the I/O depth given. Note that for **shmhuge** and
+ **mmaphuge** to work, the system must have free huge pages allocated. This
+ can normally be checked and set by reading/writing
+ :file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page
+ is 2 or 4MiB in size depending on the platform. So to calculate the
+ number of huge pages you need for a given job file, add up the I/O
+ depth of all jobs (normally one unless :option:`iodepth` is used) and
+ multiply by the maximum bs set. Then divide that number by the huge
+ page size. You can see the size of the huge pages in
+ :file:`/proc/meminfo`. If no huge pages are allocated by having a
+ non-zero number in `nr_hugepages`, using **mmaphuge** or **shmhuge**
+ will fail. Also see :option:`hugepage-size`.
+
+ **mmaphuge** also needs to have hugetlbfs mounted and the file location
+ should point there. So if it's mounted in :file:`/huge`, you would use
+ `mem=mmaphuge:/huge/somefile`.
+
+.. option:: iomem_align=int, mem_align=int
+
+ This indicates the memory alignment of the I/O memory buffers. Note that
+ the given alignment is applied to the first I/O unit buffer, if using
+ :option:`iodepth` the alignment of the following buffers is given by the
+ :option:`bs` used. In other words, if using a :option:`bs` that is a
+ multiple of the page size in the system, all buffers will be aligned to
+ this value. If using a :option:`bs` that is not page aligned, the alignment
+ of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and
+ :option:`bs` used.
+
+.. option:: hugepage-size=int
+
+ Defines the size of a huge page. Must at least be equal to the system
+ setting, see :file:`/proc/meminfo` and
+ :file:`/sys/kernel/mm/hugepages/`. Defaults to 2 or 4MiB depending on
+ the platform. Should probably always be a multiple of megabytes, so
+ using ``hugepage-size=Xm`` is the preferred way to set this to avoid
+ setting a non-pow-2 bad value.
+
+.. option:: lockmem=int
+
+ Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to
+ simulate a smaller amount of memory. The amount specified is per worker.
+
+
+I/O size
+~~~~~~~~
+
+.. option:: size=int
+
+ The total size of file I/O for each thread of this job. Fio will run until
+ this many bytes have been transferred, unless runtime is altered by other means
+ such as (1) :option:`runtime`, (2) :option:`io_size`, (3) :option:`number_ios`,
+ (4) gaps/holes while doing I/O's such as ``rw=read:16K``, or (5) sequential
+ I/O reaching end of the file which is possible when :option:`percentage_random`
+ is less than 100.
+ Fio will divide this size between the available files determined by options
+ such as :option:`nrfiles`, :option:`filename`, unless :option:`filesize` is
+ specified by the job. If the result of division happens to be 0, the size is
+ set to the physical size of the given files or devices if they exist.
+ If this option is not specified, fio will use the full size of the given
+ files or devices. If the files do not exist, size must be given. It is also
+ possible to give size as a percentage between 1 and 100. If ``size=20%`` is
+ given, fio will use 20% of the full size of the given files or devices.
+ In ZBD mode, value can also be set as number of zones using 'z'.
+ Can be combined with :option:`offset` to constrain the start and end range
+ that I/O will be done within.
+
+.. option:: io_size=int, io_limit=int
+
+ Normally fio operates within the region set by :option:`size`, which means
+ that the :option:`size` option sets both the region and size of I/O to be
+ performed. Sometimes that is not what you want. With this option, it is
+ possible to define just the amount of I/O that fio should do. For instance,
+ if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio
+ will perform I/O within the first 20GiB but exit when 5GiB have been
+ done. The opposite is also possible -- if :option:`size` is set to 20GiB,
+ and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
+ the 0..20GiB region.
+
+.. option:: filesize=irange(int)
+
+ Individual file sizes. May be a range, in which case fio will select sizes for
+ files at random within the given range. If not given, each created file is the
+ same size. This option overrides :option:`size` in terms of file size, i.e. if
+ :option:`filesize` is specified then :option:`size` becomes merely the default
+ for :option:`io_size` and has no effect at all if :option:`io_size` is set
+ explicitly.
+
+.. option:: file_append=bool
+
+ Perform I/O after the end of the file. Normally fio will operate within the
+ size of a file. If this option is set, then fio will append to the file
+ instead. This has identical behavior to setting :option:`offset` to the size
+ of a file. This option is ignored on non-regular files.
+
+.. option:: fill_device=bool, fill_fs=bool
+
+ Sets size to something really large and waits for ENOSPC (no space left on
+ device) or EDQUOT (disk quota exceeded)
+ as the terminating condition. Only makes sense with sequential
+ write. For a read workload, the mount point will be filled first then I/O
+ started on the result. This option doesn't make sense if operating on a raw
+ device node, since the size of that is already known by the file system.
+ Additionally, writing beyond end-of-device will not return ENOSPC there.
+
+
+I/O engine
+~~~~~~~~~~
+
+.. option:: ioengine=str
+
+ fio supports 2 kinds of performance measurement: I/O and file/directory operation.
+
+ I/O engines define how the job issues I/O to the file. The following types are defined:
+
+ **sync**
+ Basic :manpage:`read(2)` or :manpage:`write(2)`
+ I/O. :manpage:`lseek(2)` is used to position the I/O location.
+ See :option:`fsync` and :option:`fdatasync` for syncing write I/Os.
+
+ **psync**
+ Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O. Default on
+ all supported operating systems except for Windows.
+
+ **vsync**
+ Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O. Will emulate
+ queuing by coalescing adjacent I/Os into a single submission.
+
+ **pvsync**
+ Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O.
+
+ **pvsync2**
+ Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O.
+
+ **io_uring**
+ Fast Linux native asynchronous I/O. Supports async IO
+ for both direct and buffered IO.
+ This engine defines engine specific options.
+
+ **io_uring_cmd**
+ Fast Linux native asynchronous I/O for pass through commands.
+ This engine defines engine specific options.
+
+ **libaio**
+ Linux native asynchronous I/O. Note that Linux may only support
+ queued behavior with non-buffered I/O (set ``direct=1`` or
+ ``buffered=0``).
+ This engine defines engine specific options.
+
+ **posixaio**
+ POSIX asynchronous I/O using :manpage:`aio_read(3)` and
+ :manpage:`aio_write(3)`.
+
+ **solarisaio**
+ Solaris native asynchronous I/O.
+
+ **windowsaio**
+ Windows native asynchronous I/O. Default on Windows.
+
+ **mmap**
+ File is memory mapped with :manpage:`mmap(2)` and data copied
+ to/from using :manpage:`memcpy(3)`.
+
+ **splice**
+ :manpage:`splice(2)` is used to transfer the data and
+ :manpage:`vmsplice(2)` to transfer data from user space to the
+ kernel.
+
+ **sg**
+ SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+ ioctl, or if the target is an sg character device we use
+ :manpage:`read(2)` and :manpage:`write(2)` for asynchronous
+ I/O. Requires :option:`filename` option to specify either block or
+ character devices. This engine supports trim operations.
+ The sg engine includes engine specific options.
+
+ **libzbc**
+ Read, write, trim and ZBC/ZAC operations to a zoned
+ block device using libzbc library. The target can be
+ either an SG character device or a block device file.
+
+ **null**
+ Doesn't transfer any data, just pretends to. This is mainly used to
+ exercise fio itself and for debugging/testing purposes.
+
+ **net**
+ Transfer over the network to given ``host:port``. Depending on the
+ :option:`protocol` used, the :option:`hostname`, :option:`port`,
+ :option:`listen` and :option:`filename` options are used to specify
+ what sort of connection to make, while the :option:`protocol` option
+ determines which protocol will be used. This engine defines engine
+ specific options.
+
+ **netsplice**
+ Like **net**, but uses :manpage:`splice(2)` and
+ :manpage:`vmsplice(2)` to map data and send/receive.
+ This engine defines engine specific options.
+
+ **cpuio**
+ Doesn't transfer any data, but burns CPU cycles according to the
+ :option:`cpuload`, :option:`cpuchunks` and :option:`cpumode` options.
+ Setting :option:`cpuload`\=85 will cause that job to do nothing but burn 85%
+ of the CPU. In case of SMP machines, use :option:`numjobs`\=<nr_of_cpu>
+ to get desired CPU usage, as the cpuload only loads a
+ single CPU at the desired rate. A job never finishes unless there is
+ at least one non-cpuio job.
+ Setting :option:`cpumode`\=qsort replaces the default noop instructions loop
+ with a qsort algorithm to consume more energy.
+
+ **rdma**
+ The RDMA I/O engine supports both RDMA memory semantics
+ (RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+ InfiniBand, RoCE and iWARP protocols. This engine defines engine
+ specific options.
+
+ **falloc**
+ I/O engine that does regular fallocate to simulate data transfer as
+ fio ioengine.
+
+ DDIR_READ
+ does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
+
+ DDIR_WRITE
+ does fallocate(,mode = 0).
+
+ DDIR_TRIM
+ does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
+
+ **ftruncate**
+ I/O engine that sends :manpage:`ftruncate(2)` operations in response
+ to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+ size to the current block offset. :option:`blocksize` is ignored.
+
+ **e4defrag**
+ I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+ defragment activity in response to DDIR_WRITE events.
+
+ **rados**
+ I/O engine supporting direct access to Ceph Reliable Autonomic
+ Distributed Object Store (RADOS) via librados. This ioengine
+ defines engine specific options.
+
+ **rbd**
+ I/O engine supporting direct access to Ceph Rados Block Devices
+ (RBD) via librbd without the need to use the kernel rbd driver. This
+ ioengine defines engine specific options.
+
+ **http**
+ I/O engine supporting GET/PUT requests over HTTP(S) with libcurl to
+ a WebDAV or S3 endpoint. This ioengine defines engine specific options.
+
+ This engine only supports direct IO of iodepth=1; you need to scale this
+ via numjobs. blocksize defines the size of the objects to be created.
+
+ TRIM is translated to object deletion.
+
+ **gfapi**
+ Using GlusterFS libgfapi sync interface for direct access to
+ GlusterFS volumes without having to go through FUSE. This ioengine
+ defines engine specific options.
+
+ **gfapi_async**
+ Using GlusterFS libgfapi async interface for direct access to
+ GlusterFS volumes without having to go through FUSE. This ioengine
+ defines engine specific options.
+
+ **libhdfs**
+ Read and write through Hadoop (HDFS). The :option:`filename` option
+ is used to specify host,port of the hdfs name-node to connect. This
+ engine interprets offsets a little differently. In HDFS, files once
+ created cannot be modified so random writes are not possible. To
+ imitate this the libhdfs engine expects a bunch of small files to be
+ created over HDFS and will randomly pick a file from them
+ based on the offset generated by fio backend (see the example
+ job file to create such files, use ``rw=write`` option). Please
+ note, it may be necessary to set environment variables to work
+ with HDFS/libhdfs properly. Each job uses its own connection to
+ HDFS.
+
+ **mtd**
+ Read, write and erase an MTD character device (e.g.,
+ :file:`/dev/mtd0`). Discards are treated as erases. Depending on the
+ underlying device type, the I/O may have to go in a certain pattern,
+ e.g., on NAND, writing sequentially to erase blocks and discarding
+ before overwriting. The `trimwrite` mode works well for this
+ constraint.
+
+ **dev-dax**
+ Read and write using device DAX to a persistent memory device (e.g.,
+ /dev/dax0.0) through the PMDK libpmem library.
+
+ **external**
+ Prefix to specify loading an external I/O engine object file. Append
+ the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
+ ioengine :file:`foo.o` in :file:`/tmp`. The path can be either
+ absolute or relative. See :file:`engines/skeleton_external.c` for
+ details of writing an external I/O engine.
+
+ **libpmem**
+ Read and write using mmap I/O to a file on a filesystem
+ mounted with DAX on a persistent memory device through the PMDK
+ libpmem library.
+
+ **ime_psync**
+ Synchronous read and write using DDN's Infinite Memory Engine (IME).
+ This engine is very basic and issues calls to IME whenever an IO is
+ queued.
+
+ **ime_psyncv**
+ Synchronous read and write using DDN's Infinite Memory Engine (IME).
+ This engine uses iovecs and will try to stack as many IOs as possible
+ (if the IOs are "contiguous" and the IO depth is not exceeded)
+ before issuing a call to IME.
+
+ **ime_aio**
+ Asynchronous read and write using DDN's Infinite Memory Engine (IME).
+ This engine will try to stack as many IOs as possible by creating
+ requests for IME. FIO will then decide when to commit these requests.
+
+ **libiscsi**
+ Read and write iscsi lun with libiscsi.
+
+ **nbd**
+ Read and write a Network Block Device (NBD).
+
+ **libcufile**
+ I/O engine supporting libcufile synchronous access to nvidia-fs and a
+ GPUDirect Storage-supported filesystem. This engine performs
+ I/O without transferring buffers between user-space and the kernel,
+ unless :option:`verify` is set or :option:`cuda_io` is `posix`.
+ :option:`iomem` must not be `cudamalloc`. This ioengine defines
+ engine specific options.
+
+ **dfs**
+ I/O engine supporting asynchronous read and write operations to the
+ DAOS File System (DFS) via libdfs.
+
+ **nfs**
+ I/O engine supporting asynchronous read and write operations to
+ NFS filesystems from userspace via libnfs. This is useful for
+ achieving higher concurrency and thus throughput than is possible
+ via kernel NFS.
+
+ **exec**
+ Execute 3rd party tools. Could be used to perform monitoring during a job's runtime.
+
+ **xnvme**
+ I/O engine using the xNVMe C API, for NVMe devices. The xnvme engine provides
+ flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring,
+ the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes
+ engine specific options. (See https://xnvme.io).
+
+ **libblkio**
+ Use the libblkio library
+ (https://gitlab.com/libblkio/libblkio). The specific
+ *driver* to use must be set using
+ :option:`libblkio_driver`. If
+ :option:`mem`/:option:`iomem` is not specified, memory
+ allocation is delegated to libblkio (and so is
+ guaranteed to work with the selected *driver*). One
+ libblkio instance is used per process, so all jobs
+ setting option :option:`thread` will share a single
+ instance (with one queue per thread) and must specify
+ compatible options. Note that some drivers don't allow
+ several instances to access the same device or file
+ simultaneously, but allow it for threads.
+
+ File/directory operation engines define how the job operates on files or directories. The
+ following types are defined:
+
+ **filecreate**
+ Simply create the files and do no I/O to them. You still need to
+ set `filesize` so that all the accounting still occurs, but no
+ actual I/O will be done other than creating the file.
+ Example job file: filecreate-ioengine.fio.
+
+ **filestat**
+ Simply do stat() and do no I/O to the file. You need to set 'filesize'
+ and 'nrfiles', so that files will be created.
+ This engine is to measure file lookup and meta data access.
+ Example job file: filestat-ioengine.fio.
+
+ **filedelete**
+ Simply delete the files by unlink() and do no I/O to them. You need to set 'filesize'
+ and 'nrfiles', so that the files will be created.
+ This engine is to measure file delete.
+ Example job file: filedelete-ioengine.fio.
+
+ **dircreate**
+ Simply create the directories and do no I/O to them. You still need to
+ set `filesize` so that all the accounting still occurs, but no
+ actual I/O will be done other than creating the directories.
+ Example job file: dircreate-ioengine.fio.
+
+ **dirstat**
+ Simply do stat() and do no I/O to the directories. You need to set 'filesize'
+ and 'nrfiles', so that directories will be created.
+ This engine is to measure directory lookup and meta data access.
+ Example job file: dirstat-ioengine.fio.
+
+ **dirdelete**
+ Simply delete the directories by rmdir() and do no I/O to them. You need to set 'filesize'
+ and 'nrfiles', so that the directories will be created.
+ This engine is to measure directory delete.
+ Example job file: dirdelete-ioengine.fio.
+
+ For file and directory operation engines, there is no I/O throughput, so the
+ statistics data in the report have different meanings. The meaningful output indexes are: 'iops' and 'clat'.
+ 'bw' is meaningless. Refer to section: "Interpreting the output" for more details.
+
+
+I/O engine specific parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition, there are some parameters which are only valid when a specific
+:option:`ioengine` is in use. These are used identically to normal parameters,
+with the caveat that when used on the command line, they must come after the
+:option:`ioengine` that defines them is selected.
+
+.. option:: cmdprio_percentage=int[,int] : [io_uring] [libaio]
+
+ Set the percentage of I/O that will be issued with the highest priority.
+ Default: 0. A single value applies to reads and writes. Comma-separated
+ values may be specified for reads and writes. For this option to be
+ effective, NCQ priority must be supported and enabled, and the :option:`direct`
+ option must be set. fio must also be run as the root user. Unlike
+ slat/clat/lat stats, which can be tracked and reported independently, per
+ priority stats only track and report a single type of latency. By default,
+ completion latency (clat) will be reported; if :option:`lat_percentiles` is
+ set, total latency (lat) will be reported.
+
+.. option:: cmdprio_class=int[,int] : [io_uring] [libaio]
+
+ Set the I/O priority class to use for I/Os that must be issued with
+ a priority when :option:`cmdprio_percentage` or
+ :option:`cmdprio_bssplit` is set. If not specified when
+ :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set,
+ this defaults to the highest priority class. A single value applies
+ to reads and writes. Comma-separated values may be specified for
+ reads and writes. See :manpage:`ionice(1)`. See also the
+ :option:`prioclass` option.
+
+.. option:: cmdprio_hint=int[,int] : [io_uring] [libaio]
+
+ Set the I/O priority hint to use for I/Os that must be issued with
+ a priority when :option:`cmdprio_percentage` or
+ :option:`cmdprio_bssplit` is set. If not specified when
+ :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set,
+ this defaults to 0 (no hint). A single value applies to reads and
+ writes. Comma-separated values may be specified for reads and writes.
+ See also the :option:`priohint` option.
+
+.. option:: cmdprio=int[,int] : [io_uring] [libaio]
+
+ Set the I/O priority value to use for I/Os that must be issued with
+ a priority when :option:`cmdprio_percentage` or
+ :option:`cmdprio_bssplit` is set. If not specified when
+ :option:`cmdprio_percentage` or :option:`cmdprio_bssplit` is set,
+ this defaults to 0.
+ Linux limits us to a positive value between 0 and 7, with 0 being the
+ highest. A single value applies to reads and writes. Comma-separated
+ values may be specified for reads and writes. See :manpage:`ionice(1)`.
+ Refer to an appropriate manpage for other operating systems since
+ meaning of priority may differ. See also the :option:`prio` option.
+
+.. option:: cmdprio_bssplit=str[,str] : [io_uring] [libaio]
+
+ To get a finer control over I/O priority, this option allows
+ specifying the percentage of IOs that must have a priority set
+ depending on the block size of the IO. This option is useful only
+ when used together with the :option:`bssplit` option, that is,
+ multiple different block sizes are used for reads and writes.
+
+ The first accepted format for this option is the same as the format of
+ the :option:`bssplit` option:
+
+ cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+
+ In this case, each entry will use the priority class, priority level
+ and priority hint defined by the options :option:`cmdprio_class`,
+ :option:`cmdprio` and :option:`cmdprio_hint` respectively.
+
+ The second accepted format for this option is:
+
+ cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+
+ In this case, the priority class and priority level are defined inside
+ each entry. In comparison with the first accepted format, the second
+ accepted format does not restrict all entries to have the same priority
+ class and priority level.
+
+ The third accepted format for this option is:
+
+ cmdprio_bssplit=blocksize/percentage/class/level/hint:...
+
+ This is an extension of the second accepted format that allows one to
+ also specify a priority hint.
+
+ For all formats, only the read and write data directions are supported,
+ values for trim IOs are ignored. This option is mutually exclusive with
+ the :option:`cmdprio_percentage` option.
+
+.. option:: fixedbufs : [io_uring] [io_uring_cmd]
+
+ If fio is asked to do direct IO, then Linux will map pages for each
+ IO call, and release them when IO is done. If this option is set, the
+ pages are pre-mapped before IO is started. This eliminates the need to
+ map and release for each IO. This is more efficient, and reduces the
+ IO latency as well.
+
+.. option:: nonvectored=int : [io_uring] [io_uring_cmd]
+
+ With this option, fio will use non-vectored read/write commands, where
+ address must contain the address directly. Default is -1.
+
+.. option:: force_async=int : [io_uring] [io_uring_cmd]
+
+ Normal operation for io_uring is to try and issue an sqe as
+ non-blocking first, and if that fails, execute it in an async manner.
+ With this option set to N, then every Nth request fio will ask sqe to
+ be issued in an async manner. Default is 0.
+
+.. option:: registerfiles : [io_uring] [io_uring_cmd]
+
+ With this option, fio registers the set of files being used with the
+ kernel. This avoids the overhead of managing file counts in the kernel,
+ making the submission and completion part more lightweight. Required
+ for the below :option:`sqthread_poll` option.
+
+.. option:: sqthread_poll : [io_uring] [io_uring_cmd] [xnvme]
+
+ Normally fio will submit IO by issuing a system call to notify the
+ kernel of available items in the SQ ring. If this option is set, the
+ act of submitting IO will be done by a polling thread in the kernel.
+ This frees up cycles for fio, at the cost of using more CPU in the
+ system. As submission is just the time it takes to fill in the sqe
+ entries and any syscall required to wake up the idle kernel thread,
+ fio will not report submission latencies.
+
+.. option:: sqthread_poll_cpu=int : [io_uring] [io_uring_cmd]
+
+ When :option:`sqthread_poll` is set, this option provides a way to
+ define which CPU should be used for the polling thread.
+
+.. option:: cmd_type=str : [io_uring_cmd]
+
+ Specifies the type of uring passthrough command to be used. Supported
+ value is nvme. Default is nvme.
+
+.. option:: hipri
+
+ [io_uring] [io_uring_cmd] [xnvme]
+
+ If this option is set, fio will attempt to use polled IO completions.
+ Normal IO completions generate interrupts to signal the completion of
+ IO, polled completions do not. Hence they require active reaping
+ by the application. The benefits are more efficient IO for high IOPS
+ scenarios, and lower latencies for low queue depth IO.
+
+ [libblkio]
+
+ Use poll queues. This is incompatible with
+ :option:`libblkio_wait_mode=eventfd <libblkio_wait_mode>` and
+ :option:`libblkio_force_enable_completion_eventfd`.
+
+ [pvsync2]
+
+ Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+ than normal.
+
+ [sg]
+
+ If this option is set, fio will attempt to use polled IO completions.
+ This will have a similar effect as (io_uring)hipri. Only SCSI READ and
+ WRITE commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor
+ VERIFY). Older versions of the Linux sg driver that do not support
+ hipri will simply ignore this flag and do normal IO. The Linux SCSI
+ Low Level Driver (LLD) that "owns" the device also needs to support
+ hipri (also known as iopoll and mq_poll). The MegaRAID driver is an
+ example of a SCSI LLD. Default: clear (0) which does normal
+ (interrupt-based) IO.
+
+.. option:: userspace_reap : [libaio]
+
+ Normally, with the libaio engine in use, fio will use the
+ :manpage:`io_getevents(2)` system call to reap newly returned events. With
+ this flag turned on, the AIO ring will be read directly from user-space to
+ reap events. The reaping mode is only enabled when polling for a minimum of
+ 0 events (e.g. when :option:`iodepth_batch_complete` `=0`).
+
+.. option:: hipri_percentage : [pvsync2]
+
+ When hipri is set this determines the probability of a pvsync2 I/O being high
+ priority. The default is 100%.
+
+.. option:: nowait=bool : [pvsync2] [libaio] [io_uring] [io_uring_cmd]
+
+ By default if a request cannot be executed immediately (e.g. resource starvation,
+ waiting on locks) it is queued and the initiating process will be blocked until
+ the required resource becomes free.
+
+ This option sets the RWF_NOWAIT flag (supported from the 4.14 Linux kernel) and
+ the call will return instantly with EAGAIN or a partial result rather than waiting.
+
+ It is useful to also use ignore_error=EAGAIN when using this option.
+
+ Note: glibc 2.27, 2.28 have a bug in syscall wrappers preadv2, pwritev2.
+ They return EOPNOTSUPP instead of EAGAIN.
+
+ For cached I/O, using this option usually means a request operates only with
+ cached data. Currently the RWF_NOWAIT flag is not supported for cached writes.
+
+ For direct I/O, requests will only succeed if cache invalidation isn't required,
+ file blocks are fully allocated and the disk request could be issued immediately.
+
+.. option:: fdp=bool : [io_uring_cmd] [xnvme]
+
+ Enable Flexible Data Placement mode for write commands.
+
+.. option:: dataplacement=str : [io_uring_cmd] [xnvme]
+
+ Specifies the data placement directive type to use for write commands.
+ The following types are supported:
+
+ **none**
+ Do not use a data placement directive. This is the
+ default.
+
+ **fdp**
+ Use Flexible Data Placement directives for write
+ commands. This is equivalent to specifying
+ :option:`fdp` =1.
+
+ **streams**
+ Use Streams directives for write commands.
+
+.. option:: plid_select=str, fdp_pli_select=str : [io_uring_cmd] [xnvme]
+
+ Defines how fio decides which placement ID to use next. The following
+ types are defined:
+
+ **random**
+ Choose a placement ID at random (uniform).
+
+ **roundrobin**
+ Round robin over available placement IDs. This is the
+ default.
+
+ The available placement ID (indices) are defined by the option
+ :option:`plids`.
+
+.. option:: plids=str, fdp_pli=str : [io_uring_cmd] [xnvme]
+
+ Select which Placement IDs (streams) or Placement ID Indices (FDP) this
+ job is allowed to use for writes. For FDP by default, the job will
+ cycle through all available Placement IDs, so use this to isolate these
+ identifiers to specific jobs. If you want fio to use FDP placement
+ identifiers only at indices 0, 2 and 5 specify ``plids=0,2,5``. For
+ streams this should be a comma-separated list of Stream IDs.
+
+.. option:: md_per_io_size=int : [io_uring_cmd] [xnvme]
+
+ Size in bytes for separate metadata buffer per IO. Default: 0.
+
+.. option:: pi_act=int : [io_uring_cmd] [xnvme]
+
+ Action to take when nvme namespace is formatted with protection
+ information. If this is set to 1 and namespace is formatted with
+ metadata size equal to protection information size, fio won't use
+ separate metadata buffer or extended logical block. If this is set to
+ 1 and namespace is formatted with metadata size greater than protection
+ information size, fio will not generate or verify the protection
+ information portion of metadata for write or read case respectively.
+ If this is set to 0, fio generates protection information for
+ write case and verifies for read case. Default: 1.
+
+ For 16 bit CRC generation fio will use isa-l if available otherwise
+ it will use the default slower generator.
+ (see: https://github.com/intel/isa-l)
+
+.. option:: pi_chk=str[,str][,str] : [io_uring_cmd] [xnvme]
+
+ Controls the protection information check. This can take one or more
+ of these values. Default: none.
+
+ **GUARD**
+ Enables protection information checking of guard field.
+ **REFTAG**
+ Enables protection information checking of logical block
+ reference tag field.
+ **APPTAG**
+ Enables protection information checking of application tag field.
+
+.. option:: apptag=int : [io_uring_cmd] [xnvme]
+
+ Specifies logical block application tag value, if namespace is
+ formatted to use end to end protection information. Default: 0x1234.
+
+.. option:: apptag_mask=int : [io_uring_cmd] [xnvme]
+
+ Specifies logical block application tag mask value, if namespace is
+ formatted to use end to end protection information. Default: 0xffff.
+
+.. option:: num_range=int : [io_uring_cmd]
+
+ For trim command this will be the number of ranges to trim per I/O
+ request. The number of logical blocks per range is determined by the
+ :option:`bs` option which should be a multiple of logical block size.
+ This cannot be used with read or write. Note that setting this
+ option > 1, :option:`log_offset` will not be able to log all the
+ offsets. Default: 1.
+
+.. option:: cpuload=int : [cpuio]
+
+ Attempt to use the specified percentage of CPU cycles. This is a mandatory
+ option when using cpuio I/O engine.
+
+.. option:: cpuchunks=int : [cpuio]
+
+ Split the load into cycles of the given time. In microseconds.
+
+.. option:: cpumode=str : [cpuio]
+
+ Specify how to stress the CPU. It can take these two values:
+
+ **noop**
+ This is the default where the CPU executes noop instructions.
+ **qsort**
+ Replace the default noop instructions loop with a qsort algorithm to
+ consume more energy.
+
+.. option:: exit_on_io_done=bool : [cpuio]
+
+ Detect when I/O threads are done, then exit.
+
+.. option:: namenode=str : [libhdfs]
+
+ The hostname or IP address of a HDFS cluster namenode to contact.
+
+.. option:: port=int
+
+ [libhdfs]
+
+ The listening port of the HDFS cluster namenode.
+
+ [netsplice], [net]
+
+ The TCP or UDP port to bind to or connect to. If this is used with
+ :option:`numjobs` to spawn multiple instances of the same job type, then
+ this will be the starting port number since fio will use a range of
+ ports.
+
+ [rdma], [librpma_*]
+
+ The port to use for RDMA-CM communication. This should be the same value
+ on the client and the server side.
+
+.. option:: hostname=str : [netsplice] [net] [rdma]
+
+ The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O. If the job
+ is a TCP listener or UDP reader, the hostname is not used and must be omitted
+ unless it is a valid UDP multicast address.
+
+.. option:: serverip=str : [librpma_*]
+
+ The IP address to be used for RDMA-CM based I/O.
+
+.. option:: direct_write_to_pmem=bool : [librpma_*]
+
+ Set to 1 only when Direct Write to PMem from the remote host is possible.
+ Otherwise, set to 0.
+
+.. option:: busy_wait_polling=bool : [librpma_*_server]
+
+ Set to 0 to wait for completion instead of busy-wait polling completion.
+ Default: 1.
+
+.. option:: interface=str : [netsplice] [net]
+
+ The IP address of the network interface used to send or receive UDP
+ multicast.
+
+.. option:: ttl=int : [netsplice] [net]
+
+ Time-to-live value for outgoing UDP multicast packets. Default: 1.
+
+.. option:: nodelay=bool : [netsplice] [net]
+
+ Set TCP_NODELAY on TCP connections.
+
+.. option:: protocol=str, proto=str : [netsplice] [net]
+
+ The network protocol to use. Accepted values are:
+
+ **tcp**
+ Transmission control protocol.
+ **tcpv6**
+ Transmission control protocol V6.
+ **udp**
+ User datagram protocol.
+ **udpv6**
+ User datagram protocol V6.
+ **unix**
+ UNIX domain socket.
+ **vsock**
+ VSOCK protocol.
+
+ When the protocol is TCP, UDP or VSOCK, the port must also be given, as well as the
+ hostname if the job is a TCP or VSOCK listener or UDP reader. For unix sockets, the
+ normal :option:`filename` option should be used and the port is invalid.
+ When the protocol is VSOCK, the :option:`hostname` is the CID of the remote VM.
+
+.. option:: listen : [netsplice] [net]
+
+ For TCP network connections, tell fio to listen for incoming connections
+ rather than initiating an outgoing connection. The :option:`hostname` must
+ be omitted if this option is used.
+
+.. option:: pingpong : [netsplice] [net]
+
+ Normally a network writer will just continue writing data, and a network
+ reader will just consume packets. If ``pingpong=1`` is set, a writer will
+ send its normal payload to the reader, then wait for the reader to send the
+ same payload back. This allows fio to measure network latencies. The
+ submission and completion latencies then measure local time spent sending or
+ receiving, and the completion latency measures how long it took for the
+ other end to receive and send back. For UDP multicast traffic
+ ``pingpong=1`` should only be set for a single reader when multiple readers
+ are listening to the same address.
+
+.. option:: window_size : [netsplice] [net]
+
+ Set the desired socket buffer size for the connection.
+
+.. option:: mss : [netsplice] [net]
+
+ Set the TCP maximum segment size (TCP_MAXSEG).
+
+.. option:: donorname=str : [e4defrag]
+
+ File will be used as a block donor (swap extents between files).
+
+.. option:: inplace=int : [e4defrag]
+
+ Configure donor file blocks allocation strategy:
+
+ **0**
+ Default. Preallocate donor's file on init.
+ **1**
+ Allocate space immediately inside defragment event, and free right
+ after event.
+
+.. option:: clustername=str : [rbd,rados]
+
+ Specifies the name of the Ceph cluster.
+
+.. option:: rbdname=str : [rbd]
+
+ Specifies the name of the RBD.
+
+.. option:: clientname=str : [rbd,rados]
+
+ Specifies the username (without the 'client.' prefix) used to access the
+ Ceph cluster. If the *clustername* is specified, the *clientname* shall be
+ the full *type.id* string. If no type. prefix is given, fio will add
+ 'client.' by default.
+
+.. option:: conf=str : [rados]
+
+ Specifies the configuration path of ceph cluster, so conf file does not
+ have to be /etc/ceph/ceph.conf.
+
+.. option:: busy_poll=bool : [rbd,rados]
+
+ Poll store instead of waiting for completion. Usually this provides better
+ throughput at the cost of higher (up to 100%) CPU utilization.
+
+.. option:: touch_objects=bool : [rados]
+
+ During initialization, touch (create if they do not exist) all objects (files).
+ Touching all objects affects ceph caches and likely impacts test results.
+ Enabled by default.
+
+.. option:: pool=str :
+
+ [rbd,rados]
+
+ Specifies the name of the Ceph pool containing RBD or RADOS data.
+
+ [dfs]
+
+ Specify the label or UUID of the DAOS pool to connect to.
+
+.. option:: cont=str : [dfs]
+
+ Specify the label or UUID of the DAOS container to open.
+
+.. option:: chunk_size=int
+
+ [dfs]
+
+ Specify a different chunk size (in bytes) for the dfs file.
+ Use DAOS container's chunk size by default.
+
+ [libhdfs]
+
+ The size of the chunk to use for each file.
+
+.. option:: object_class=str : [dfs]
+
+ Specify a different object class for the dfs file.
+ Use DAOS container's object class by default.
+
+.. option:: skip_bad=bool : [mtd]
+
+ Skip operations against known bad blocks.
+
+.. option:: hdfsdirectory : [libhdfs]
+
+ libhdfs will create chunks in this HDFS directory.
+
+.. option:: verb=str : [rdma]
+
+ The RDMA verb to use on this side of the RDMA ioengine connection. Valid
+ values are write, read, send and recv. These correspond to the equivalent
+ RDMA verbs (e.g. write = rdma_write etc.). Note that this only needs to be
+ specified on the client side of the connection. See the examples folder.
+
+.. option:: bindname=str : [rdma]
+
+ The name to use to bind the local RDMA-CM connection to a local RDMA device.
+ This could be a hostname or an IPv4 or IPv6 address. On the server side this
+ will be passed into the rdma_bind_addr() function and on the client side it
+ will be used in the rdma_resolve_addr() function. This can be useful when
+ multiple paths exist between the client and the server or in certain loopback
+ configurations.
+
+.. option:: stat_type=str : [filestat]
+
+ Specify stat system call type to measure lookup/getattr performance.
+ Default is **stat** for :manpage:`stat(2)`.
+
+.. option:: readfua=bool : [sg]
+
+ With readfua option set to 1, read operations include
+ the force unit access (fua) flag. Default is 0.
+
+.. option:: writefua=bool : [sg]
+
+ With writefua option set to 1, write operations include
+ the force unit access (fua) flag. Default is 0.
+
+.. option:: sg_write_mode=str : [sg]
+
+ Specify the type of write commands to issue. This option can take ten values:
+
+ **write**
+ This is the default where write opcodes are issued as usual.
+ **write_and_verify**
+ Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
+ directs the device to carry out a medium verification with no data
+ comparison. The writefua option is ignored with this selection.
+ **verify**
+ This option is deprecated. Use write_and_verify instead.
+ **write_same**
+ Issue WRITE SAME commands. This transfers a single block to the device
+ and writes this same block of data to a contiguous sequence of LBAs
+ beginning at the specified offset. fio's block size parameter specifies
+ the amount of data written with each command. However, the amount of data
+ actually transferred to the device is equal to the device's block
+ (sector) size. For a device with 512 byte sectors, blocksize=8k will
+ write 16 sectors with each command. fio will still generate 8k of data
+ for each command but only the first 512 bytes will be used and
+ transferred to the device. The writefua option is ignored with this
+ selection.
+ **same**
+ This option is deprecated. Use write_same instead.
+ **write_same_ndob**
+ Issue WRITE SAME(16) commands as above but with the No Data Output
+ Buffer (NDOB) bit set. No data will be transferred to the device with
+ this bit set. Data written will be a pre-determined pattern such as
+ all zeroes.
+ **write_stream**
+ Issue WRITE STREAM(16) commands. Use the **stream_id** option to specify
+ the stream identifier.
+ **verify_bytchk_00**
+ Issue VERIFY commands with BYTCHK set to 00. This directs the
+ device to carry out a medium verification with no data comparison.
+ **verify_bytchk_01**
+ Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+ compare the data on the device with the data transferred to the device.
+ **verify_bytchk_11**
+ Issue VERIFY commands with BYTCHK set to 11. This transfers a
+ single block to the device and compares the contents of this block with the
+ data on the device beginning at the specified offset. fio's block size
+ parameter specifies the total amount of data compared with this command.
+ However, only one block (sector) worth of data is transferred to the device.
+ This is similar to the WRITE SAME command except that data is compared instead
+ of written.
+
+.. option:: stream_id=int : [sg]
+
+ Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+ a valid stream identifier) fio will open a stream and then close it when done. Default
+ is 0.
+
+.. option:: http_host=str : [http]
+
+ Hostname to connect to. For S3, this could be the bucket hostname.
+ Default is **localhost**
+
+.. option:: http_user=str : [http]
+
+ Username for HTTP authentication.
+
+.. option:: http_pass=str : [http]
+
+ Password for HTTP authentication.
+
+.. option:: https=str : [http]
+
+ Enable HTTPS instead of http. *on* enables HTTPS; *insecure*
+ will enable HTTPS, but disable SSL peer verification (use with
+ caution!). Default is **off**
+
+.. option:: http_mode=str : [http]
+
+ Which HTTP access mode to use: *webdav*, *swift*, or *s3*.
+ Default is **webdav**
+
+.. option:: http_s3_region=str : [http]
+
+ The S3 region/zone string.
+ Default is **us-east-1**
+
+.. option:: http_s3_key=str : [http]
+
+ The S3 secret key.
+
+.. option:: http_s3_keyid=str : [http]
+
+ The S3 key/access id.
+
+.. option:: http_s3_sse_customer_key=str : [http]
+
+ The encryption customer key in SSE server side.
+
+.. option:: http_s3_sse_customer_algorithm=str : [http]
+
+ The encryption customer algorithm in SSE server side.
+ Default is **AES256**
+
+.. option:: http_s3_storage_class=str : [http]
+
+ Which storage class to access. User-customizable settings.
+ Default is **STANDARD**
+
+.. option:: http_swift_auth_token=str : [http]
+
+ The Swift auth token. See the example configuration file on how
+ to retrieve this.
+
+.. option:: http_verbose=int : [http]
+
+ Enable verbose requests from libcurl. Useful for debugging. 1
+ turns on verbose logging from libcurl, 2 additionally enables
+ HTTP IO tracing. Default is **0**
+
+.. option:: uri=str : [nbd]
+
+ Specify the NBD URI of the server to test. The string
+ is a standard NBD URI
+ (see https://github.com/NetworkBlockDevice/nbd/tree/master/doc).
+ Example URIs: nbd://localhost:10809
+ nbd+unix:///?socket=/tmp/socket
+ nbds://tlshost/exportname
+
+.. option:: gpu_dev_ids=str : [libcufile]
+
+ Specify the GPU IDs to use with CUDA. This is a colon-separated list of
+ int. GPUs are assigned to workers roundrobin. Default is 0.
+
+.. option:: cuda_io=str : [libcufile]
+
+ Specify the type of I/O to use with CUDA. Default is **cufile**.
+
+ **cufile**
+ Use libcufile and nvidia-fs. This option performs I/O directly
+ between a GPUDirect Storage filesystem and GPU buffers,
+ avoiding use of a bounce buffer. If :option:`verify` is set,
+ cudaMemcpy is used to copy verification data between RAM and GPU.
+ Verification data is copied from RAM to GPU before a write
+ and from GPU to RAM after a read. :option:`direct` must be 1.
+ **posix**
+ Use POSIX to perform I/O with a RAM buffer, and use cudaMemcpy
+ to transfer data between RAM and the GPUs. Data is copied from
+ GPU to RAM before a write and copied from RAM to GPU after a
+ read. :option:`verify` does not affect use of cudaMemcpy.
+
+.. option:: nfs_url=str : [nfs]
+
+ URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
+ Refer to the libnfs README for more details.
+
+.. option:: program=str : [exec]
+
+ Specify the program to execute.
+
+.. option:: arguments=str : [exec]
+
+ Specify arguments to pass to program.
+ Some special variables can be expanded to pass fio's job details to the program.
+
+ **%r**
+ Replaced by the duration of the job in seconds.
+ **%n**
+ Replaced by the name of the job.
+
+.. option:: grace_time=int : [exec]
+
+ Specify the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+
+.. option:: std_redirect=bool : [exec]
+
+ If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+
+.. option:: xnvme_async=str : [xnvme]
+
+ Select the xnvme async command interface. This can take these values.
+
+ **emu**
+ This is the default and is used to emulate asynchronous I/O by using a
+ single thread to create a queue pair on top of a synchronous
+ I/O interface using the NVMe driver IOCTL.
+ **thrpool**
+ Emulate an asynchronous I/O interface with a pool of userspace
+ threads on top of a synchronous I/O interface using the NVMe
+ driver IOCTL. By default four threads are used.
+ **io_uring**
+ Linux native asynchronous I/O interface which supports both
+ direct and buffered I/O.
+ **io_uring_cmd**
+ Fast Linux native asynchronous I/O interface for NVMe pass
+ through commands. This only works with NVMe character device
+ (/dev/ngXnY).
+ **libaio**
+ Use Linux aio for Asynchronous I/O.
+ **posix**
+ Use the posix asynchronous I/O interface to perform one or
+ more I/O operations asynchronously.
+ **vfio**
+ Use the user-space VFIO-based backend, implemented using
+ libvfn instead of SPDK.
+ **nil**
+ Do not transfer any data; just pretend to. This is mainly used
+ for introspective performance evaluation.
+
+.. option:: xnvme_sync=str : [xnvme]
+
+ Select the xnvme synchronous command interface. This can take these values.
+
+ **nvme**
+ This is default and uses Linux NVMe Driver ioctl() for
+ synchronous I/O.
+ **psync**
+ This supports regular as well as vectored pread() and pwrite()
+ commands.
+ **block**
+ This is the same as psync except that it also supports zone
+ management commands using Linux block layer IOCTLs.
+
+.. option:: xnvme_admin=str : [xnvme]
+
+ Select the xnvme admin command interface. This can take these values.
+
+ **nvme**
+ This is default and uses Linux NVMe Driver ioctl() for admin
+ commands.
+ **block**
+ Use Linux Block Layer ioctl() and sysfs for admin commands.
+
+.. option:: xnvme_dev_nsid=int : [xnvme]
+
+ xnvme namespace identifier for userspace NVMe driver, SPDK or vfio.
+
+.. option:: xnvme_dev_subnqn=str : [xnvme]
+
+ Sets the subsystem NQN for fabrics. This is for xNVMe to utilize a
+ fabrics target with multiple systems.
+
+.. option:: xnvme_mem=str : [xnvme]
+
+ Select the xnvme memory backend. This can take these values.
+
+ **posix**
+ This is the default POSIX memory backend for the Linux NVMe driver.
+ **hugepage**
+ Use hugepages, instead of existing posix memory backend. The
+ memory backend uses hugetlbfs. This requires users to allocate
+ hugepages, mount hugetlbfs and set an environment variable for
+ XNVME_HUGETLB_PATH.
+ **spdk**
+ Uses SPDK's memory allocator.
+ **vfio**
+ Uses libvfn's memory allocator. This also specifies the use
+ of libvfn backend instead of SPDK.
+
+.. option:: xnvme_iovec=int : [xnvme]
+
+ If this option is set, xnvme will use vectored read/write commands.
+
+.. option:: libblkio_driver=str : [libblkio]
+
+ The libblkio *driver* to use. Different drivers access devices through
+ different underlying interfaces. Available drivers depend on the
+ libblkio version in use and are listed at
+ https://libblkio.gitlab.io/libblkio/blkio.html#drivers
+
+.. option:: libblkio_path=str : [libblkio]
+
+ Sets the value of the driver-specific "path" property before connecting
+ the libblkio instance, which identifies the target device or file on
+ which to perform I/O. Its exact semantics are driver-dependent and not
+ all drivers may support it; see
+ https://libblkio.gitlab.io/libblkio/blkio.html#drivers
+
+.. option:: libblkio_pre_connect_props=str : [libblkio]
+
+ A colon-separated list of additional libblkio properties to be set after
+ creating but before connecting the libblkio instance. Each property must
+ have the format ``<name>=<value>``. Colons can be escaped as ``\:``.
+ These are set after the engine sets any other properties, so those can
+ be overridden. Available properties depend on the libblkio version in use
+ and are listed at
+ https://libblkio.gitlab.io/libblkio/blkio.html#properties
+
+.. option:: libblkio_num_entries=int : [libblkio]
+
+ Sets the value of the driver-specific "num-entries" property before
+ starting the libblkio instance. Its exact semantics are driver-dependent
+ and not all drivers may support it; see
+ https://libblkio.gitlab.io/libblkio/blkio.html#drivers
+
+.. option:: libblkio_queue_size=int : [libblkio]
+
+ Sets the value of the driver-specific "queue-size" property before
+ starting the libblkio instance. Its exact semantics are driver-dependent
+ and not all drivers may support it; see
+ https://libblkio.gitlab.io/libblkio/blkio.html#drivers
+
+.. option:: libblkio_pre_start_props=str : [libblkio]
+
+ A colon-separated list of additional libblkio properties to be set after
+ connecting but before starting the libblkio instance. Each property must
+ have the format ``<name>=<value>``. Colons can be escaped as ``\:``.
+ These are set after the engine sets any other properties, so those can
+ be overridden. Available properties depend on the libblkio version in use
+ and are listed at
+ https://libblkio.gitlab.io/libblkio/blkio.html#properties
+
+.. option:: libblkio_vectored : [libblkio]
+
+ Submit vectored read and write requests.
+
+.. option:: libblkio_write_zeroes_on_trim : [libblkio]
+
+ Submit trims as "write zeroes" requests instead of discard requests.
+
+.. option:: libblkio_wait_mode=str : [libblkio]
+
+ How to wait for completions:
+
+ **block** (default)
+ Use a blocking call to ``blkioq_do_io()``.
+ **eventfd**
+ Use a blocking call to ``read()`` on the completion eventfd.
+ **loop**
+ Use a busy loop with a non-blocking call to ``blkioq_do_io()``.
+
+.. option:: libblkio_force_enable_completion_eventfd : [libblkio]
+
+ Enable the queue's completion eventfd even when unused. This may impact
+ performance. The default is to enable it only if
+ :option:`libblkio_wait_mode=eventfd <libblkio_wait_mode>`.
+
+.. option:: no_completion_thread : [windowsaio]
+
+ Avoid using a separate thread for completion polling.
+
+I/O depth
+~~~~~~~~~
+
+.. option:: iodepth=int
+
+ Number of I/O units to keep in flight against the file. Note that
+ increasing *iodepth* beyond 1 will not affect synchronous ioengines (except
+ for small degrees when :option:`verify_async` is in use). Even async
+ engines may impose OS restrictions causing the desired depth not to be
+ achieved. This may happen on Linux when using libaio and not setting
+ :option:`direct`\=1, since buffered I/O is not async on that OS. Keep an
+ eye on the I/O depth distribution in the fio output to verify that the
+ achieved depth is as expected. Default: 1.
+
+.. option:: iodepth_batch_submit=int, iodepth_batch=int
+
+ This defines how many pieces of I/O to submit at once. It defaults to 1
+ which means that we submit each I/O as soon as it is available, but can be
+ raised to submit bigger batches of I/O at a time. If it is set to 0 the
+ :option:`iodepth` value will be used.
+
+.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int
+
+ This defines how many pieces of I/O to retrieve at once. It defaults to 1
+ which means that we'll ask for a minimum of 1 I/O in the retrieval process
+ from the kernel. The I/O retrieval will go on until we hit the limit set by
+ :option:`iodepth_low`. If this variable is set to 0, then fio will always
+ check for completed events before queuing more I/O. This helps reduce I/O
+ latency, at the cost of more retrieval system calls.
+
+.. option:: iodepth_batch_complete_max=int
+
+ This defines the maximum number of pieces of I/O to retrieve at once. This variable should
+ be used along with :option:`iodepth_batch_complete_min`\=int variable,
+ specifying the range of min and max amount of I/O which should be
+ retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
+ value.
+
+ Example #1::
+
+ iodepth_batch_complete_min=1
+ iodepth_batch_complete_max=<iodepth>
+
+ which means that we will retrieve at least 1 I/O and up to the whole
+ submitted queue depth. If none of I/O has been completed yet, we will wait.
+
+ Example #2::
+
+ iodepth_batch_complete_min=0
+ iodepth_batch_complete_max=<iodepth>
+
+ which means that we can retrieve up to the whole submitted queue depth, but
+ if none of I/O has been completed yet, we will NOT wait and immediately exit
+ the system call. In this example we simply do polling.
+
+.. option:: iodepth_low=int
+
+ The low water mark indicating when to start filling the queue
+ again. Defaults to the same as :option:`iodepth`, meaning that fio will
+ attempt to keep the queue full at all times. If :option:`iodepth` is set to
+ e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of
+ 16 requests, it will let the depth drain down to 4 before starting to fill
+ it again.
+
+.. option:: serialize_overlap=bool
+
+ Serialize in-flight I/Os that might otherwise cause or suffer from data races.
+ When two or more I/Os are submitted simultaneously, there is no guarantee that
+ the I/Os will be processed or completed in the submitted order. Further, if
+ two or more of those I/Os are writes, any overlapping region between them can
+ become indeterminate/undefined on certain storage. These issues can cause
+ verification to fail erratically when at least one of the racing I/Os is
+ changing data and the overlapping region has a non-zero size. Setting
+ ``serialize_overlap`` tells fio to avoid provoking this behavior by explicitly
+ serializing in-flight I/Os that have a non-zero overlap. Note that setting
+ this option can reduce both performance and the :option:`iodepth` achieved.
+
+ This option only applies to I/Os issued for a single job except when it is
+ enabled along with :option:`io_submit_mode`\=offload. In offload mode, fio
+ will check for overlap among all I/Os submitted by offload jobs with :option:`serialize_overlap`
+ enabled.
+
+ Default: false.
+
+.. option:: io_submit_mode=str
+
+ This option controls how fio submits the I/O to the I/O engine. The default
+ is `inline`, which means that the fio job threads submit and reap I/O
+ directly. If set to `offload`, the job threads will offload I/O submission
+ to a dedicated pool of I/O threads. This requires some coordination and thus
+ has a bit of extra overhead, especially for lower queue depth I/O where it
+ can increase latencies. The benefit is that fio can manage submission rates
+ independently of the device completion rates. This avoids skewed latency
+ reporting if I/O gets backed up on the device side (the coordinated omission
+ problem). Note that this option cannot reliably be used with async IO
+ engines.
+
+
+I/O rate
+~~~~~~~~
+
+.. option:: thinkcycles=int
+
+ Stall the job for the specified number of cycles after an I/O has completed before
+ issuing the next. May be used to simulate processing being done by an application.
+ This is not taken into account for the time to be waited on for :option:`thinktime`.
+ Might not have any effect on some platforms; this can be checked by trying a
+ high enough amount of thinkcycles.
+
+.. option:: thinktime=time
+
+ Stall the job for the specified period of time after an I/O has completed before issuing the
+ next. May be used to simulate processing being done by an application.
+ When the unit is omitted, the value is interpreted in microseconds. See
+ :option:`thinktime_blocks`, :option:`thinktime_iotime` and :option:`thinktime_spin`.
+
+.. option:: thinktime_spin=time
+
+ Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
+ something with the data received, before falling back to sleeping for the
+ rest of the period specified by :option:`thinktime`. When the unit is
+ omitted, the value is interpreted in microseconds.
+
+.. option:: thinktime_blocks=int
+
+ Only valid if :option:`thinktime` is set - control how many blocks to issue,
+ before waiting :option:`thinktime` usecs. If not set, defaults to 1 which will make
+ fio wait :option:`thinktime` usecs after every block. This effectively makes any
+ queue depth setting redundant, since no more than 1 I/O will be queued
+ before we have to complete it and do our :option:`thinktime`. In other words, this
+ setting effectively caps the queue depth if the latter is larger.
+
+.. option:: thinktime_blocks_type=str
+
+ Only valid if :option:`thinktime` is set - control how :option:`thinktime_blocks`
+ triggers. The default is `complete`, which triggers thinktime when fio completes
+ :option:`thinktime_blocks` blocks. If this is set to `issue`, then the trigger happens
+ at the issue side.
+
+.. option:: thinktime_iotime=time
+
+ Only valid if :option:`thinktime` is set - control :option:`thinktime`
+ interval by time. The :option:`thinktime` stall is repeated after IOs
+ are executed for :option:`thinktime_iotime`. For example,
+ ``--thinktime_iotime=9s --thinktime=1s`` repeats a 10-second cycle with IOs
+ for 9 seconds and stall for 1 second. When the unit is omitted,
+ :option:`thinktime_iotime` is interpreted as a number of seconds. If
+ this option is used together with :option:`thinktime_blocks`, the
+ :option:`thinktime` stall is repeated after :option:`thinktime_iotime`
+ or after :option:`thinktime_blocks` IOs, whichever happens first.
+
+.. option:: rate=int[,int][,int]
+
+ Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+ suffix rules apply. Comma-separated values may be specified for reads,
+ writes, and trims as described in :option:`blocksize`.
+
+ For example, using `rate=1m,500k` would limit reads to 1MiB/sec and writes to
+ 500KiB/sec. Capping only reads or writes can be done with `rate=,500k` or
+ `rate=500k,` where the former will only limit writes (to 500KiB/sec) and the
+ latter will only limit reads.
+
+.. option:: rate_min=int[,int][,int]
+
+ Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+ to meet this requirement will cause the job to exit. Comma-separated values
+ may be specified for reads, writes, and trims as described in
+ :option:`blocksize`.
+
+.. option:: rate_iops=int[,int][,int]
+
+ Cap the bandwidth to this number of IOPS. Basically the same as
+ :option:`rate`, just specified independently of bandwidth. If the job is
+ given a block size range instead of a fixed value, the smallest block size
+ is used as the metric. Comma-separated values may be specified for reads,
+ writes, and trims as described in :option:`blocksize`.
+
+.. option:: rate_iops_min=int[,int][,int]
+
+ If fio doesn't meet this rate of I/O, it will cause the job to exit.
+ Comma-separated values may be specified for reads, writes, and trims as
+ described in :option:`blocksize`.
+
+.. option:: rate_process=str
+
+ This option controls how fio manages rated I/O submissions. The default is
+ `linear`, which submits I/O in a linear fashion with fixed delays between
+ I/Os that gets adjusted based on I/O completion rates. If this is set to
+ `poisson`, fio will submit I/O based on a more real world random request
+ flow, known as the Poisson process
+ (https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be
+ 10^6 / IOPS for the given workload.
+
+.. option:: rate_ignore_thinktime=bool
+
+ By default, fio will attempt to catch up to the specified rate setting,
+ if any kind of thinktime setting was used. If this option is set, then
+ fio will ignore the thinktime and continue doing IO at the specified
+ rate, instead of entering a catch-up mode after thinktime is done.
+
+.. option:: rate_cycle=int
+
+ Average bandwidth for :option:`rate_min` and :option:`rate_iops_min`
+ over this number of milliseconds. Defaults to 1000.
+
+
+I/O latency
+~~~~~~~~~~~
+
+.. option:: latency_target=time
+
+ If set, fio will attempt to find the max performance point that the given
+ workload will run at while maintaining a latency below this target. When
+ the unit is omitted, the value is interpreted in microseconds. See
+ :option:`latency_window` and :option:`latency_percentile`.
+
+.. option:: latency_window=time
+
+ Used with :option:`latency_target` to specify the sample window that the job
+ is run at varying queue depths to test the performance. When the unit is
+ omitted, the value is interpreted in microseconds.
+
+.. option:: latency_percentile=float
+
+ The percentage of I/Os that must fall within the criteria specified by
+ :option:`latency_target` and :option:`latency_window`. If not set, this
+ defaults to 100.0, meaning that all I/Os must be equal to or below the value
+ set by :option:`latency_target`.
+
+.. option:: latency_run=bool
+
+ Used with :option:`latency_target`. If false (default), fio will find
+ the highest queue depth that meets :option:`latency_target` and exit. If
+ true, fio will continue running and try to meet :option:`latency_target`
+ by adjusting queue depth.
+
+.. option:: max_latency=time[,time][,time]
+
+ If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+ maximum latency. When the unit is omitted, the value is interpreted in
+ microseconds. Comma-separated values may be specified for reads, writes,
+ and trims as described in :option:`blocksize`.
+
+
+I/O replay
+~~~~~~~~~~
+
+.. option:: write_iolog=str
+
+ Write the issued I/O patterns to the specified file. See
+ :option:`read_iolog`. Specify a separate file for each job, otherwise the
+ iologs will be interspersed and the file may be corrupt. This file will
+ be opened in append mode.
+
+.. option:: read_iolog=str
+
+ Open an iolog with the specified filename and replay the I/O patterns it
+ contains. This can be used to store a workload and replay it sometime
+ later. The iolog given may also be a blktrace binary file, which allows fio
+ to replay a workload captured by :command:`blktrace`. See
+ :manpage:`blktrace(8)` for how to capture such logging data. For blktrace
+ replay, the file needs to be turned into a blkparse binary data file first
+ (``blkparse <device> -o /dev/null -d file_for_fio.bin``).
+ You can specify a number of files by separating the names with a ':'
+ character. See the :option:`filename` option for information on how to
+ escape ':' characters within the file names. These files will
+ be sequentially assigned to job clones created by :option:`numjobs`.
+ '-' is a reserved name, meaning read from stdin, notably if
+ :option:`filename` is set to '-' which means stdin as well, then
+ this flag can't be set to '-'.
+
+.. option:: read_iolog_chunked=bool
+
+ Determines how iolog is read. If false (default), the entire :option:`read_iolog`
+ will be read at once. If selected true, input from iolog will be read
+ gradually. Useful when iolog is very large, or it is generated.
+
+.. option:: merge_blktrace_file=str
+
+ When specified, rather than replaying the logs passed to :option:`read_iolog`,
+ the logs go through a merge phase which aggregates them into a single
+ blktrace. The resulting file is then passed on as the :option:`read_iolog`
+ parameter. The intention here is to make the order of events consistent.
+ This limits the influence of the scheduler compared to replaying multiple
+ blktraces via concurrent jobs.
+
+.. option:: merge_blktrace_scalars=float_list
+
+ This is a percentage based option that is index paired with the list of
+ files passed to :option:`read_iolog`. When merging is performed, scale
+ the time of each event by the corresponding amount. For example,
+ ``--merge_blktrace_scalars="50:100"`` runs the first trace in halftime
+ and the second trace in realtime. This knob is separately tunable from
+ :option:`replay_time_scale` which scales the trace during runtime and
+ does not change the output of the merge unlike this option.
+
+.. option:: merge_blktrace_iters=float_list
+
+ This is a whole number option that is index paired with the list of files
+ passed to :option:`read_iolog`. When merging is performed, run each trace
+ for the specified number of iterations. For example,
+ ``--merge_blktrace_iters="2:1"`` runs the first trace for two iterations
+ and the second trace for one iteration.
+
+.. option:: replay_no_stall=bool
+
+ When replaying I/O with :option:`read_iolog` the default behavior is to
+ attempt to respect the timestamps within the log and replay them with the
+ appropriate delay between IOPS. By setting this variable fio will not
+ respect the timestamps and attempt to replay them as fast as possible while
+ still respecting ordering. The result is the same I/O pattern to a given
+ device, but different timings.
+
+.. option:: replay_time_scale=int
+
+ When replaying I/O with :option:`read_iolog`, fio will honor the
+ original timing in the trace. With this option, it's possible to scale
+ the time. It's a percentage option, if set to 50 it means run at 50%
+ the original IO rate in the trace. If set to 200, run at twice the
+ original IO rate. Defaults to 100.
+
+.. option:: replay_redirect=str
+
+ While replaying I/O patterns using :option:`read_iolog` the default behavior
+ is to replay the IOPS onto the major/minor device that each IOP was recorded
+ from. This is sometimes undesirable because on a different machine those
+ major/minor numbers can map to a different device. Changing hardware on the
+ same system can also result in a different major/minor mapping.
+ ``replay_redirect`` causes all I/Os to be replayed onto the single specified
+ device regardless of the device it was recorded
+ from. i.e. :option:`replay_redirect`\= :file:`/dev/sdc` would cause all I/O
+ in the blktrace or iolog to be replayed onto :file:`/dev/sdc`. This means
+ multiple devices will be replayed onto a single device, if the trace
+ contains multiple devices. If you want multiple devices to be replayed
+ concurrently to multiple redirected devices you must blkparse your trace
+ into separate traces and replay them with independent fio invocations.
+ Unfortunately this also breaks the strict time ordering between multiple
+ device accesses.
+
+.. option:: replay_align=int
+
+ Force alignment of the byte offsets in a trace to this value. The value
+ must be a power of 2.
+
+.. option:: replay_scale=int
+
+ Scale byte offsets down by this factor when replaying traces. Should most
+ likely use :option:`replay_align` as well.
+
+.. option:: replay_skip=str
+
+ Sometimes it's useful to skip certain IO types in a replay trace.
+ This could be, for instance, eliminating the writes in the trace.
+ Or not replaying the trims/discards, if you are redirecting to
+ a device that doesn't support them. This option takes a comma
+ separated list of read, write, trim, sync.
+
+
+Threads, processes and job synchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: thread
+
+ Fio defaults to creating jobs by using fork, however if this option is
+ given, fio will create jobs by using POSIX Threads' function
+ :manpage:`pthread_create(3)` to create threads instead.
+
+.. option:: wait_for=str
+
+ If set, the current job won't be started until all workers of the specified
+ waitee job are done.
+
+ ``wait_for`` operates on the job name basis, so there are a few
+ limitations. First, the waitee must be defined prior to the waiter job
+ (meaning no forward references). Second, if a job is being referenced as a
+ waitee, it must have a unique name (no duplicate waitees).
+
+.. option:: nice=int
+
+ Run the job with the given nice value. See man :manpage:`nice(2)`.
+
+ On Windows, values less than -15 set the process class to "High"; -1 through
+ -15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+ priority class.
+
+.. option:: prio=int
+
+ Set the I/O priority value of this job. Linux limits us to a positive value
+ between 0 and 7, with 0 being the highest. See man
+ :manpage:`ionice(1)`. Refer to an appropriate manpage for other operating
+ systems since meaning of priority may differ. For per-command priority
+ setting, see I/O engine specific :option:`cmdprio_percentage` and
+ :option:`cmdprio` options.
+
+.. option:: prioclass=int
+
+ Set the I/O priority class. See man :manpage:`ionice(1)`. For per-command
+ priority setting, see I/O engine specific :option:`cmdprio_percentage`
+ and :option:`cmdprio_class` options.
+
+.. option:: priohint=int
+
+ Set the I/O priority hint. This is only applicable to platforms that
+ support I/O priority classes and to devices with features controlled
+ through priority hints, e.g. block devices supporting command duration
+ limits, or CDL. CDL is a way to indicate the desired maximum latency
+ of I/Os so that the device can optimize its internal command scheduling
+ according to the latency limits indicated by the user.
+
+ For per-I/O priority hint setting, see the I/O engine specific
+ :option:`cmdprio_hint` option.
+
+.. option:: cpus_allowed=str
+
+ Controls the same options as :option:`cpumask`, but accepts a textual
+ specification of the permitted CPUs instead and CPUs are indexed from 0. So
+ to use CPUs 0 and 5 you would specify ``cpus_allowed=0,5``. This option also
+ allows a range of CPUs to be specified -- say you wanted a binding to CPUs
+ 0, 5, and 8 to 15, you would set ``cpus_allowed=0,5,8-15``.
+
+ On Windows, when ``cpus_allowed`` is unset only CPUs from fio's current
+ processor group will be used and affinity settings are inherited from the
+ system. An fio build configured to target Windows 7 makes options that set
+ CPUs processor group aware and values will set both the processor group
+ and a CPU from within that group. For example, on a system where processor
+ group 0 has 40 CPUs and processor group 1 has 32 CPUs, ``cpus_allowed``
+ values between 0 and 39 will bind CPUs from processor group 0 and
+ ``cpus_allowed`` values between 40 and 71 will bind CPUs from processor
+ group 1. When using ``cpus_allowed_policy=shared`` all CPUs specified by a
+ single ``cpus_allowed`` option must be from the same processor group. For
+ Windows fio builds not built for Windows 7, CPUs will only be selected from
+ (and be relative to) whatever processor group fio happens to be running in
+ and CPUs from other processor groups cannot be used.
+
+.. option:: cpus_allowed_policy=str
+
+ Set the policy of how fio distributes the CPUs specified by
+ :option:`cpus_allowed` or :option:`cpumask`. Two policies are supported:
+
+ **shared**
+ All jobs will share the CPU set specified.
+ **split**
+ Each job will get a unique CPU from the CPU set.
+
+ **shared** is the default behavior, if the option isn't specified. If
+ **split** is specified, then fio will assign one cpu per job. If not
+ enough CPUs are given for the jobs listed, then fio will roundrobin the CPUs
+ in the set.
+
+.. option:: cpumask=int
+
+ Set the CPU affinity of this job. The parameter given is a bit mask of
+ allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+ and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+ :manpage:`sched_setaffinity(2)`. This may not work on all supported
+ operating systems or kernel versions. This option doesn't work well for a
+ higher CPU count than what you can store in an integer mask, so it can only
+ control cpus 1-32. For boxes with larger CPU counts, use
+ :option:`cpus_allowed`.
+
+.. option:: numa_cpu_nodes=str
+
+ Set this job running on specified NUMA nodes' CPUs. The arguments allow
+ comma delimited list of cpu numbers, A-B ranges, or `all`. Note, to enable
+ NUMA options support, fio must be built on a system with libnuma-dev(el)
+ installed.
+
+.. option:: numa_mem_policy=str
+
+ Set this job's memory policy and corresponding NUMA nodes. Format of the
+ arguments::
+
+ <mode>[:<nodelist>]
+
+ ``mode`` is one of the following memory policies: ``default``, ``prefer``,
+ ``bind``, ``interleave`` or ``local``. For ``default`` and ``local`` memory
+ policies, no node needs to be specified. For ``prefer``, only one node is
+ allowed. For ``bind`` and ``interleave`` the ``nodelist`` may be as
+ follows: a comma delimited list of numbers, A-B ranges, or `all`.
+
+.. option:: cgroup=str
+
+ Add job to this control group. If it doesn't exist, it will be created. The
+ system must have a mounted cgroup blkio mount point for this to work. If
+ your system doesn't have it mounted, you can do so with::
+
+ # mount -t cgroup -o blkio none /cgroup
+
+.. option:: cgroup_weight=int
+
+ Set the weight of the cgroup to this value. See the documentation that comes
+ with the kernel, allowed values are in the range of 100..1000.
+
+.. option:: cgroup_nodelete=bool
+
+ Normally fio will delete the cgroups it has created after the job
+ completion. To override this behavior and to leave cgroups around after the
+ job completion, set ``cgroup_nodelete=1``. This can be useful if one wants
+ to inspect various cgroup files after job completion. Default: false.
+
+.. option:: flow_id=int
+
+ The ID of the flow. If not specified, it defaults to being a global
+ flow. See :option:`flow`.
+
+.. option:: flow=int
+
+ Weight in token-based flow control. If this value is used, then fio
+ regulates the activity between two or more jobs sharing the same
+ flow_id. Fio attempts to keep each job activity proportional to other
+ jobs' activities in the same flow_id group, with respect to requested
+ weight per job. That is, if one job has `flow=3`, another job has
+ `flow=2` and another with `flow=1`, then there will be a roughly 3:2:1
+ ratio in how much one runs vs the others.
+
+.. option:: flow_sleep=int
+
+ The period of time, in microseconds, to wait after the flow counter
+ has exceeded its proportion before retrying operations.
+
+.. option:: stonewall, wait_for_previous
+
+ Wait for preceding jobs in the job file to exit, before starting this
+ one. Can be used to insert serialization points in the job file. A stone
+ wall also implies starting a new reporting group, see
+ :option:`group_reporting`.
+
+.. option:: exitall
+
+ By default, fio will continue running all other jobs when one job finishes.
+ Sometimes this is not the desired action. Setting ``exitall`` will instead
+ make fio terminate all jobs in the same group, as soon as one job of that
+ group finishes.
+
+.. option:: exit_what=str
+
+ By default, fio will continue running all other jobs when one job finishes.
+ Sometimes this is not the desired action. Setting ``exitall`` will
+ instead make fio terminate all jobs in the same group. The option
+ ``exit_what`` allows one to control which jobs get terminated when ``exitall``
+ is enabled. The default is ``group`` and does not change the behaviour of
+ ``exitall``. The setting ``all`` terminates all jobs. The setting ``stonewall``
+ terminates all currently running jobs across all groups and continues execution
+ with the next stonewalled group.
+
+.. option:: exec_prerun=str
+
+ Before running this job, issue the command specified through
+ :manpage:`system(3)`. Output is redirected in a file called
+ :file:`jobname.prerun.txt`.
+
+.. option:: exec_postrun=str
+
+ After the job completes, issue the command specified through
+ :manpage:`system(3)`. Output is redirected in a file called
+ :file:`jobname.postrun.txt`.
+
+.. option:: uid=int
+
+ Instead of running as the invoking user, set the user ID to this value
+ before the thread/process does any work.
+
+.. option:: gid=int
+
+ Set group ID, see :option:`uid`.
+
+
+Verification
+~~~~~~~~~~~~
+
+.. option:: verify_only
+
+ Do not perform specified workload, only verify data still matches previous
+ invocation of this workload. This option allows one to check data multiple
+ times at a later date without overwriting it. This option makes sense only
+ for workloads that write data, and does not support workloads with the
+ :option:`time_based` option set.
+
+.. option:: do_verify=bool
+
+ Run the verify phase after a write phase. Only valid if :option:`verify` is
+ set. Default: true.
+
+.. option:: verify=str
+
+ If writing to a file, fio can verify the file contents after each iteration
+ of the job. Each verification method also implies verification of a special
+ header, which is written to the beginning of each block. This header also
+ includes meta information, like offset of the block, block number, timestamp
+ when block was written, etc. :option:`verify` can be combined with
+ :option:`verify_pattern` option. The allowed values are:
+
+ **md5**
+ Use an md5 sum of the data area and store it in the header of
+ each block.
+
+ **crc64**
+ Use an experimental crc64 sum of the data area and store it in the
+ header of each block.
+
+ **crc32c**
+ Use a crc32c sum of the data area and store it in the header of
+ each block. This will automatically use hardware acceleration
+ (e.g. SSE4.2 on an x86 or CRC crypto extensions on ARM64) but will
+ fall back to software crc32c if none is found. Generally the
+ fastest checksum fio supports when hardware accelerated.
+
+ **crc32c-intel**
+ Synonym for crc32c.
+
+ **crc32**
+ Use a crc32 sum of the data area and store it in the header of each
+ block.
+
+ **crc16**
+ Use a crc16 sum of the data area and store it in the header of each
+ block.
+
+ **crc7**
+ Use a crc7 sum of the data area and store it in the header of each
+ block.
+
+ **xxhash**
+ Use xxhash as the checksum function. Generally the fastest software
+ checksum that fio supports.
+
+ **sha512**
+ Use sha512 as the checksum function.
+
+ **sha256**
+ Use sha256 as the checksum function.
+
+ **sha1**
+ Use optimized sha1 as the checksum function.
+
+ **sha3-224**
+ Use optimized sha3-224 as the checksum function.
+
+ **sha3-256**
+ Use optimized sha3-256 as the checksum function.
+
+ **sha3-384**
+ Use optimized sha3-384 as the checksum function.
+
+ **sha3-512**
+ Use optimized sha3-512 as the checksum function.
+
+ **meta**
+ This option is deprecated, since now meta information is included in
+ generic verification header and meta verification happens by
+ default. For detailed information see the description of the
+ :option:`verify` setting. This option is kept for
+ compatibility with old configurations. Do not use it.
+
+ **pattern**
+ Verify a strict pattern. Normally fio includes a header with some
+ basic information and checksumming, but if this option is set, only
+ the specific pattern set with :option:`verify_pattern` is verified.
+
+ **null**
+ Only pretend to verify. Useful for testing internals with
+ :option:`ioengine`\=null, not for much else.
+
+ This option can be used for repeated burn-in tests of a system to make sure
+ that the written data is also correctly read back. If the data direction
+ given is a read or random read, fio will assume that it should verify a
+ previously written file. If the data direction includes any form of write,
+ the verify will be of the newly written data.
+
+ To avoid false verification errors, do not use the norandommap option when
+ verifying data with async I/O engines and I/O depths > 1. Or use the
+ norandommap and the lfsr random generator together to avoid writing to the
+ same offset with multiple outstanding I/Os.
+
+.. option:: verify_offset=int
+
+ Swap the verification header with data somewhere else in the block before
+ writing. It is swapped back before verifying.
+
+.. option:: verify_interval=int
+
+ Write the verification header at a finer granularity than the
+ :option:`blocksize`. It will be written for chunks the size of
+ ``verify_interval``. :option:`blocksize` should be evenly divisible by this value.
+
+.. option:: verify_pattern=str
+
+ If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+ filling with totally random bytes, but sometimes it's interesting to fill
+ with a known pattern for I/O verification purposes. Depending on the width
+ of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+ be either a decimal or a hex number). The ``verify_pattern`` if larger than
+ a 32-bit quantity has to be a hex number that starts with either "0x" or
+ "0X". Use with :option:`verify`. Also, ``verify_pattern`` supports %o
+ format, which means that for each block its offset will be written and then
+ verified back, e.g.::
+
+ verify_pattern=%o
+
+ Or use a combination of everything::
+
+ verify_pattern=0xff%o"abcd"-12
+
+.. option:: verify_fatal=bool
+
+ Normally fio will keep checking the entire contents before quitting on a
+ block verification failure. If this option is set, fio will exit the job on
+ the first observed failure. Default: false.
+
+.. option:: verify_dump=bool
+
+ If set, dump the contents of both the original data block and the data block
+ we read off disk to files. This allows later analysis to inspect just what
+ kind of data corruption occurred. Off by default.
+
+.. option:: verify_async=int
+
+ Fio will normally verify I/O inline from the submitting thread. This option
+ takes an integer describing how many async offload threads to create for I/O
+ verification instead, causing fio to offload the duty of verifying I/O
+ contents to one or more separate threads. If using this offload option, even
+ sync I/O engines can benefit from using an :option:`iodepth` setting higher
+ than 1, as it allows them to have I/O in flight while verifies are running.
+ Defaults to 0 async threads, i.e. verification is not asynchronous.
+
+.. option:: verify_async_cpus=str
+
+ Tell fio to set the given CPU affinity on the async I/O verification
+ threads. See :option:`cpus_allowed` for the format used.
+
+.. option:: verify_backlog=int
+
+ Fio will normally verify the written contents of a job that utilizes verify
+ once that job has completed. In other words, everything is written then
+ everything is read back and verified. You may want to verify continually
+ instead for a variety of reasons. Fio stores the meta data associated with
+ an I/O block in memory, so for large verify workloads, quite a bit of memory
+ would be used up holding this meta data. If this option is enabled, fio will
+ write only N blocks before verifying these blocks.
+
+.. option:: verify_backlog_batch=int
+
+ Control how many blocks fio will verify if :option:`verify_backlog` is
+ set. If not set, will default to the value of :option:`verify_backlog`
+ (meaning the entire queue is read back and verified). If
+ ``verify_backlog_batch`` is less than :option:`verify_backlog` then not all
+ blocks will be verified, if ``verify_backlog_batch`` is larger than
+ :option:`verify_backlog`, some blocks will be verified more than once.
+
+.. option:: verify_state_save=bool
+
+ When a job exits during the write phase of a verify workload, save its
+ current state. This allows fio to replay up until that point, if the verify
+ state is loaded for the verify read phase. The format of the filename is,
+ roughly::
+
+ <type>-<jobname>-<jobindex>-verify.state.
+
+ <type> is "local" for a local run, "sock" for a client/server socket
+ connection, and "ip" (192.168.0.1, for instance) for a networked
+ client/server connection. Defaults to true.
+
+.. option:: verify_state_load=bool
+
+ If a verify termination trigger was used, fio stores the current write state
+ of each thread. This can be used at verification time so that fio knows how
+ far it should verify. Without this information, fio will run a full
+ verification pass, according to the settings in the job file used. Default:
+ false.
+
+.. option:: experimental_verify=bool
+
+ Enable experimental verification. Standard verify records I/O metadata
+ for later use during the verification phase. Experimental verify
+ instead resets the file after the write phase and then replays I/Os for
+ the verification phase.
+
+.. option:: trim_percentage=int
+
+ Number of verify blocks to discard/trim.
+
+.. option:: trim_verify_zero=bool
+
+ Verify that trim/discarded blocks are returned as zeros.
+
+.. option:: trim_backlog=int
+
+ Trim after this number of blocks are written.
+
+.. option:: trim_backlog_batch=int
+
+ Trim this number of I/O blocks.
+
+Steady state
+~~~~~~~~~~~~
+
+.. option:: steadystate=str:float, ss=str:float
+
+ Define the criterion and limit for assessing steady state performance. The
+ first parameter designates the criterion whereas the second parameter sets
+ the threshold. When the criterion falls below the threshold for the
+ specified duration, the job will stop. For example, `iops_slope:0.1%` will
+ direct fio to terminate the job when the least squares regression slope
+ falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled
+ this will apply to all jobs in the group. Below is the list of available
+ steady state assessment criteria. All assessments are carried out using only
+ data from the rolling collection window. Threshold limits can be expressed
+ as a fixed value or as a percentage of the mean in the collection window.
+
+ When using this feature, most jobs should include the :option:`time_based`
+ and :option:`runtime` options or the :option:`loops` option so that fio does not
+ stop running after it has covered the full size of the specified file(s) or device(s).
+
+ **iops**
+ Collect IOPS data. Stop the job if all individual IOPS measurements
+ are within the specified limit of the mean IOPS (e.g., ``iops:2``
+ means that all individual IOPS values must be within 2 of the mean,
+ whereas ``iops:0.2%`` means that all individual IOPS values must be
+ within 0.2% of the mean IOPS to terminate the job).
+
+ **iops_slope**
+ Collect IOPS data and calculate the least squares regression
+ slope. Stop the job if the slope falls below the specified limit.
+
+ **bw**
+ Collect bandwidth data. Stop the job if all individual bandwidth
+ measurements are within the specified limit of the mean bandwidth.
+
+ **bw_slope**
+ Collect bandwidth data and calculate the least squares regression
+ slope. Stop the job if the slope falls below the specified limit.
+
+.. option:: steadystate_duration=time, ss_dur=time
+
+ A rolling window of this duration will be used to judge whether steady
+ state has been reached. Data will be collected every
+ :option:`ss_interval`. The default is 0 which disables steady state
+ detection. When the unit is omitted, the value is interpreted in
+ seconds.
+
+.. option:: steadystate_ramp_time=time, ss_ramp=time
+
+ Allow the job to run for the specified duration before beginning data
+ collection for checking the steady state job termination criterion. The
+ default is 0. When the unit is omitted, the value is interpreted in seconds.
+
+.. option:: steadystate_check_interval=time, ss_interval=time
+
+ The values during the rolling window will be collected with a period of
+ this value. If :option:`ss_interval` is 30s and :option:`ss_dur` is
+ 300s, 10 measurements will be taken. Default is 1s but that might not
+ converge, especially for slower devices, so set this accordingly. When
+ the unit is omitted, the value is interpreted in seconds.
+
+
+Measurements and reporting
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: per_job_logs=bool
+
+ If set to true, fio generates bw/clat/iops logs with per job unique
+ filenames. If set to false, jobs with identical names will share a log
+ filename. Note that when this option is set to false log files will be
+ opened in append mode and if log files already exist the previous
+ contents will not be overwritten. Default: true.
+
+.. option:: group_reporting
+
+ It may sometimes be interesting to display statistics for groups of jobs as
+ a whole instead of for each individual job. This is especially true if
+ :option:`numjobs` is used; looking at individual thread/process output
+ quickly becomes unwieldy. To see the final report per-group instead of
+ per-job, use :option:`group_reporting`. Jobs in a file will be part of the
+ same reporting group, unless separated by a :option:`stonewall`, or by
+ using :option:`new_group`.
+
+ NOTE: When :option:`group_reporting` is used along with `json` output,
+ there are certain per-job properties which can be different between jobs
+ but do not have a natural group-level equivalent. Examples include
+ `kb_base`, `unit_base`, `sig_figs`, `thread_number`, `pid`, and
+ `job_start`. For these properties, the values for the first job are
+ recorded for the group.
+
+.. option:: new_group
+
+ Start a new reporting group. See: :option:`group_reporting`. If not given,
+ all jobs in a file will be part of the same reporting group, unless
+ separated by a :option:`stonewall`.
+
+.. option:: stats=bool
+
+ By default, fio collects and shows final output results for all jobs
+ that run. If this option is set to 0, then fio will ignore this job in
+ the final stat output.
+
+.. option:: write_bw_log=str
+
+ If given, write a bandwidth log for this job. Can be used to store data of
+ the bandwidth of the jobs in their lifetime.
+
+ If no str argument is given, the default filename of
+ :file:`jobname_type.x.log` is used. Even when the argument is given, fio
+ will still append the type of log. So if one specifies::
+
+ write_bw_log=foo
+
+ The actual log name will be :file:`foo_bw.x.log` where `x` is the index
+ of the job (`1..N`, where `N` is the number of jobs). If
+ :option:`per_job_logs` is false, then the filename will not include the
+ `.x` job index.
+
+ The included :command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
+ text files into nice graphs. See `Log File Formats`_ for how data is
+ structured within the file.
+
+.. option:: write_lat_log=str
+
+ Same as :option:`write_bw_log`, except this option creates I/O
+ submission (e.g., :file:`name_slat.x.log`), completion (e.g.,
+ :file:`name_clat.x.log`), and total (e.g., :file:`name_lat.x.log`)
+ latency files instead. See :option:`write_bw_log` for details about
+ the filename format and `Log File Formats`_ for how data is structured
+ within the files.
+
+.. option:: write_hist_log=str
+
+ Same as :option:`write_bw_log` but writes an I/O completion latency
+ histogram file (e.g., :file:`name_hist.x.log`) instead. Note that this
+ file will be empty unless :option:`log_hist_msec` has also been set.
+ See :option:`write_bw_log` for details about the filename format and
+ `Log File Formats`_ for how data is structured within the file.
+
+.. option:: write_iops_log=str
+
+ Same as :option:`write_bw_log`, but writes an IOPS file (e.g.
+ :file:`name_iops.x.log`) instead. Because fio defaults to individual
+ I/O logging, the value entry in the IOPS log will be 1 unless windowed
+ logging (see :option:`log_avg_msec`) has been enabled. See
+ :option:`write_bw_log` for details about the filename format and `Log
+ File Formats`_ for how data is structured within the file.
+
+.. option:: log_entries=int
+
+ By default, fio will log an entry in the iops, latency, or bw log for
+ every I/O that completes. The initial number of I/O log entries is 1024.
+ When the log entries are all used, new log entries are dynamically
+ allocated. This dynamic log entry allocation may negatively impact
+ time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+ completion latency). This option allows specifying a larger initial
+ number of log entries to avoid run-time allocations of new log entries,
+ resulting in more precise time-related I/O statistics.
+ Also see :option:`log_avg_msec`. Defaults to 1024.
+
+.. option:: log_avg_msec=int
+
+ By default, fio will log an entry in the iops, latency, or bw log for
+ every I/O that completes. When writing to the disk log, that can
+ quickly grow to a very large size. Setting this option directs fio to
+ instead record an average over the specified duration for each log
+ entry, reducing the resolution of the log. When the job completes, fio
+ will flush any accumulated latency log data, so the final log interval
+ may not match the value specified by this option and there may even be
+ duplicate timestamps. See :option:`log_window_value` as well. Defaults
+ to 0, logging entries for each I/O. Also see `Log File Formats`_.
+
+.. option:: log_hist_msec=int
+
+ Same as :option:`log_avg_msec`, but logs entries for completion latency
+ histograms. Computing latency percentiles from averages of intervals using
+ :option:`log_avg_msec` is inaccurate. Setting this option makes fio log
+ histogram entries over the specified period of time, reducing log sizes for
+ high IOPS devices while retaining percentile accuracy. See
+ :option:`log_hist_coarseness` and :option:`write_hist_log` as well.
+ Defaults to 0, meaning histogram logging is disabled.
+
+.. option:: log_hist_coarseness=int
+
+ Integer ranging from 0 to 6, defining the coarseness of the resolution of
+ the histogram logs enabled with :option:`log_hist_msec`. For each increment
+ in coarseness, fio outputs half as many bins. Defaults to 0, for which
+ histogram logs contain 1216 latency bins. See :option:`write_hist_log`
+ and `Log File Formats`_.
+
+.. option:: log_window_value=str, log_max_value=str
+
+ If :option:`log_avg_msec` is set, fio by default logs the average over that
+ window. This option determines whether fio logs the average, maximum or
+ both the values over the window. This only affects the latency logging,
+ as both average and maximum values for iops or bw log will be the same.
+ Accepted values are:
+
+ **avg**
+ Log average value over the window. The default.
+
+ **max**
+ Log maximum value in the window.
+
+ **both**
+ Log both average and maximum value over the window.
+
+ **0**
+ Backward-compatible alias for **avg**.
+
+ **1**
+ Backward-compatible alias for **max**.
+
+.. option:: log_offset=bool
+
+ If this is set, the iolog options will include the byte offset for the I/O
+ entry as well as the other data values. Defaults to 0 meaning that
+ offsets are not present in logs. Also see `Log File Formats`_.
+
+.. option:: log_compression=int
+
+ If this is set, fio will compress the I/O logs as it goes, to keep the
+ memory footprint lower. When a log reaches the specified size, that chunk is
+ removed and compressed in the background. Given that I/O logs are fairly
+ highly compressible, this yields a nice memory savings for longer runs. The
+ downside is that the compression will consume some background CPU cycles, so
+ it may impact the run. This, however, is also true if the logging ends up
+ consuming most of the system memory. So pick your poison. The I/O logs are
+ saved normally at the end of a run, by decompressing the chunks and storing
+ them in the specified log file. This feature depends on the availability of
+ zlib.
+
+.. option:: log_compression_cpus=str
+
+ Define the set of CPUs that are allowed to handle online log compression for
+ the I/O jobs. This can provide better isolation between performance
+ sensitive jobs, and background compression work. See
+ :option:`cpus_allowed` for the format used.
+
+.. option:: log_store_compressed=bool
+
+ If set, fio will store the log files in a compressed format. They can be
+ decompressed with fio, using the :option:`--inflate-log` command line
+ parameter. The files will be stored with a :file:`.fz` suffix.
+
+.. option:: log_unix_epoch=bool
+
+ Backwards compatible alias for log_alternate_epoch.
+
+.. option:: log_alternate_epoch=bool
+
+ If set, fio will log timestamps based on the epoch used by the clock specified
+ in the log_alternate_epoch_clock_id option, to the log files produced by
+ enabling write_type_log for each log type, instead of the default zero-based
+ timestamps.
+
+.. option:: log_alternate_epoch_clock_id=int
+
+ Specifies the clock_id to be used by clock_gettime to obtain the alternate
+ epoch if log_alternate_epoch is true. Otherwise has no effect. Default
+ value is 0, or CLOCK_REALTIME.
+
+.. option:: block_error_percentiles=bool
+
+ If set, record errors in trim block-sized units from writes and trims and
+ output a histogram of how many trims it took to get to errors, and what kind
+ of error was encountered.
+
+.. option:: bwavgtime=int
+
+ Average the calculated bandwidth over the given time. Value is specified in
+ milliseconds. If the job also does bandwidth logging through
+ :option:`write_bw_log`, then the minimum of this option and
+ :option:`log_avg_msec` will be used. Default: 500ms.
+
+.. option:: iopsavgtime=int
+
+ Average the calculated IOPS over the given time. Value is specified in
+ milliseconds. If the job also does IOPS logging through
+ :option:`write_iops_log`, then the minimum of this option and
+ :option:`log_avg_msec` will be used. Default: 500ms.
+
+.. option:: disk_util=bool
+
+ Generate disk utilization statistics, if the platform supports it.
+ Default: true.
+
+.. option:: disable_lat=bool
+
+ Disable measurements of total latency numbers. Useful only for cutting back
+ the number of calls to :manpage:`gettimeofday(2)`, as that does impact
+ performance at really high IOPS rates. Note that to really get rid of a
+ large amount of these calls, this option must be used with
+ :option:`disable_slat` and :option:`disable_bw_measurement` as well.
+
+.. option:: disable_clat=bool
+
+ Disable measurements of completion latency numbers. See
+ :option:`disable_lat`.
+
+.. option:: disable_slat=bool
+
+ Disable measurements of submission latency numbers. See
+ :option:`disable_lat`.
+
+.. option:: disable_bw_measurement=bool, disable_bw=bool
+
+ Disable measurements of throughput/bandwidth numbers. See
+ :option:`disable_lat`.
+
+.. option:: slat_percentiles=bool
+
+ Report submission latency percentiles. Submission latency is not recorded
+ for synchronous ioengines.
+
+.. option:: clat_percentiles=bool
+
+ Report completion latency percentiles.
+
+.. option:: lat_percentiles=bool
+
+ Report total latency percentiles. Total latency is the sum of submission
+ latency and completion latency.
+
+.. option:: percentile_list=float_list
+
+ Overwrite the default list of percentiles for latencies and the block error
+ histogram. Each number is a floating point number in the range (0,100], and
+ the maximum length of the list is 20. Use ``:`` to separate the numbers. For
+ example, ``--percentile_list=99.5:99.9`` will cause fio to report the
+ latency durations below which 99.5% and 99.9% of the observed latencies fell,
+ respectively.
+
+.. option:: significant_figures=int
+
+ If using :option:`--output-format` of `normal`, set the significant
+ figures to this value. Higher values will yield more precise IOPS and
+ throughput units, while lower values will round. Requires a minimum
+ value of 1 and a maximum value of 10. Defaults to 4.
+
+
+Error handling
+~~~~~~~~~~~~~~
+
+.. option:: exitall_on_error
+
+ When one job finishes in error, terminate the rest. The default is to wait
+ for each job to finish.
+
+.. option:: continue_on_error=str
+
+ Normally fio will exit the job on the first observed failure. If this option
+ is set, fio will continue the job when there is a 'non-fatal error' (EIO or
+ EILSEQ) until the runtime is exceeded or the I/O size specified is
+ completed. If this option is used, there are two more stats that are
+ appended, the total error count and the first error. The error field given
+ in the stats is the first error that was hit during the run.
+
+ Note: a write error from the device may go unnoticed by fio when using
+ buffered IO, as the write() (or similar) system call merely dirties the
+ kernel pages, unless :option:`sync` or :option:`direct` is used. Device IO
+ errors occur when the dirty data is actually written out to disk. If fully
+ sync writes aren't desirable, :option:`fsync` or :option:`fdatasync` can be
+ used as well. This is specific to writes, as reads are always synchronous.
+
+ The allowed values are:
+
+ **none**
+ Exit on any I/O or verify errors.
+
+ **read**
+ Continue on read errors, exit on all others.
+
+ **write**
+ Continue on write errors, exit on all others.
+
+ **io**
+ Continue on any I/O error, exit on all others.
+
+ **verify**
+ Continue on verify errors, exit on all others.
+
+ **all**
+ Continue on all errors.
+
+ **0**
+ Backward-compatible alias for 'none'.
+
+ **1**
+ Backward-compatible alias for 'all'.
+
+.. option:: ignore_error=str
+
+ Sometimes you want to ignore some errors during a test. In that case you can
+ specify an error list for each error type, instead of only being able to
+ ignore the default 'non-fatal error' using :option:`continue_on_error`. The
+ format is ``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``, where
+ the errors for a given error type are separated with ':'. An error may be a
+ symbol ('ENOSPC', 'ENOMEM') or an integer. Example::
+
+ ignore_error=EAGAIN,ENOSPC:122
+
+ This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from
+ WRITE. This option works by overriding :option:`continue_on_error` with
+ the list of errors for each error type if any.
+
+.. option:: error_dump=bool
+
+ If set, dump every error even if it is non-fatal. True by default. If
+ disabled, only fatal errors will be dumped.
+
+Running predefined workloads
+----------------------------
+
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
+
+.. option:: profile=str
+
+ The predefined workload to run. Current profiles are:
+
+ **tiobench**
+ Threaded I/O bench (tiotest/tiobench) like workload.
+
+ **act**
+ Aerospike Certification Tool (ACT) like workload.
+
+To view a profile's additional options use :option:`--cmdhelp` after specifying
+the profile. For example::
+
+ $ fio --profile=act --cmdhelp
+
+Act profile options
+~~~~~~~~~~~~~~~~~~~
+
+.. option:: device-names=str
+ :noindex:
+
+ Devices to use.
+
+.. option:: load=int
+ :noindex:
+
+ ACT load multiplier. Default: 1.
+
+.. option:: test-duration=time
+ :noindex:
+
+ How long the entire test takes to run. When the unit is omitted, the value
+ is given in seconds. Default: 24h.
+
+.. option:: threads-per-queue=int
+ :noindex:
+
+ Number of read I/O threads per device. Default: 8.
+
+.. option:: read-req-num-512-blocks=int
+ :noindex:
+
+ Number of 512B blocks to read at a time. Default: 3.
+
+.. option:: large-block-op-kbytes=int
+ :noindex:
+
+ Size of large block ops in KiB (writes). Default: 131072.
+
+.. option:: prep
+ :noindex:
+
+ Set to run ACT prep phase.
+
+Tiobench profile options
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: size=str
+ :noindex:
+
+ Size in MiB.
+
+.. option:: block=int
+ :noindex:
+
+ Block size in bytes. Default: 4096.
+
+.. option:: numruns=int
+ :noindex:
+
+ Number of runs.
+
+.. option:: dir=str
+ :noindex:
+
+ Test directory.
+
+.. option:: threads=int
+ :noindex:
+
+ Number of threads.
+
+Interpreting the output
+-----------------------
+
+..
+ Example output was based on the following:
+ TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --time_based \
+ --rate=1256k --bs=14K --name=quick --runtime=1s --name=mixed \
+ --runtime=2m --rw=rw
+
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be::
+
+ Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+
+The characters inside the first set of square brackets denote the current status of
+each thread. The first character is the first job defined in the job file, and so
+forth. The possible values (in typical life cycle order) are:
+
++------+-----+-----------------------------------------------------------+
+| Idle | Run | |
++======+=====+===========================================================+
+| P | | Thread setup, but not started. |
++------+-----+-----------------------------------------------------------+
+| C | | Thread created. |
++------+-----+-----------------------------------------------------------+
+| I | | Thread initialized, waiting or generating necessary data. |
++------+-----+-----------------------------------------------------------+
+| | p | Thread running pre-reading file(s). |
++------+-----+-----------------------------------------------------------+
+| | / | Thread is in ramp period. |
++------+-----+-----------------------------------------------------------+
+| | R | Running, doing sequential reads. |
++------+-----+-----------------------------------------------------------+
+| | r | Running, doing random reads. |
++------+-----+-----------------------------------------------------------+
+| | W | Running, doing sequential writes. |
++------+-----+-----------------------------------------------------------+
+| | w | Running, doing random writes. |
++------+-----+-----------------------------------------------------------+
+| | M | Running, doing mixed sequential reads/writes. |
++------+-----+-----------------------------------------------------------+
+| | m | Running, doing mixed random reads/writes. |
++------+-----+-----------------------------------------------------------+
+| | D | Running, doing sequential trims. |
++------+-----+-----------------------------------------------------------+
+| | d | Running, doing random trims. |
++------+-----+-----------------------------------------------------------+
+| | F | Running, currently waiting for :manpage:`fsync(2)`. |
++------+-----+-----------------------------------------------------------+
+| | V | Running, doing verification of written data. |
++------+-----+-----------------------------------------------------------+
+| f | | Thread finishing. |
++------+-----+-----------------------------------------------------------+
+| E | | Thread exited, not reaped by main thread yet. |
++------+-----+-----------------------------------------------------------+
+| _ | | Thread reaped. |
++------+-----+-----------------------------------------------------------+
+| X | | Thread reaped, exited with an error. |
++------+-----+-----------------------------------------------------------+
+| K | | Thread reaped, exited due to signal. |
++------+-----+-----------------------------------------------------------+
+
+..
+ Example output was based on the following:
+ TZ=UTC fio --iodepth=8 --ioengine=null --size=100M --runtime=58m \
+ --time_based --rate=2512k --bs=256K --numjobs=10 \
+ --name=readers --rw=read --name=writers --rw=write
+
+Fio will condense the thread string so as not to take up more space on the command
+line than needed. For instance, if you have 10 readers and 10 writers running,
+the output would look like this::
+
+ Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+
+Note that the status string is displayed in order, so it's possible to tell which of
+the jobs are currently doing what. In the example above this means that jobs 1--10
+are readers and 11--20 are writers.
+
+The other values are fairly self-explanatory -- number of threads currently
+running and doing I/O, the number of currently open files (f=), the estimated
+completion percentage, the rate of I/O since last check (read speed listed first,
+then write speed and optionally trim speed) in terms of bandwidth and IOPS,
+and time to completion for the current running group. It's impossible to estimate
+runtime of the following groups (if any).
+
+..
+ Example output was based on the following:
+ TZ=UTC fio --iodepth=16 --ioengine=posixaio --filename=/tmp/fiofile \
+ --direct=1 --size=100M --time_based --runtime=50s --rate_iops=89 \
+ --bs=7K --name=Client1 --rw=write
+
+When fio is done (or interrupted by :kbd:`Ctrl-C`), it will show the data for
+each thread, group of threads, and disks in that order. For each overall thread (or
+group) the output looks like::
+
+ Client1: (groupid=0, jobs=1): err= 0: pid=16109: Sat Jun 24 12:07:54 2017
+ write: IOPS=88, BW=623KiB/s (638kB/s)(30.4MiB/50032msec)
+ slat (nsec): min=500, max=145500, avg=8318.00, stdev=4781.50
+ clat (usec): min=170, max=78367, avg=4019.02, stdev=8293.31
+ lat (usec): min=174, max=78375, avg=4027.34, stdev=8291.79
+ clat percentiles (usec):
+ | 1.00th=[ 302], 5.00th=[ 326], 10.00th=[ 343], 20.00th=[ 363],
+ | 30.00th=[ 392], 40.00th=[ 404], 50.00th=[ 416], 60.00th=[ 445],
+ | 70.00th=[ 816], 80.00th=[ 6718], 90.00th=[12911], 95.00th=[21627],
+ | 99.00th=[43779], 99.50th=[51643], 99.90th=[68682], 99.95th=[72877],
+ | 99.99th=[78119]
+ bw ( KiB/s): min= 532, max= 686, per=0.10%, avg=622.87, stdev=24.82, samples= 100
+ iops : min= 76, max= 98, avg=88.98, stdev= 3.54, samples= 100
+ lat (usec) : 250=0.04%, 500=64.11%, 750=4.81%, 1000=2.79%
+ lat (msec) : 2=4.16%, 4=1.84%, 10=4.90%, 20=11.33%, 50=5.37%
+ lat (msec) : 100=0.65%
+ cpu : usr=0.27%, sys=0.18%, ctx=12072, majf=0, minf=21
+ IO depths : 1=85.0%, 2=13.1%, 4=1.8%, 8=0.1%, 16=0.0%, 32=0.0%, >=64=0.0%
+ submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+ complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+ issued rwt: total=0,4450,0, short=0,0,0, dropped=0,0,0
+ latency : target=0, window=0, percentile=100.00%, depth=8
+
+The job name (or first job's name when using :option:`group_reporting`) is printed,
+along with the group id, count of jobs being aggregated, last error id seen (which
+is 0 when there are no errors), pid/tid of that thread and the time the job/group
+completed. Below are the I/O statistics for each data direction performed (showing
+writes in the example above). In the order listed, they denote:
+
+**read/write/trim**
+ The string before the colon shows the I/O direction the statistics
+ are for. **IOPS** is the average I/Os performed per second. **BW**
+ is the average bandwidth rate shown as: value in power of 2 format
+ (value in power of 10 format). The last two values show: (**total
+ I/O performed** in power of 2 format / **runtime** of that thread).
+
+**slat**
+ Submission latency (**min** being the minimum, **max** being the
+ maximum, **avg** being the average, **stdev** being the standard
+ deviation). This is the time from when fio initialized the I/O
+ to submission. For synchronous ioengines this includes the time
+ up until just before the ioengine's queue function is called.
+ For asynchronous ioengines this includes the time up through the
+ completion of the ioengine's queue function (and commit function
+ if it is defined). For sync I/O this row is not displayed as the
+ slat is negligible. This value can be in nanoseconds,
+ microseconds or milliseconds --- fio will choose the most
+ appropriate base and print that (in the example above
+ nanoseconds was the best scale). Note: in :option:`--minimal`
+ mode latencies are always expressed in microseconds.
+
+**clat**
+ Completion latency. Same names as slat, this denotes the time from
+ submission to completion of the I/O pieces. For sync I/O, this
+ represents the time from when the I/O was submitted to the
+ operating system to when it was completed. For asynchronous
+ ioengines this is the time from when the ioengine's queue (and
+ commit if available) functions were completed to when the I/O's
+ completion was reaped by fio.
+
+ For file and directory operation engines, **clat** denotes the time
+ to complete one file or directory operation.
+
+ **filecreate engine**: the time cost to create a new file
+
+ **filestat engine**: the time cost to look up an existing file
+
+ **filedelete engine**: the time cost to delete a file
+
+ **dircreate engine**: the time cost to create a new directory
+
+ **dirstat engine**: the time cost to look up an existing directory
+
+ **dirdelete engine**: the time cost to delete a directory
+
+**lat**
+ Total latency. Same names as slat and clat, this denotes the time from
+ when fio created the I/O unit to completion of the I/O operation.
+ It is the sum of submission and completion latency.
+
+**bw**
+ Bandwidth statistics based on measurements from discrete
+ intervals. Fio continuously monitors bytes transferred and I/O
+ operations completed. By default fio calculates bandwidth in
+ each half-second interval (see :option:`bwavgtime`) and reports
+ descriptive statistics for the measurements here. Same names as
+ the xlat stats, but also includes the number of samples taken
+ (**samples**) and an approximate percentage of total aggregate
+ bandwidth this thread received in its group (**per**). This
+ last value is only really useful if the threads in this group
+ are on the same disk, since they are then competing for disk
+ access.
+
+ For file and directory operation engines, **bw** is meaningless.
+
+**iops**
+ IOPS statistics based on measurements from discrete intervals.
+ For details see the description for bw above. See
+ :option:`iopsavgtime` to control the duration of the intervals.
+ Same values reported here as for bw except for percentage.
+
+ For file and directory operation engines, **iops** is the most
+ fundamental index to denote the performance.
+ It means how many files or directories can be operated per second.
+
+ **filecreate engine**: number of files that can be created per second
+
+ **filestat engine**: number of files that can be looked up per second
+
+ **filedelete engine**: number of files that can be deleted per second
+
+ **dircreate engine**: number of directories that can be created per second
+
+ **dirstat engine**: number of directories that can be looked up per second
+
+ **dirdelete engine**: number of directories that can be deleted per second
+
+**lat (nsec/usec/msec)**
+ The distribution of I/O completion latencies. This is the time from when
+ I/O leaves fio to when it gets completed. Unlike the separate
+ read/write/trim sections above, the data here and in the remaining
+ sections apply to all I/Os for the reporting group. 250=0.04% means that
+ 0.04% of the I/Os completed in under 250us. 500=64.11% means that 64.11%
+ of the I/Os required 250 to 499us for completion.
+
+**cpu**
+ CPU usage. User and system time, along with the number of context
+ switches this thread went through, usage of system and user time, and
+ finally the number of major and minor page faults. The CPU utilization
+ numbers are averages for the jobs in that reporting group, while the
+ context and fault counters are summed.
+
+**IO depths**
+ The distribution of I/O depths over the job lifetime. The numbers are
+ divided into powers of 2 and each entry covers depths from that value
+ up to those that are lower than the next entry -- e.g., 16= covers
+ depths from 16 to 31. Note that the range covered by a depth
+ distribution entry can be different to the range covered by the
+ equivalent submit/complete distribution entry.
+
+**IO submit**
+ How many pieces of I/O were submitted in a single submit call. Each
+ entry denotes that amount and below, until the previous entry -- e.g.,
+ 16=100% means that we submitted anywhere from 9 to 16 I/Os per submit
+ call. Note that the range covered by a submit distribution entry can
+ be different to the range covered by the equivalent depth distribution
+ entry.
+
+**IO complete**
+ Like the above submit number, but for completions instead.
+
+**IO issued rwt**
+ The number of read/write/trim requests issued, and how many of them were
+ short or dropped.
+
+**IO latency**
+ These values are for :option:`latency_target` and related options. When
+ these options are engaged, this section describes the I/O depth required
+ to meet the specified latency target.
+
+..
+ Example output was based on the following:
+ TZ=UTC fio --ioengine=null --iodepth=2 --size=100M --numjobs=2 \
+ --rate_process=poisson --io_limit=32M --name=read --bs=128k \
+ --rate=11M --name=write --rw=write --bs=2k --rate=700k
+
+After each client has been listed, the group statistics are printed. They
+will look like this::
+
+ Run status group 0 (all jobs):
+ READ: bw=20.9MiB/s (21.9MB/s), 10.4MiB/s-10.8MiB/s (10.9MB/s-11.3MB/s), io=64.0MiB (67.1MB), run=2973-3069msec
+ WRITE: bw=1231KiB/s (1261kB/s), 616KiB/s-621KiB/s (630kB/s-636kB/s), io=64.0MiB (67.1MB), run=52747-53223msec
+
+For each data direction it prints:
+
+**bw**
+ Aggregate bandwidth of threads in this group followed by the
+ minimum and maximum bandwidth of all the threads in this group.
+ Values outside of brackets are power-of-2 format and those
+ within are the equivalent value in a power-of-10 format.
+**io**
+ Aggregate I/O performed of all threads in this group. The
+ format is the same as bw.
+**run**
+ The smallest and longest runtimes of the threads in this group.
+
+And finally, the disk statistics are printed. This is Linux specific. They will look like this::
+
+ Disk stats (read/write):
+ sda: ios=16398/16511, sectors=32321/65472, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+
+Each value is printed for both reads and writes, with reads first. The
+numbers denote:
+
+**ios**
+ Number of I/Os performed by all groups.
+**sectors**
+ Amount of data transferred in units of 512 bytes for all groups.
+**merge**
+ Number of merges performed by the I/O scheduler.
+**ticks**
+ Number of ticks we kept the disk busy.
+**in_queue**
+ Total time spent in the disk queue.
+**util**
+ The disk utilization. A value of 100% means we kept the disk
+ busy constantly, 50% would be a disk idling half of the time.
+
+It is also possible to get fio to dump the current output while it is running,
+without terminating the job. To do that, send fio the **USR1** signal. You can
+also get regularly timed dumps by using the :option:`--status-interval`
+parameter, or by creating a file in :file:`/tmp` named
+:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the
+current output status.
+
+
+Terse output
+------------
+
+For scripted usage where you typically want to generate tables or graphs of the
+results, fio can output the results in a semicolon separated format. The format
+is one long line of values, such as::
+
+ 2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
+ A description of this job goes here.
+
+The job description (if provided) follows on a second line for terse v2.
+It appears on the same line for other terse versions.
+
+To enable terse output, use the :option:`--minimal` or
+:option:`--output-format`\=terse command line options. The
+first value is the version of the terse output format. If the output has to be
+changed for some reason, this number will be incremented by 1 to signify that
+change.
+
+Split up, the format is as follows (comments in brackets denote when a
+field was introduced or whether it's specific to some terse version):
+
+ ::
+
+ terse version, fio version [v3], jobname, groupid, error
+
+ READ status::
+
+ Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+ Submission latency: min, max, mean, stdev (usec)
+ Completion latency: min, max, mean, stdev (usec)
+ Completion latency percentiles: 20 fields (see below)
+ Total latency: min, max, mean, stdev (usec)
+ Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+ IOPS [v5]: min, max, mean, stdev, number of samples
+
+ WRITE status:
+
+ ::
+
+ Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+ Submission latency: min, max, mean, stdev (usec)
+ Completion latency: min, max, mean, stdev (usec)
+ Completion latency percentiles: 20 fields (see below)
+ Total latency: min, max, mean, stdev (usec)
+ Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev, number of samples [v5]
+ IOPS [v5]: min, max, mean, stdev, number of samples
+
+ TRIM status [all but version 3]:
+
+ Fields are similar to READ/WRITE status.
+
+ CPU usage::
+
+ user, system, context switches, major faults, minor faults
+
+ I/O depths::
+
+ <=1, 2, 4, 8, 16, 32, >=64
+
+ I/O latencies microseconds::
+
+ <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+
+ I/O latencies milliseconds::
+
+ <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
+
+ Disk utilization [v3]::
+
+ disk name, read ios, write ios, read merges, write merges, read ticks, write ticks,
+ time spent in queue, disk utilization percentage
+
+ Additional Info (dependent on continue_on_error, default off)::
+
+ total # errors, first error code
+
+ Additional Info (dependent on description being set)::
+
+ Text description
+
+Completion latency percentiles can be a grouping of up to 20 sets, so for the
+terse output fio writes all of them. Each field will look like this::
+
+ 1.00%=6112
+
+which is the Xth percentile, and the `usec` latency associated with it.
+
+For `Disk utilization`, all disks used by fio are shown. So for each disk there
+will be a disk utilization section.
+
+Below is a single line containing short names for each of the fields in the
+minimal output v3, separated by semicolons::
+
+ terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth_kb;read_iops;read_runtime_ms;read_slat_min_us;read_slat_max_us;read_slat_mean_us;read_slat_dev_us;read_clat_min_us;read_clat_max_us;read_clat_mean_us;read_clat_dev_us;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min_us;read_lat_max_us;read_lat_mean_us;read_lat_dev_us;read_bw_min_kb;read_bw_max_kb;read_bw_agg_pct;read_bw_mean_kb;read_bw_dev_kb;write_kb;write_bandwidth_kb;write_iops;write_runtime_ms;write_slat_min_us;write_slat_max_us;write_slat_mean_us;write_slat_dev_us;write_clat_min_us;write_clat_max_us;write_clat_mean_us;write_clat_dev_us;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min_us;write_lat_max_us;write_lat_mean_us;write_lat_dev_us;write_bw_min_kb;write_bw_max_kb;write_bw_agg_pct;write_bw_mean_kb;write_bw_dev_kb;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+
+In client/server mode terse output differs from what appears when jobs are run
+locally. Disk utilization data is omitted from the standard terse output and
+for v3 and later appears on its own separate line at the end of each terse
+reporting cycle.
+
+
+JSON output
+------------
+
+The `json` output format is intended to be both human readable and convenient
+for automated parsing. For the most part its sections mirror those of the
+`normal` output. The `runtime` value is reported in msec and the `bw` value is
+reported in 1024 bytes per second units.
+
+
+JSON+ output
+------------
+
+The `json+` output format is identical to the `json` output format except that it
+adds a full dump of the completion latency bins. Each `bins` object contains a
+set of (key, value) pairs where keys are latency durations and values count how
+many I/Os had completion latencies of the corresponding duration. For example,
+consider::
+
+ "bins" : { "87552" : 1, "89600" : 1, "94720" : 1, "96768" : 1, "97792" : 1, "99840" : 1, "100864" : 2, "103936" : 6, "104960" : 534, "105984" : 5995, "107008" : 7529, ... }
+
+This data indicates that one I/O required 87,552ns to complete, two I/Os required
+100,864ns to complete, and 7529 I/Os required 107,008ns to complete.
+
+Also included with fio is a Python script `fio_jsonplus_clat2csv` that takes
+json+ output and generates CSV-formatted latency data suitable for plotting.
+
+The latency durations actually represent the midpoints of latency intervals.
+For details refer to :file:`stat.h`.
+
+
+Trace file format
+-----------------
+
+There are three trace file formats that you can encounter. The oldest (v1) format is
+unsupported since version 1.20-rc3 (March 2008). It will still be described
+below in case you get an old trace and want to understand it.
+
+In any case the trace is a simple text file with a single action per line.
+
+
+Trace file format v1
+~~~~~~~~~~~~~~~~~~~~
+
+Each line represents a single I/O action in the following format::
+
+ rw, offset, length
+
+where `rw=0/1` for read/write, and the `offset` and `length` entries being in bytes.
+
+This format is not supported in fio versions >= 1.20-rc3.
+
+
+Trace file format v2
+~~~~~~~~~~~~~~~~~~~~
+
+The second version of the trace file format was added in fio version 1.17. It
+allows one to access more than one file per trace and has a bigger set of possible
+file actions.
+
+The first line of the trace file has to be::
+
+ fio version 2 iolog
+
+Following this can be lines in two different formats, which are described below.
+
+The file management format::
+
+ filename action
+
+The `filename` is given as an absolute path. The `action` can be one of these:
+
+**add**
+ Add the given `filename` to the trace.
+**open**
+ Open the file with the given `filename`. The `filename` has to have
+ been added with the **add** action before.
+**close**
+ Close the file with the given `filename`. The file has to have been
+ opened before.
+
+
+The file I/O action format::
+
+ filename action offset length
+
+The `filename` is given as an absolute path, and has to have been added and
+opened before it can be used with this format. The `offset` and `length` are
+given in bytes. The `action` can be one of these:
+
+**wait**
+ Wait for `offset` microseconds. Everything below 100 is discarded.
+ The time is relative to the previous `wait` statement. Note that
+ action `wait` is not allowed as of version 3, as the same behavior
+ can be achieved using timestamps.
+**read**
+ Read `length` bytes beginning from `offset`.
+**write**
+ Write `length` bytes beginning from `offset`.
+**sync**
+ :manpage:`fsync(2)` the file.
+**datasync**
+ :manpage:`fdatasync(2)` the file.
+**trim**
+ Trim the given file from the given `offset` for `length` bytes.
+
+
+Trace file format v3
+~~~~~~~~~~~~~~~~~~~~
+
+The third version of the trace file format was added in fio version 3.31. It
+forces each action to have a timestamp associated with it.
+
+The first line of the trace file has to be::
+
+ fio version 3 iolog
+
+Following this can be lines in two different formats, which are described below.
+
+The file management format::
+
+ timestamp filename action
+
+The file I/O action format::
+
+ timestamp filename action offset length
+
+The `timestamp` is relative to the beginning of the run (i.e., starts at 0). The
+`filename`, `action`, `offset` and `length` are identical to version 2, except
+that version 3 does not allow the `wait` action.
+
+
+I/O Replay - Merging Traces
+---------------------------
+
+Colocation is a common practice used to get the most out of a machine.
+Knowing which workloads play nicely with each other and which ones don't is
+a much harder task. While fio can replay workloads concurrently via multiple
+jobs, it leaves some variability up to the scheduler making results harder to
+reproduce. Merging is a way to make the order of events consistent.
+
+Merging is integrated into I/O replay and done when a
+:option:`merge_blktrace_file` is specified. The files passed to
+:option:`read_iolog` go through the merge process, which outputs a single merged
+file stored to the specified file. The output file is passed on as if it were the
+only file passed to :option:`read_iolog`. An example would look like::
+
+ $ fio --read_iolog="<file1>:<file2>" --merge_blktrace_file="<output_file>"
+
+Creating only the merged file can be done by passing the command line argument
+:option:`--merge-blktrace-only`.
+
+Scaling traces can be done to see the relative impact of any particular trace
+being slowed down or sped up. :option:`merge_blktrace_scalars` takes in a colon
+separated list of percentage scalars. It is index paired with the files passed
+to :option:`read_iolog`.
+
+With scaling, it may be desirable to match the running time of all traces.
+This can be done with :option:`merge_blktrace_iters`. It is index paired with
+:option:`read_iolog` just like :option:`merge_blktrace_scalars`.
+
+In an example, given two traces, A and B, each 60s long. If we want to see
+the impact of trace A issuing IOs twice as fast and repeat trace A over the
+runtime of trace B, the following can be done::
+
+ $ fio --read_iolog="<trace_a>:<trace_b>" --merge_blktrace_file="<output_file>" --merge_blktrace_scalars="50:100" --merge_blktrace_iters="2:1"
+
+This runs trace A at 2x the speed twice for approximately the same runtime as
+a single run of trace B.
+
+
+CPU idleness profiling
+----------------------
+
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches specifically to see whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the cpu.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. Mean and
+standard deviation of time to complete a unit of work are reported in the "unit work"
+section. Options can be chosen to report detailed percpu idleness or overall
+system idleness by aggregating percpu stats.
+
+
+Verification and triggers
+-------------------------
+
+Fio is usually run in one of two ways, when data verification is done. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
+
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
+
+A verification trigger consists of two things:
+
+1) Storing the write state of each job.
+2) Executing a trigger command.
+
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
+
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+:option:`--trigger-file`\= :file:`/tmp/trigger-file`, then it will continually
+check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
+command).
+
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
+
+Verification trigger example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's say we want to run a powercut test on the remote Linux machine 'server'.
+Our write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally::
+
+ server# fio --server
+
+and on the client, we'll fire off the workload::
+
+ localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
+
+We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute::
+
+ echo b > /proc/sysrq-trigger
+
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not **really** cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given hostname,
+ipmi-reboot. On localbox, we could then have run fio with a local trigger
+instead::
+
+ localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
+
+For this case, fio would wait for the server to send us the write state, then
+execute ``ipmi-reboot server`` when that happened.
+
+Loading verify state
+~~~~~~~~~~~~~~~~~~~~
+
+To load stored write state, a read verification job file must contain the
+:option:`verify_state_load` option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
+
+
+Log File Formats
+----------------
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+ *time* (`msec`), *value*, *data direction*, *block size* (`bytes`),
+ *offset* (`bytes`), *command priority*
+
+*Time* for the log entry is always in milliseconds. The *value* logged depends
+on the type of log, it will be one of the following:
+
+ **Latency log**
+ Value is latency in nsecs
+ **Bandwidth log**
+ Value is in KiB/sec
+ **IOPS log**
+ Value is IOPS
+
+*Data direction* is one of the following:
+
+ **0**
+ I/O is a READ
+ **1**
+ I/O is a WRITE
+ **2**
+ I/O is a TRIM
+
+The entry's *block size* is always in bytes. The *offset* is the position in bytes
+from the start of the file for that particular I/O. The logging of the offset can be
+toggled with :option:`log_offset`.
+
+*Command priority* is 0 for normal priority and 1 for high priority. This is controlled
+by the ioengine specific :option:`cmdprio_percentage`.
+
+Fio defaults to logging every individual I/O but when windowed logging is set
+through :option:`log_avg_msec`, either the average (by default), the maximum
+(:option:`log_window_value` is set to max) *value* seen over the specified period
+of time, or both the average *value* and maximum *value1* (:option:`log_window_value`
+is set to both) is recorded. The log file format when both the values are reported
+takes this form:
+
+ *time* (`msec`), *value*, *value1*, *data direction*, *block size* (`bytes`),
+ *offset* (`bytes`), *command priority*
+
+
+Each *data direction* seen within the window period will aggregate its values in a
+separate row. Further, when using windowed logging the *block size* and *offset*
+entries will always contain 0.
+
+
+Client/Server
+-------------
+
+Normally fio is invoked as a stand-alone application on the machine where the
+I/O workload should be generated. However, the backend and frontend of fio can
+be run separately i.e., the fio server can generate an I/O workload on the "Device
+Under Test" while being controlled by a client on another machine.
+
+Start the server on the machine which has access to the storage DUT::
+
+ $ fio --server=args
+
+where `args` defines what fio listens to. The arguments are of the form
+``type:hostname or IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
+v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
+*hostname* is either a hostname or IP address, and *port* is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+
+1) ``fio --server``
+
+ Start a fio server, listening on all interfaces on the default port (8765).
+
+2) ``fio --server=ip:hostname,4444``
+
+ Start a fio server, listening on IP belonging to hostname and on port 4444.
+
+3) ``fio --server=ip6:::1,4444``
+
+ Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+
+4) ``fio --server=,4444``
+
+ Start a fio server, listening on all interfaces on port 4444.
+
+5) ``fio --server=1.2.3.4``
+
+ Start a fio server, listening on IP 1.2.3.4 on the default port.
+
+6) ``fio --server=sock:/tmp/fio.sock``
+
+ Start a fio server, listening on the local socket :file:`/tmp/fio.sock`.
+
+Once a server is running, a "client" can connect to the fio server with::
+
+ fio <local-args> --client=<server> <remote-args> <job file(s)>
+
+where `local-args` are arguments for the client where it is running, `server`
+is the connect string, and `remote-args` and `job file(s)` are sent to the
+server. The `server` string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+
+Note that all job options must be defined in job files when running fio as a
+client. Any job options specified in `remote-args` will be ignored.
+
+Fio can connect to multiple servers this way::
+
+ fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
+
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using :option:`--remote-config` ::
+
+ fio --client=server --remote-config /path/to/file.fio
+
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+
+If you have many servers (example: 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the
+:option:`--client` option. For example, here is a :file:`host.list`
+file containing 2 hostnames::
+
+ host1.your.dns.domain
+ host2.your.dns.domain
+
+The fio command would then be::
+
+ fio --client=host.list <job file(s)>
+
+In this mode, you cannot input server-specific parameters or job files -- all
+servers receive the same job file.
+
+In order to let ``fio --client`` runs use a shared filesystem from multiple
+hosts, ``fio --client`` now prepends the IP address of the server to the
+filename. For example, if fio is using the directory :file:`/mnt/nfs/fio` and is
+writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile`
+containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files::
+
+ /mnt/nfs/fio/192.168.10.120.fileio.tmp
+ /mnt/nfs/fio/192.168.10.121.fileio.tmp
+
+Terse output in client/server mode will differ slightly from what is produced
+when fio is run in stand-alone mode. See the terse output section for details.
endif
DEBUGFLAGS = -DFIO_INC_DEBUG
-CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS)
+CPPFLAGS+= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS)
OPTFLAGS= -g -ffast-math
FIO_CFLAGS= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
LIBS += -lm $(EXTLIBS)
SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py tools/hist/fio-histo-log-pctiles.py tools/fio_jsonplus_clat2csv)
ifndef CONFIG_FIO_NO_OPT
- FIO_CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
+ FIO_CFLAGS += -O3
endif
ifdef CONFIG_BUILD_NATIVE
FIO_CFLAGS += -march=native
LDFLAGS += -fuse-ld=lld $(LINK_PDBFILE)
endif
+# If clang, do not use builtin stpcpy as it breaks the build
+ifeq ($(CC),clang)
+ FIO_CFLAGS += -fno-builtin-stpcpy
+endif
+
ifdef CONFIG_GFIO
PROGS += gfio
endif
pshared.c options.c \
smalloc.c filehash.c profile.c debug.c engines/cpu.c \
engines/mmap.c engines/sync.c engines/null.c engines/net.c \
- engines/ftruncate.c engines/filecreate.c engines/filestat.c \
+ engines/ftruncate.c engines/fileoperations.c \
+ engines/exec.c \
server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
gettime-thread.c helpers.c json.c idletime.c td_error.c \
profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
workqueue.c rate-submit.c optgroup.c helper_thread.c \
- steadystate.c zone-dist.c zbd.c
+ steadystate.c zone-dist.c zbd.c dedupe.c dataplacement.c
ifdef CONFIG_LIBHDFS
HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
- HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm
+ HDFSLIB= -Wl,-rpath $(JAVA_HOME)/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/lib/$(FIO_HDFS_CPU)/server $(FIO_LIBHDFS_LIB)/libhdfs.a -ljvm
FIO_CFLAGS += $(HDFSFLAGS)
SOURCE += engines/libhdfs.c
endif
ifdef CONFIG_LIBISCSI
- iscsi_SRCS = engines/libiscsi.c
- iscsi_LIBS = $(LIBISCSI_LIBS)
- iscsi_CFLAGS = $(LIBISCSI_CFLAGS)
- ENGINES += iscsi
+ libiscsi_SRCS = engines/libiscsi.c
+ libiscsi_LIBS = $(LIBISCSI_LIBS)
+ libiscsi_CFLAGS = $(LIBISCSI_CFLAGS)
+ ENGINES += libiscsi
endif
ifdef CONFIG_LIBNBD
ENGINES += nbd
endif
+ifdef CONFIG_LIBNFS
+ CFLAGS += $(LIBNFS_CFLAGS)
+ LIBS += $(LIBNFS_LIBS)
+ SOURCE += engines/nfs.c
+endif
+
ifdef CONFIG_64BIT
CPPFLAGS += -DBITS_PER_LONG=64
else ifdef CONFIG_32BIT
CPPFLAGS += -DBITS_PER_LONG=32
endif
ifdef CONFIG_LIBAIO
- aio_SRCS = engines/libaio.c
- aio_LIBS = -laio
- ifdef CONFIG_LIBAIO_URING
- aio_LIBS = -luring
- else
- aio_LIBS = -laio
- endif
- ENGINES += aio
+ libaio_SRCS = engines/libaio.c
+ cmdprio_SRCS = engines/cmdprio.c
+ LIBS += -laio
+ libaio_LIBS = -laio
+ ENGINES += libaio
endif
ifdef CONFIG_RDMA
rdma_SRCS = engines/rdma.c
rdma_LIBS = -libverbs -lrdmacm
ENGINES += rdma
endif
+ifdef CONFIG_LIBRPMA_APM
+ librpma_apm_SRCS = engines/librpma_apm.c
+ librpma_fio_SRCS = engines/librpma_fio.c
+ ifdef CONFIG_LIBPMEM2_INSTALLED
+ librpma_apm_LIBS = -lrpma -lpmem2
+ else
+ librpma_apm_LIBS = -lrpma -lpmem
+ endif
+ ENGINES += librpma_apm
+endif
+ifdef CONFIG_LIBRPMA_GPSPM
+ librpma_gpspm_SRCS = engines/librpma_gpspm.c engines/librpma_gpspm_flush.pb-c.c
+ librpma_fio_SRCS = engines/librpma_fio.c
+ ifdef CONFIG_LIBPMEM2_INSTALLED
+ librpma_gpspm_LIBS = -lrpma -lpmem2 -lprotobuf-c
+ else
+ librpma_gpspm_LIBS = -lrpma -lpmem -lprotobuf-c
+ endif
+ ENGINES += librpma_gpspm
+endif
+ifdef librpma_fio_SRCS
+ SOURCE += $(librpma_fio_SRCS)
+endif
ifdef CONFIG_POSIXAIO
SOURCE += engines/posixaio.c
endif
ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
SOURCE += engines/e4defrag.c
endif
+ifdef CONFIG_LIBCUFILE
+ SOURCE += engines/libcufile.c
+endif
ifdef CONFIG_LINUX_SPLICE
SOURCE += engines/splice.c
endif
http_LIBS = -lcurl -lssl -lcrypto
ENGINES += http
endif
+ifdef CONFIG_DFS
+ dfs_SRCS = engines/dfs.c
+ dfs_LIBS = -luuid -ldaos -ldfs
+ ENGINES += dfs
+endif
SOURCE += oslib/asprintf.c
ifndef CONFIG_STRSEP
SOURCE += oslib/strsep.c
SOURCE += oslib/libmtd.c
SOURCE += oslib/libmtd_legacy.c
endif
-ifdef CONFIG_PMEMBLK
- pmemblk_SRCS = engines/pmemblk.c
- pmemblk_LIBS = -lpmemblk
- ENGINES += pmemblk
-endif
ifdef CONFIG_LINUX_DEVDAX
dev-dax_SRCS = engines/dev-dax.c
dev-dax_LIBS = -lpmem
ENGINES += dev-dax
endif
ifdef CONFIG_LIBPMEM
- pmem_SRCS = engines/libpmem.c
- pmem_LIBS = -lpmem
- ENGINES += pmem
+ libpmem_SRCS = engines/libpmem.c
+ libpmem_LIBS = -lpmem
+ ENGINES += libpmem
endif
ifdef CONFIG_IME
SOURCE += engines/ime.c
endif
ifdef CONFIG_LIBZBC
- zbc_SRCS = engines/libzbc.c
- zbc_LIBS = -lzbc
- ENGINES += zbc
+ libzbc_SRCS = engines/libzbc.c
+ libzbc_LIBS = -lzbc
+ ENGINES += libzbc
+endif
+ifdef CONFIG_LIBXNVME
+ xnvme_SRCS = engines/xnvme.c
+ xnvme_LIBS = $(LIBXNVME_LIBS)
+ xnvme_CFLAGS = $(LIBXNVME_CFLAGS)
+ ENGINES += xnvme
+endif
+ifdef CONFIG_LIBBLKIO
+ libblkio_SRCS = engines/libblkio.c
+ libblkio_LIBS = $(LIBBLKIO_LIBS)
+ libblkio_CFLAGS = $(LIBBLKIO_CFLAGS)
+ ENGINES += libblkio
endif
-
ifeq ($(CONFIG_TARGET_OS), Linux)
SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
- oslib/linux-dev-lookup.c engines/io_uring.c
+ oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c
+ cmdprio_SRCS = engines/cmdprio.c
ifdef CONFIG_HAS_BLKZONED
SOURCE += oslib/linux-blkzoned.c
endif
endif
ifeq ($(CONFIG_TARGET_OS), Android)
SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c profiles/tiobench.c \
- oslib/linux-dev-lookup.c
+ oslib/linux-dev-lookup.c engines/io_uring.c engines/nvme.c \
+ engines/sg.c
+ cmdprio_SRCS = engines/cmdprio.c
ifdef CONFIG_HAS_BLKZONED
SOURCE += oslib/linux-blkzoned.c
endif
LIBS += -lpthread -ldl
endif
ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
- SOURCE += os/windows/cpu-affinity.c os/windows/posix.c
- WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o lib/hweight.o
+ SOURCE += os/windows/cpu-affinity.c os/windows/posix.c os/windows/dlls.c
+ WINDOWS_OBJS = os/windows/cpu-affinity.o os/windows/posix.o os/windows/dlls.o lib/hweight.o
LIBS += -lpthread -lpsapi -lws2_32 -lssp
FIO_CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format
endif
+ifdef cmdprio_SRCS
+ SOURCE += $(cmdprio_SRCS)
+endif
+
ifdef CONFIG_DYNAMIC_ENGINES
DYNAMIC_ENGS := $(ENGINES)
define engine_template =
$(1)_OBJS := $$($(1)_SRCS:.c=.o)
-$$($(1)_OBJS): FIO_CFLAGS += -fPIC $$($(1)_CFLAGS)
-ENGS_OBJS += engines/lib$(1).so
+$$($(1)_OBJS): CFLAGS := -fPIC $$($(1)_CFLAGS) $(CFLAGS)
+engines/fio-$(1).so: $$($(1)_OBJS)
+ $$(QUIET_LINK)$(CC) $(LDFLAGS) -shared -rdynamic -fPIC -Wl,-soname,fio-$(1).so.1 -o $$@ $$< $$($(1)_LIBS)
+ENGS_OBJS += engines/fio-$(1).so
endef
else # !CONFIG_DYNAMIC_ENGINES
define engine_template =
SOURCE += $$($(1)_SRCS)
LIBS += $$($(1)_LIBS)
-CFLAGS += $$($(1)_CFLAGS)
+override CFLAGS += $$($(1)_CFLAGS)
endef
endif
+FIO-VERSION-FILE: FORCE
+ @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN
+-include FIO-VERSION-FILE
+
+override CFLAGS := -DFIO_VERSION='"$(FIO_VERSION)"' $(FIO_CFLAGS) $(CFLAGS)
+
$(foreach eng,$(ENGINES),$(eval $(call engine_template,$(eng))))
OBJS := $(SOURCE:.c=.o)
T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
-T_IOU_RING_OBJS = t/io_uring.o
-T_IOU_RING_OBJS += t/arch.o
+T_IOU_RING_OBJS = t/io_uring.o lib/rand.o lib/pattern.o lib/strntol.o
T_IOU_RING_PROGS = t/io_uring
T_MEMLOCK_OBJS = t/memlock.o
T_TT_OBJS = t/time-test.o
T_TT_PROGS = t/time-test
+ifneq (,$(findstring -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION,$(CFLAGS)))
+T_FUZZ_OBJS = t/fuzz/fuzz_parseini.o
+T_FUZZ_OBJS += $(OBJS)
+ifdef CONFIG_ARITHMETIC
+T_FUZZ_OBJS += lex.yy.o y.tab.o
+endif
+# For proper fio code teardown, CFLAGS needs to include -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION.
+# If no fuzz driver is defined by the environment variable LIB_FUZZING_ENGINE, a simple one is used.
+# For instance, with clang as the compiler, address sanitizer, and libFuzzer as the fuzzing engine, you should define
+# export CFLAGS="-fsanitize=address,fuzzer-no-link -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION"
+# export LIB_FUZZING_ENGINE="-fsanitize=address"
+# export CC=clang
+# before running configure && make.
+# You can adapt this to different compilers, sanitizers, and fuzzing engines.
+ifndef LIB_FUZZING_ENGINE
+T_FUZZ_OBJS += t/fuzz/onefile.o
+endif
+T_FUZZ_PROGS = t/fuzz/fuzz_parseini
+else # CFLAGS does not include -DFUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+T_FUZZ_OBJS =
+T_FUZZ_PROGS =
+endif
+
T_OBJS = $(T_SMALLOC_OBJS)
T_OBJS += $(T_IEEE_OBJS)
T_OBJS += $(T_ZIPF_OBJS)
T_OBJS += $(T_MEMLOCK_OBJS)
T_OBJS += $(T_TT_OBJS)
T_OBJS += $(T_IOU_RING_OBJS)
+T_OBJS += $(T_FUZZ_OBJS)
ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
T_DEDUPE_OBJS += $(WINDOWS_OBJS)
T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
T_TEST_PROGS += $(T_GEN_RAND_PROGS)
T_PROGS += $(T_BTRACE_FIO_PROGS)
+ifdef CONFIG_ZLIB
T_PROGS += $(T_DEDUPE_PROGS)
+endif
T_PROGS += $(T_VS_PROGS)
T_TEST_PROGS += $(T_MEMLOCK_PROGS)
ifdef CONFIG_PREAD
ifneq (,$(findstring Linux,$(CONFIG_TARGET_OS)))
T_TEST_PROGS += $(T_IOU_RING_PROGS)
endif
+T_TEST_PROGS += $(T_FUZZ_PROGS)
PROGS += $(T_PROGS)
.PHONY: all install clean test
.PHONY: FORCE cscope
-FIO-VERSION-FILE: FORCE
- @$(SHELL) $(SRCDIR)/FIO-VERSION-GEN
--include FIO-VERSION-FILE
-
-override CFLAGS := -DFIO_VERSION='"$(FIO_VERSION)"' $(FIO_CFLAGS) $(CFLAGS)
-
%.o : %.c
@mkdir -p $(dir $@)
$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
$(QUIET_LEX)$(LEX) $<
endif
+ifneq (,$(findstring -Wimplicit-fallthrough,$(CFLAGS)))
+LEX_YY_CFLAGS := -Wno-implicit-fallthrough
+endif
+
+ifdef CONFIG_HAVE_NO_STRINGOP
+YTAB_YY_CFLAGS := -Wno-stringop-truncation
+endif
+
lex.yy.o: lex.yy.c y.tab.h
- $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+ $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) $(LEX_YY_CFLAGS) -c $<
y.tab.o: y.tab.c y.tab.h
- $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+ $(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) $(YTAB_YY_CFLAGS) -c $<
y.tab.c: exp/expression-parser.y
$(QUIET_YACC)$(YACC) -o $@ -l -d -b y $<
fio: $(FIO_OBJS)
$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB)
+t/fuzz/fuzz_parseini: $(T_FUZZ_OBJS)
+ifndef LIB_FUZZING_ENGINE
+ $(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_FUZZ_OBJS) $(LIBS) $(HDFSLIB)
+else
+ $(QUIET_LINK)$(CXX) $(LDFLAGS) -o $@ $(T_FUZZ_OBJS) $(LIB_FUZZING_ENGINE) $(LIBS) $(HDFSLIB)
+endif
+
gfio: $(GFIO_OBJS)
$(QUIET_LINK)$(CC) $(filter-out -static, $(LDFLAGS)) -o gfio $(GFIO_OBJS) $(LIBS) $(GFIO_LIBS) $(GTK_LDFLAGS) $(HDFSLIB)
$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
endif
+ifdef CONFIG_ZLIB
t/fio-dedupe: $(T_DEDUPE_OBJS)
$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
+endif
t/fio-verify-state: $(T_VS_OBJS)
$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
$(QUIET_LINK)$(CC) $(LDFLAGS) -o $@ $(UT_OBJS) $(UT_TARGET_OBJS) -lcunit $(LIBS)
endif
-ifdef CONFIG_DYNAMIC_ENGINES
-engines/lib$(1).so: $$($(1)_OBJS)
- $$(QUIET_LINK)$(CC) -shared -rdynamic -fPIC -Wl,-soname,lib$(1).so.1 $$($(1)_LIBS) -o $$@ $$<
-endif
-
clean: FORCE
- @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+ @rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(UT_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio unittests/unittest FIO-VERSION-FILE *.[do] lib/*.d oslib/*.[do] crc/*.d engines/*.[do] engines/*.so profiles/*.[do] t/*.[do] t/*/*.[do] unittests/*.[do] unittests/*/*.[do] config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
@rm -f t/fio-btrace2fio t/io_uring t/read-to-pipe-async
@rm -rf doc/output
fulltest:
sudo modprobe null_blk && \
if [ ! -e /usr/include/libzbc/zbc.h ]; then \
- git clone https://github.com/hgst/libzbc && \
+ git clone https://github.com/westerndigitalcorporation/libzbc && \
(cd libzbc && \
./autogen.sh && \
./configure --prefix=/usr && \
make -j && \
sudo make install) \
fi && \
- sudo t/zbd/run-tests-against-regular-nullb && \
+ sudo t/zbd/run-tests-against-nullb -s 1 && \
if [ -e /sys/module/null_blk/parameters/zoned ]; then \
- sudo t/zbd/run-tests-against-zoned-nullb; \
+ sudo t/zbd/run-tests-against-nullb -s 2; \
+ sudo t/zbd/run-tests-against-nullb -s 4; \
fi
install: $(PROGS) $(SCRIPTS) $(ENGS_OBJS) tools/plot/fio2gnuplot.1 FORCE
+++ /dev/null
-Overview and history
---------------------
-
-Fio was originally written to save me the hassle of writing special test case
-programs when I wanted to test a specific workload, either for performance
-reasons or to find/reproduce a bug. The process of writing such a test app can
-be tiresome, especially if you have to do it often. Hence I needed a tool that
-would be able to simulate a given I/O workload without resorting to writing a
-tailored test case again and again.
-
-A test work load is difficult to define, though. There can be any number of
-processes or threads involved, and they can each be using their own way of
-generating I/O. You could have someone dirtying large amounts of memory in an
-memory mapped file, or maybe several threads issuing reads using asynchronous
-I/O. fio needed to be flexible enough to simulate both of these cases, and many
-more.
-
-Fio spawns a number of threads or processes doing a particular type of I/O
-action as specified by the user. fio takes a number of global parameters, each
-inherited by the thread unless otherwise parameters given to them overriding
-that setting is given. The typical use of fio is to write a job file matching
-the I/O load one wants to simulate.
-
-
-Source
-------
-
-Fio resides in a git repo, the canonical place is:
-
- git://git.kernel.dk/fio.git
-
-When inside a corporate firewall, git:// URL sometimes does not work.
-If git:// does not work, use the http protocol instead:
-
- http://git.kernel.dk/fio.git
-
-Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git
-meta data as well. Other tarballs are archives of official fio releases.
-Snapshots can download from:
-
- http://brick.kernel.dk/snaps/
-
-There are also two official mirrors. Both of these are automatically synced with
-the main repository, when changes are pushed. If the main repo is down for some
-reason, either one of these is safe to use as a backup:
-
- git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
-
- https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
-
-or
-
- git://github.com/axboe/fio.git
-
- https://github.com/axboe/fio.git
-
-
-Mailing list
-------------
-
-The fio project mailing list is meant for anything related to fio including
-general discussion, bug reporting, questions, and development. For bug reporting,
-see REPORTING-BUGS.
-
-An automated mail detailing recent commits is automatically sent to the list at
-most daily. The list address is fio@vger.kernel.org, subscribe by sending an
-email to majordomo@vger.kernel.org with
-
- subscribe fio
-
-in the body of the email. Archives can be found here:
-
- http://www.spinics.net/lists/fio/
-
-and archives for the old list can be found here:
-
- http://maillist.kernel.dk/fio-devel/
-
-
-Author
-------
-
-Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
-the Linux I/O subsystem and schedulers. He got tired of writing specific test
-applications to simulate a given workload, and found that the existing I/O
-benchmark/test tools out there weren't flexible enough to do what he wanted.
-
-Jens Axboe <axboe@kernel.dk> 20060905
-
-
-Binary packages
----------------
-
-Debian:
- Starting with Debian "Squeeze", fio packages are part of the official
- Debian repository. http://packages.debian.org/search?keywords=fio .
-
-Ubuntu:
- Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
- of the Ubuntu "universe" repository.
- http://packages.ubuntu.com/search?keywords=fio .
-
-Red Hat, Fedora, CentOS & Co:
- Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
- packages are part of the Fedora/EPEL repositories.
- https://apps.fedoraproject.org/packages/fio .
-
-Mandriva:
- Mandriva has integrated fio into their package repository, so installing
- on that distro should be as easy as typing ``urpmi fio``.
-
-Arch Linux:
- An Arch Linux package is provided under the Community sub-repository:
- https://www.archlinux.org/packages/?sort=&q=fio
-
-Solaris:
- Packages for Solaris are available from OpenCSW. Install their pkgutil
- tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
- ``pkgutil -i fio``.
-
-Windows:
- Rebecca Cran <rebecca@bsdio.com> has fio packages for Windows at
- https://bsdio.com/fio/ . The latest builds for Windows can also
- be grabbed from https://ci.appveyor.com/project/axboe/fio by clicking
- the latest x86 or x64 build, then selecting the ARTIFACTS tab.
-
-BSDs:
- Packages for BSDs may be available from their binary package repositories.
- Look for a package "fio" using their binary package managers.
-
-
-Building
---------
-
-Just type::
-
- $ ./configure
- $ make
- $ make install
-
-Note that GNU make is required. On BSDs it's available from devel/gmake within
-ports directory; on Solaris it's in the SUNWgmake package. On platforms where
-GNU make isn't the default, type ``gmake`` instead of ``make``.
-
-Configure will print the enabled options. Note that on Linux based platforms,
-the libaio development packages must be installed to use the libaio
-engine. Depending on distro, it is usually called libaio-devel or libaio-dev.
-
-For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
-to be installed. gfio isn't built automatically and can be enabled with a
-``--enable-gfio`` option to configure.
-
-To build fio with a cross-compiler::
-
- $ make clean
- $ make CROSS_COMPILE=/path/to/toolchain/prefix
-
-Configure will attempt to determine the target platform automatically.
-
-It's possible to build fio for ESX as well, use the ``--esx`` switch to
-configure.
-
-
-Windows
-~~~~~~~
-
-On Windows, Cygwin (https://www.cygwin.com/) is required in order to build
-fio. To create an MSI installer package install WiX from
-https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
-directory.
-
-How to compile fio on 64-bit Windows:
-
- 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
- packages starting with **mingw64-x86_64**. Ensure
- **mingw64-x86_64-zlib** are installed if you wish
- to enable fio's log compression functionality.
- 2. Open the Cygwin Terminal.
- 3. Go to the fio directory (source files).
- 4. Run ``make clean && make -j``.
-
-To build fio for 32-bit Windows, ensure the -i686 versions of the previously
-mentioned -x86_64 packages are installed and run ``./configure
---build-32bit-win`` before ``make``. To build an fio that supports versions of
-Windows below Windows 7/Windows Server 2008 R2 also add ``--target-win-ver=xp``
-to the end of the configure line that you run before doing ``make``.
-
-It's recommended that once built or installed, fio be run in a Command Prompt or
-other 'native' console such as console2, since there are known to be display and
-signal issues when running it under a Cygwin shell (see
-https://github.com/mintty/mintty/issues/56 and
-https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
-for details).
-
-
-Documentation
-~~~~~~~~~~~~~
-
-Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
-To build HTML formatted documentation run ``make -C doc html`` and direct your
-browser to :file:`./doc/output/html/index.html`. To build manual page run
-``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other
-output formats are supported run ``make -C doc help``.
-
-.. _reStructuredText: http://www.sphinx-doc.org/rest.html
-.. _Sphinx: http://www.sphinx-doc.org
-
-
-Platforms
----------
-
-Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
-Windows, FreeBSD, and DragonFly. Some features and/or options may only be
-available on some of the platforms, typically because those features only apply
-to that platform (like the solarisaio engine, or the splice engine on Linux).
-
-Some features are not available on FreeBSD/Solaris even if they could be
-implemented, I'd be happy to take patches for that. An example of that is disk
-utility statistics and (I think) huge page support, support for that does exist
-in FreeBSD/Solaris.
-
-Fio uses pthread mutexes for signalling and locking and some platforms do not
-support process shared pthread mutexes. As a result, on such platforms only
-threads are supported. This could be fixed with sysv ipc locking or other
-locking alternatives.
-
-Other \*BSD platforms are untested, but fio should work there almost out of the
-box. Since I don't do test runs or even compiles on those platforms, your
-mileage may vary. Sending me patches for other platforms is greatly
-appreciated. There's a lot of value in having the same test/benchmark tool
-available on all platforms.
-
-Note that POSIX aio is not enabled by default on AIX. Messages like these::
-
- Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
- Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
-
-indicate one needs to enable POSIX aio. Run the following commands as root::
-
- # lsdev -C -l posix_aio0
- posix_aio0 Defined Posix Asynchronous I/O
- # cfgmgr -l posix_aio0
- # lsdev -C -l posix_aio0
- posix_aio0 Available Posix Asynchronous I/O
-
-POSIX aio should work now. To make the change permanent::
-
- # chdev -l posix_aio0 -P -a autoconfig='available'
- posix_aio0 changed
-
-
-Running fio
------------
-
-Running fio is normally the easiest part - you just give it the job file
-(or job files) as parameters::
-
- $ fio [options] [jobfile] ...
-
-and it will start doing what the *jobfile* tells it to do. You can give more
-than one job file on the command line, fio will serialize the running of those
-files. Internally that is the same as using the :option:`stonewall` parameter
-described in the parameter section.
-
-If the job file contains only one job, you may as well just give the parameters
-on the command line. The command line parameters are identical to the job
-parameters, with a few extra that control global parameters. For example, for
-the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
-option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
-<iodepth>`. You can also use the command line for giving more than one job
-entry. For each :option:`--name <name>` option that fio sees, it will start a
-new job with that name. Command line entries following a
-:option:`--name <name>` entry will apply to that job, until there are no more
-entries or a new :option:`--name <name>` entry is seen. This is similar to the
-job file options, where each option applies to the current job until a new []
-job entry is seen.
-
-fio does not need to run as root, except if the files or devices specified in
-the job section requires that. Some other options may also be restricted, such
-as memory locking, I/O scheduler switching, and decreasing the nice value.
-
-If *jobfile* is specified as ``-``, the job file will be read from standard
-input.
--- /dev/null
+Overview and history
+--------------------
+
+Fio was originally written to save me the hassle of writing special test case
+programs when I wanted to test a specific workload, either for performance
+reasons or to find/reproduce a bug. The process of writing such a test app can
+be tiresome, especially if you have to do it often. Hence I needed a tool that
+would be able to simulate a given I/O workload without resorting to writing a
+tailored test case again and again.
+
+A test workload is difficult to define, though. There can be any number of
+processes or threads involved, and they can each be using their own way of
+generating I/O. You could have someone dirtying large amounts of memory in a
+memory mapped file, or maybe several threads issuing reads using asynchronous
+I/O. fio needed to be flexible enough to simulate both of these cases, and many
+more.
+
+Fio spawns a number of threads or processes doing a particular type of I/O
+action as specified by the user. fio takes a number of global parameters, each
+inherited by the thread unless parameters given to that thread override the
+setting. The typical use of fio is to write a job file matching
+the I/O load one wants to simulate.
+
+
+Source
+------
+
+Fio resides in a git repo, the canonical place is:
+
+ https://git.kernel.dk/cgit/fio/
+
+Snapshots are frequently generated and :file:`fio-git-*.tar.gz` include the git
+meta data as well. Other tarballs are archives of official fio releases.
+Snapshots can be downloaded from:
+
+ https://brick.kernel.dk/snaps/
+
+There are also two official mirrors. Both of these are automatically synced with
+the main repository, when changes are pushed. If the main repo is down for some
+reason, either one of these is safe to use as a backup:
+
+ https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+ https://github.com/axboe/fio.git
+
+
+Mailing list
+------------
+
+The fio project mailing list is meant for anything related to fio including
+general discussion, bug reporting, questions, and development. For bug reporting,
+see REPORTING-BUGS.
+
+An automated mail detailing recent commits is automatically sent to the list at
+most daily. The list address is fio@vger.kernel.org, subscribe by sending an
+email to majordomo@vger.kernel.org with
+
+ subscribe fio
+
+in the body of the email. Archives can be found here:
+
+ https://www.spinics.net/lists/fio/
+
+or here:
+
+ https://lore.kernel.org/fio/
+
+and archives for the old list can be found here:
+
+ http://maillist.kernel.dk/fio-devel/
+
+
+Author
+------
+
+Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
+the Linux I/O subsystem and schedulers. He got tired of writing specific test
+applications to simulate a given workload, and found that the existing I/O
+benchmark/test tools out there weren't flexible enough to do what he wanted.
+
+Jens Axboe <axboe@kernel.dk> 20060905
+
+
+Maintainers
+-----------
+
+Fio is maintained by Jens Axboe <axboe@kernel.dk> and
+Vincent Fu <vincentfu@gmail.com> - however, for reporting bugs please use
+the fio reflector or the GitHub page rather than emailing either of them
+directly. By using the public resources, others will be able to learn from
+the responses too. Chances are also good that other members will be able to
+help with your inquiry as well.
+
+
+Binary packages
+---------------
+
+Debian:
+ Starting with Debian "Squeeze", fio packages are part of the official
+ Debian repository. https://packages.debian.org/search?keywords=fio .
+
+Ubuntu:
+ Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
+ of the Ubuntu "universe" repository.
+ https://packages.ubuntu.com/search?keywords=fio .
+
+Red Hat, Fedora, CentOS & Co:
+ Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
+ packages are part of the Fedora/EPEL repositories.
+ https://packages.fedoraproject.org/pkgs/fio/ .
+
+Mandriva:
+ Mandriva has integrated fio into their package repository, so installing
+ on that distro should be as easy as typing ``urpmi fio``.
+
+Arch Linux:
+ An Arch Linux package is provided under the Community sub-repository:
+ https://www.archlinux.org/packages/?sort=&q=fio
+
+Solaris:
+ Packages for Solaris are available from OpenCSW. Install their pkgutil
+ tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
+ ``pkgutil -i fio``.
+
+Windows:
+ Beginning with fio 3.31 Windows installers for tagged releases are
+ available on GitHub at https://github.com/axboe/fio/releases. The
+ latest installers for Windows can also be obtained as GitHub Actions
+ artifacts by selecting a build from
+ https://github.com/axboe/fio/actions. These require logging in to a
+ GitHub account.
+
+BSDs:
+ Packages for BSDs may be available from their binary package repositories.
+ Look for a package "fio" using their binary package managers.
+
+
+Building
+--------
+
+Just type::
+
+ $ ./configure
+ $ make
+ $ make install
+
+Note that GNU make is required. On BSDs it's available from devel/gmake within
+ports directory; on Solaris it's in the SUNWgmake package. On platforms where
+GNU make isn't the default, type ``gmake`` instead of ``make``.
+
+Configure will print the enabled options. Note that on Linux based platforms,
+the libaio development packages must be installed to use the libaio
+engine. Depending on the distro, it is usually called libaio-devel or libaio-dev.
+
+For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
+to be installed. gfio isn't built automatically and can be enabled with a
+``--enable-gfio`` option to configure.
+
+To build fio with a cross-compiler::
+
+ $ make clean
+ $ make CROSS_COMPILE=/path/to/toolchain/prefix
+
+Configure will attempt to determine the target platform automatically.
+
+It's possible to build fio for ESX as well, use the ``--esx`` switch to
+configure.
+
+
+Windows
+~~~~~~~
+
+The minimum versions of Windows for building/running fio are Windows 7/Windows
+Server 2008 R2. On Windows, Cygwin (https://www.cygwin.com/) is required in
+order to build fio. To create an MSI installer package install WiX from
+https://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
+directory.
+
+How to compile fio on 64-bit Windows:
+
+ 1. Install Cygwin (https://www.cygwin.com/). Install **make** and all
+ packages starting with **mingw64-x86_64**. Ensure
+ **mingw64-x86_64-zlib** is installed if you wish
+ to enable fio's log compression functionality.
+ 2. Open the Cygwin Terminal.
+ 3. Go to the fio directory (source files).
+ 4. Run ``make clean && make -j``.
+
+To build fio for 32-bit Windows, ensure the -i686 versions of the previously
+mentioned -x86_64 packages are installed and run ``./configure
+--build-32bit-win`` before ``make``.
+
+It's recommended that once built or installed, fio be run in a Command Prompt or
+other 'native' console such as console2, since there are known to be display and
+signal issues when running it under a Cygwin shell (see
+https://github.com/mintty/mintty/issues/56 and
+https://github.com/mintty/mintty/wiki/Tips#inputoutput-interaction-with-alien-programs
+for details).
+
+
+Documentation
+~~~~~~~~~~~~~
+
+Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
+To build HTML formatted documentation run ``make -C doc html`` and direct your
+browser to :file:`./doc/output/html/index.html`. To build the manual page run
+``make -C doc man`` and then ``man doc/output/man/fio.1``. To see what other
+output formats are supported run ``make -C doc help``.
+
+.. _reStructuredText: https://www.sphinx-doc.org/rest.html
+.. _Sphinx: https://www.sphinx-doc.org
+
+
+Platforms
+---------
+
+Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
+Windows, FreeBSD, and DragonFly. Some features and/or options may only be
+available on some of the platforms, typically because those features only apply
+to that platform (like the solarisaio engine, or the splice engine on Linux).
+
+Some features are not available on FreeBSD/Solaris even if they could be
+implemented; I'd be happy to take patches for that. An example of that is disk
+utility statistics and (I think) huge page support; support for that does exist
+in FreeBSD/Solaris.
+
+Fio uses pthread mutexes for signaling and locking and some platforms do not
+support process shared pthread mutexes. As a result, on such platforms only
+threads are supported. This could be fixed with sysv ipc locking or other
+locking alternatives.
+
+Other \*BSD platforms are untested, but fio should work there almost out of the
+box. Since I don't do test runs or even compiles on those platforms, your
+mileage may vary. Sending me patches for other platforms is greatly
+appreciated. There's a lot of value in having the same test/benchmark tool
+available on all platforms.
+
+Note that POSIX aio is not enabled by default on AIX. Messages like these::
+
+ Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
+ Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
+
+indicate one needs to enable POSIX aio. Run the following commands as root::
+
+ # lsdev -C -l posix_aio0
+ posix_aio0 Defined Posix Asynchronous I/O
+ # cfgmgr -l posix_aio0
+ # lsdev -C -l posix_aio0
+ posix_aio0 Available Posix Asynchronous I/O
+
+POSIX aio should work now. To make the change permanent::
+
+ # chdev -l posix_aio0 -P -a autoconfig='available'
+ posix_aio0 changed
+
+
+Running fio
+-----------
+
+Running fio is normally the easiest part - you just give it the job file
+(or job files) as parameters::
+
+ $ fio [options] [jobfile] ...
+
+and it will start doing what the *jobfile* tells it to do. You can give more
+than one job file on the command line; fio will serialize the running of those
+files. Internally that is the same as using the :option:`stonewall` parameter
+described in the parameter section.
+
+If the job file contains only one job, you may as well just give the parameters
+on the command line. The command line parameters are identical to the job
+parameters, with a few extra that control global parameters. For example, for
+the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
+option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
+<iodepth>`. You can also use the command line for giving more than one job
+entry. For each :option:`--name <name>` option that fio sees, it will start a
+new job with that name. Command line entries following a
+:option:`--name <name>` entry will apply to that job, until there are no more
+entries or a new :option:`--name <name>` entry is seen. This is similar to the
+job file options, where each option applies to the current job until a new []
+job entry is seen.
+
+fio does not need to run as root, except if the files or devices specified in
+the job section require that. Some other options may also be restricted, such
+as memory locking, I/O scheduler switching, and decreasing the nice value.
+
+If *jobfile* is specified as ``-``, the job file will be read from standard
+input.
Reporting a bug
---------------
-If you notice anything that seems like a fio bug, please do send email
-to the list (fio@vger.kernel.org, see README) about it. If you are not
-running the newest release of fio, upgrading first is recommended.
+...via the mailing list
+=======================
+
+If you notice anything that seems like a fio bug or want to ask fio related
+questions, please send a plain-text only email to the list
+(fio@vger.kernel.org, see README) about it. If you are not running the newest
+release of fio please upgrade first.
When reporting a bug, you'll need to include:
1) A description of what you think the bug is
-2) Environment (Linux distro version, kernel version). This is mostly
+2) Environment (e.g. Linux distro version, kernel version). This is mostly
needed if it's a build bug.
-3) The output from fio --version.
+3) The output from fio --version .
4) How to reproduce. Please include a full list of the parameters
passed to fio and the job file used (if any).
of getting to the bottom of the issue, and an eventual fix.
That's it!
+
+...via GitHub issues
+====================
+
+Please create an issue in the GitHub issue tracker
+(https://github.com/axboe/fio/issues ) but observe the following:
+
+a) If you are asking a question on how to do something ("How do I/Why is?")
+ please send it to the mailing list and not GitHub issues. The fio project
+ uses GitHub issues for reproducible bugs/enhancement requests.
+b) Please reproduce your bug using the latest fio listed on
+ https://github.com/axboe/fio/releases (see the Source and Building sections
+ of the README for how to build fio from source).
+c) Include all of the information requested in the mailing list section above
+ (description, environment, version, reproduction steps and all job parameters).
+
+Thanks!
Known issues/TODO (for steady-state)
-- Allow user to specify the frequency of measurements
+- Replace the test script with a better one
+ - Add test cases for the new check_interval option
+ - Parse debug=steadystate output to check calculations
+
+- Instead of calculating `intervals` every time, calculate it once and stash it
+ somewhere
+
+- Add the time unit to the ss_dur and check_interval variable names to reduce
+ possible confusion
- Better documentation for output
#define ARCH_HAVE_FFZ
+#define isb() asm volatile("isb" : : : "memory")
+
+/*
+ * Return the current value of the cntvct_el0 counter register.
+ *
+ * isb() is executed before the read, presumably so the counter is not
+ * sampled ahead of earlier instructions. The value is read into an
+ * unsigned long and returned as unsigned long long.
+ */
+static inline unsigned long long get_cpu_clock(void)
+{
+ unsigned long val;
+
+ isb();
+ asm volatile("mrs %0, cntvct_el0" : "=r" (val));
+ return val;
+}
+#define ARCH_HAVE_CPU_CLOCK
+
+#define ARCH_HAVE_INIT
+extern bool tsc_reliable;
+/*
+ * Architecture init hook: unconditionally sets tsc_reliable to true.
+ * envp is not used. Always returns 0.
+ */
+static inline int arch_init(char *envp[])
+{
+ tsc_reliable = true;
+ return 0;
+}
+
+/*
+ * Common tail of the __do_syscallX() macros: executes "svc 0" with the
+ * caller-supplied input constraints and evaluates to the x0 result cast to
+ * long. It expects a register variable named x0 to be declared in the
+ * enclosing scope, which is used as the sole output operand. "memory" and
+ * "cc" are listed as clobbered.
+ */
+#define __do_syscallN(...) ({ \
+ __asm__ volatile ( \
+ "svc 0" \
+ : "=r"(x0) \
+ : __VA_ARGS__ \
+ : "memory", "cc"); \
+ (long) x0; \
+})
+
+/*
+ * __do_syscall0() .. __do_syscall6(): raw syscall wrappers taking the
+ * syscall number plus zero to six arguments.
+ *
+ * Each macro pins the syscall number to x8 and the arguments to x0..x5 with
+ * explicit-register variables, then hands the constraints to
+ * __do_syscallN(). The "0" matching constraint ties the first argument to
+ * operand 0 (the x0 output), so the result is returned in the register that
+ * carried the first argument. __do_syscall0() has no argument and therefore
+ * declares x0 without an initializer, using it only as the output.
+ */
+#define __do_syscall0(__n) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register long x0 __asm__("x0"); \
+ \
+ __do_syscallN("r" (x8)); \
+})
+
+#define __do_syscall1(__n, __a) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ \
+ __do_syscallN("r" (x8), "0" (x0)); \
+})
+
+#define __do_syscall2(__n, __a, __b) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1)); \
+})
+
+#define __do_syscall3(__n, __a, __b, __c) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2)); \
+})
+
+#define __do_syscall4(__n, __a, __b, __c, __d) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3));\
+})
+
+#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ register __typeof__(__e) x4 __asm__("x4") = __e; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \
+ "r"(x4)); \
+})
+
+#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \
+ register long x8 __asm__("x8") = __n; \
+ register __typeof__(__a) x0 __asm__("x0") = __a; \
+ register __typeof__(__b) x1 __asm__("x1") = __b; \
+ register __typeof__(__c) x2 __asm__("x2") = __c; \
+ register __typeof__(__d) x3 __asm__("x3") = __d; \
+ register __typeof__(__e) x4 __asm__("x4") = __e; \
+ register __typeof__(__f) x5 __asm__("x5") = __f; \
+ \
+ __do_syscallN("r" (x8), "0" (x0), "r" (x1), "r" (x2), "r" (x3), \
+ "r" (x4), "r"(x5)); \
+})
+
+#define FIO_ARCH_HAS_SYSCALL
+
#endif
--- /dev/null
+#ifndef ARCH_LOONGARCH64_H
+#define ARCH_LOONGARCH64_H
+
+#define FIO_ARCH (arch_loongarch64)
+
+/*
+ * Memory barriers for LoongArch64. read_barrier(), write_barrier() and nop
+ * all expand to the same "dbar 0" instruction with a "memory" clobber.
+ * NOTE(review): nop is therefore a barrier rather than a plain no-op
+ * instruction - confirm this is intentional.
+ */
+#define read_barrier() __asm__ __volatile__("dbar 0": : :"memory")
+#define write_barrier() __asm__ __volatile__("dbar 0": : :"memory")
+#define nop __asm__ __volatile__("dbar 0": : :"memory")
+
+#endif
--- /dev/null
+#ifndef ARCH_RISCV64_H
+#define ARCH_RISCV64_H
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define FIO_ARCH (arch_riscv64)
+
+/*
+ * nop emits a single "nop" instruction (no clobbers declared).
+ * read_barrier() emits "fence r, r" and write_barrier() emits "fence w, w";
+ * both declare a "memory" clobber so the compiler does not move memory
+ * accesses across them.
+ */
+#define nop __asm__ __volatile__ ("nop")
+#define read_barrier() __asm__ __volatile__("fence r, r": : :"memory")
+#define write_barrier() __asm__ __volatile__("fence w, w": : :"memory")
+
+/*
+ * Return the value produced by the "rdtime" instruction.
+ *
+ * The value is read into an unsigned long and returned as unsigned long
+ * long. No barrier is issued before the read.
+ */
+static inline unsigned long long get_cpu_clock(void)
+{
+ unsigned long val;
+
+ asm volatile("rdtime %0" : "=r"(val));
+ return val;
+}
+#define ARCH_HAVE_CPU_CLOCK
+
+#define ARCH_HAVE_INIT
+extern bool tsc_reliable;
+/*
+ * Architecture init hook: unconditionally sets tsc_reliable to true.
+ * envp is not used. Always returns 0.
+ */
+static inline int arch_init(char *envp[])
+{
+ tsc_reliable = true;
+ return 0;
+}
+
+/*
+ * Common tails of the __do_syscallX() macros. Both execute "ecall" with the
+ * caller-supplied input constraints and evaluate to the a0 result cast to
+ * long; both expect a register variable named a0 in the enclosing scope,
+ * used as the sole output operand.
+ *
+ * __do_syscallM() additionally lists "a1" as clobbered. It is the variant
+ * used by __do_syscall0() and __do_syscall1(), which do not pass a1 as an
+ * input. __do_syscallN() only declares the "memory" clobber and is used by
+ * the macros that bind a1 themselves.
+ */
+#define __do_syscallM(...) ({ \
+ __asm__ volatile ( \
+ "ecall" \
+ : "=r"(a0) \
+ : __VA_ARGS__ \
+ : "memory", "a1"); \
+ (long) a0; \
+})
+
+#define __do_syscallN(...) ({ \
+ __asm__ volatile ( \
+ "ecall" \
+ : "=r"(a0) \
+ : __VA_ARGS__ \
+ : "memory"); \
+ (long) a0; \
+})
+
+/*
+ * __do_syscall0() .. __do_syscall6(): raw syscall wrappers taking the
+ * syscall number plus zero to six arguments.
+ *
+ * Each macro pins the syscall number to a7 and the arguments to a0..a5 with
+ * explicit-register variables. The "0" matching constraint ties the first
+ * argument to operand 0 (the a0 output), so the result comes back in the
+ * register that carried the first argument. __do_syscall0() declares a0
+ * without an initializer and uses it only as the output.
+ *
+ * The zero- and one-argument forms go through __do_syscallM(); the forms
+ * that pass a1 as an input go through __do_syscallN().
+ */
+#define __do_syscall0(__n) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register long a0 __asm__("a0"); \
+ \
+ __do_syscallM("r" (a7)); \
+})
+
+#define __do_syscall1(__n, __a) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ \
+ __do_syscallM("r" (a7), "0" (a0)); \
+})
+
+#define __do_syscall2(__n, __a, __b) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ register __typeof__(__b) a1 __asm__("a1") = __b; \
+ \
+ __do_syscallN("r" (a7), "0" (a0), "r" (a1)); \
+})
+
+#define __do_syscall3(__n, __a, __b, __c) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ register __typeof__(__b) a1 __asm__("a1") = __b; \
+ register __typeof__(__c) a2 __asm__("a2") = __c; \
+ \
+ __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2)); \
+})
+
+#define __do_syscall4(__n, __a, __b, __c, __d) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ register __typeof__(__b) a1 __asm__("a1") = __b; \
+ register __typeof__(__c) a2 __asm__("a2") = __c; \
+ register __typeof__(__d) a3 __asm__("a3") = __d; \
+ \
+ __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3));\
+})
+
+#define __do_syscall5(__n, __a, __b, __c, __d, __e) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ register __typeof__(__b) a1 __asm__("a1") = __b; \
+ register __typeof__(__c) a2 __asm__("a2") = __c; \
+ register __typeof__(__d) a3 __asm__("a3") = __d; \
+ register __typeof__(__e) a4 __asm__("a4") = __e; \
+ \
+ __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3), \
+ "r"(a4)); \
+})
+
+#define __do_syscall6(__n, __a, __b, __c, __d, __e, __f) ({ \
+ register long a7 __asm__("a7") = __n; \
+ register __typeof__(__a) a0 __asm__("a0") = __a; \
+ register __typeof__(__b) a1 __asm__("a1") = __b; \
+ register __typeof__(__c) a2 __asm__("a2") = __c; \
+ register __typeof__(__d) a3 __asm__("a3") = __d; \
+ register __typeof__(__e) a4 __asm__("a4") = __e; \
+ register __typeof__(__f) a5 __asm__("a5") = __f; \
+ \
+ __do_syscallN("r" (a7), "0" (a0), "r" (a1), "r" (a2), "r" (a3), \
+ "r" (a4), "r"(a5)); \
+})
+
+#define FIO_ARCH_HAS_SYSCALL
+
+#endif
return bitmask;
}
+/*
+ * Barrier for use around TSC reads: executes "mfence" and declares a
+ * "memory" clobber so the compiler does not reorder memory accesses across
+ * it.
+ */
+static inline void tsc_barrier(void)
+{
+ __asm__ __volatile__("mfence":::"memory");
+}
+
static inline unsigned long long get_cpu_clock(void)
{
unsigned int lo, hi;
return 0;
}
+/*
+ * __do_syscall0() .. __do_syscall3(): raw "syscall" wrappers taking the
+ * syscall number plus zero to three arguments.
+ *
+ * The number is passed in %rax and the arguments in %rdi, %rsi and %rdx
+ * through the "a", "D", "S" and "d" constraints. The result is read back
+ * from %rax into an intptr_t, which is the value of the statement
+ * expression. "rcx", "r11" and "memory" are listed as clobbered.
+ */
+#define __do_syscall0(NUM) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"(NUM) /* %rax */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall1(NUM, ARG1) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)) /* %rdi */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall2(NUM, ARG1, ARG2) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)) /* %rsi */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall3(NUM, ARG1, ARG2, ARG3) ({ \
+ intptr_t rax; \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)) /* %rdx */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+/*
+ * __do_syscall4() .. __do_syscall6(): raw "syscall" wrappers taking the
+ * syscall number plus four to six arguments.
+ *
+ * The number goes in %rax and the first three arguments in %rdi, %rsi and
+ * %rdx via the "a", "D", "S" and "d" constraints. The fourth to sixth
+ * arguments are first copied into explicit-register variables bound to
+ * r10, r8 and r9 and then passed with plain "r" constraints. The result is
+ * read back from %rax into an intptr_t, which is the value of the statement
+ * expression. "rcx", "r11" and "memory" are listed as clobbered.
+ */
+#define __do_syscall4(NUM, ARG1, ARG2, ARG3, ARG4) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10) /* %r10 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall5(NUM, ARG1, ARG2, ARG3, ARG4, ARG5) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10), /* %r10 */ \
+ "r"(__r8) /* %r8 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define __do_syscall6(NUM, ARG1, ARG2, ARG3, ARG4, ARG5, ARG6) ({ \
+ intptr_t rax; \
+ register __typeof__(ARG4) __r10 __asm__("r10") = (ARG4); \
+ register __typeof__(ARG5) __r8 __asm__("r8") = (ARG5); \
+ register __typeof__(ARG6) __r9 __asm__("r9") = (ARG6); \
+ \
+ __asm__ volatile( \
+ "syscall" \
+ : "=a"(rax) /* %rax */ \
+ : "a"((NUM)), /* %rax */ \
+ "D"((ARG1)), /* %rdi */ \
+ "S"((ARG2)), /* %rsi */ \
+ "d"((ARG3)), /* %rdx */ \
+ "r"(__r10), /* %r10 */ \
+ "r"(__r8), /* %r8 */ \
+ "r"(__r9) /* %r9 */ \
+ : "rcx", "r11", "memory" \
+ ); \
+ rax; \
+})
+
+#define FIO_ARCH_HAS_SYSCALL
+
#endif
#ifndef ARCH_H
#define ARCH_H
+#ifdef __cplusplus
+#include <atomic>
+#else
#include <stdatomic.h>
+#endif
#include "../lib/types.h"
arch_hppa,
arch_mips,
arch_aarch64,
+ arch_loongarch64,
+ arch_riscv64,
arch_generic,
#define ARCH_CPU_CLOCK_WRAPS
+#ifdef __cplusplus
+/*
+ * C++ builds: map the atomic helpers onto the std::atomic_* free functions.
+ * atomic_add()/atomic_sub() use std::atomic_fetch_add/std::atomic_fetch_sub
+ * with their default ordering; the load/store helpers pass an explicit
+ * relaxed, acquire or release memory order. The pointer p is forwarded
+ * as-is, without a cast.
+ */
+#define atomic_add(p, v) \
+ std::atomic_fetch_add(p, (v))
+#define atomic_sub(p, v) \
+ std::atomic_fetch_sub(p, (v))
+#define atomic_load_relaxed(p) \
+ std::atomic_load_explicit(p, \
+ std::memory_order_relaxed)
+#define atomic_load_acquire(p) \
+ std::atomic_load_explicit(p, \
+ std::memory_order_acquire)
+#define atomic_store_release(p, v) \
+ std::atomic_store_explicit(p, (v), \
+ std::memory_order_release)
+#else
#define atomic_add(p, v) \
atomic_fetch_add((_Atomic typeof(*(p)) *)(p), v)
#define atomic_sub(p, v) \
#define atomic_store_release(p, v) \
atomic_store_explicit((_Atomic typeof(*(p)) *)(p), (v), \
memory_order_release)
+#endif
/* IWYU pragma: begin_exports */
#if defined(__i386__)
#include "arch-hppa.h"
#elif defined(__aarch64__)
#include "arch-aarch64.h"
+#elif defined(__loongarch64)
+#include "arch-loongarch64.h"
+#elif defined(__riscv) && __riscv_xlen == 64
+#include "arch-riscv64.h"
#else
#warning "Unknown architecture, attempting to use generic model."
#include "arch-generic.h"
#endif
+#if !defined(__x86_64__) && defined(CONFIG_SYNC_SYNC)
+/*
+ * Fallback tsc_barrier() for builds that are not x86-64 and have
+ * CONFIG_SYNC_SYNC defined (see the enclosing #if): issues a barrier via
+ * the __sync_synchronize() builtin.
+ */
+static inline void tsc_barrier(void)
+{
+ __sync_synchronize();
+}
+#endif
+
#include "../lib/ffz.h"
/* IWYU pragma: end_exports */
#include "helper_thread.h"
#include "pshared.h"
#include "zone-dist.h"
+#include "fio_time.h"
static struct fio_sem *startup_sem;
static struct flist_head *cgroup_list;
int groupid = 0;
unsigned int thread_number = 0;
+unsigned int nr_segments = 0;
+unsigned int cur_segment = 0;
unsigned int stat_number = 0;
-int shm_id = 0;
int temp_stall_ts;
unsigned long done_secs = 0;
#ifdef PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP
static void sig_int(int sig)
{
- if (threads) {
+ if (nr_segments) {
if (is_backend)
fio_server_got_signal(sig);
else {
}
}
+#ifdef WIN32
+/*
+ * SIGBREAK handler (Windows only).
+ *
+ * Forwards the signal to sig_int() and then blocks until every job has
+ * reached at least the TD_EXITED run state.
+ */
+static void sig_break(int sig)
+{
+	/* Run the regular interrupt handling first. */
+	sig_int(sig);
+
+	/*
+	 * Windows terminates all job processes on SIGBREAK once this handler
+	 * returns, so poll once a second to give each job time to wrap up
+	 * and report its stats.
+	 */
+	for_each_td(td) {
+		for (;;) {
+			if (td->runstate >= TD_EXITED)
+				break;
+			sleep(1);
+		}
+	} end_for_each();
+}
+#endif
+
void sig_show_status(int sig)
{
show_running_run_stats();
/* Windows uses SIGBREAK as a quit signal from other applications */
#ifdef WIN32
memset(&act, 0, sizeof(act));
- act.sa_handler = sig_int;
+ act.sa_handler = sig_break;
act.sa_flags = SA_RESTART;
sigaction(SIGBREAK, &act, NULL);
#endif
static bool __check_min_rate(struct thread_data *td, struct timespec *now,
enum fio_ddir ddir)
{
- unsigned long long bytes = 0;
- unsigned long iops = 0;
- unsigned long spent;
- unsigned long long rate;
- unsigned long long ratemin = 0;
- unsigned int rate_iops = 0;
- unsigned int rate_iops_min = 0;
+ unsigned long long current_rate_check_bytes = td->this_io_bytes[ddir];
+ unsigned long current_rate_check_blocks = td->this_io_blocks[ddir];
+ unsigned long long option_rate_bytes_min = td->o.ratemin[ddir];
+ unsigned int option_rate_iops_min = td->o.rate_iops_min[ddir];
assert(ddir_rw(ddir));
if (mtime_since(&td->start, now) < 2000)
return false;
- iops += td->this_io_blocks[ddir];
- bytes += td->this_io_bytes[ddir];
- ratemin += td->o.ratemin[ddir];
- rate_iops += td->o.rate_iops[ddir];
- rate_iops_min += td->o.rate_iops_min[ddir];
-
/*
- * if rate blocks is set, sample is running
+ * if last_rate_check_blocks or last_rate_check_bytes is set,
+ * we can compute a rate per ratecycle
*/
- if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) {
- spent = mtime_since(&td->lastrate[ddir], now);
- if (spent < td->o.ratecycle)
+ if (td->last_rate_check_bytes[ddir] || td->last_rate_check_blocks[ddir]) {
+ unsigned long spent = mtime_since(&td->last_rate_check_time[ddir], now);
+ if (spent < td->o.ratecycle || spent==0)
return false;
- if (td->o.rate[ddir] || td->o.ratemin[ddir]) {
+ if (td->o.ratemin[ddir]) {
/*
* check bandwidth specified rate
*/
- if (bytes < td->rate_bytes[ddir]) {
- log_err("%s: rate_min=%lluB/s not met, only transferred %lluB\n",
- td->o.name, ratemin, bytes);
+ unsigned long long current_rate_bytes =
+ ((current_rate_check_bytes - td->last_rate_check_bytes[ddir]) * 1000) / spent;
+ if (current_rate_bytes < option_rate_bytes_min) {
+ log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n",
+ td->o.name, option_rate_bytes_min, current_rate_bytes);
return true;
- } else {
- if (spent)
- rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent;
- else
- rate = 0;
-
- if (rate < ratemin ||
- bytes < td->rate_bytes[ddir]) {
- log_err("%s: rate_min=%lluB/s not met, got %lluB/s\n",
- td->o.name, ratemin, rate);
- return true;
- }
}
} else {
/*
* checks iops specified rate
*/
- if (iops < rate_iops) {
- log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n",
- td->o.name, rate_iops, iops);
+ unsigned long long current_rate_iops =
+ ((current_rate_check_blocks - td->last_rate_check_blocks[ddir]) * 1000) / spent;
+
+ if (current_rate_iops < option_rate_iops_min) {
+ log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n",
+ td->o.name, option_rate_iops_min, current_rate_iops);
return true;
- } else {
- if (spent)
- rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent;
- else
- rate = 0;
-
- if (rate < rate_iops_min ||
- iops < td->rate_blocks[ddir]) {
- log_err("%s: rate_iops_min=%u not met, got %llu IOPS\n",
- td->o.name, rate_iops_min, rate);
- return true;
- }
}
}
}
- td->rate_bytes[ddir] = bytes;
- td->rate_blocks[ddir] = iops;
- memcpy(&td->lastrate[ddir], now, sizeof(*now));
+ td->last_rate_check_bytes[ddir] = current_rate_check_bytes;
+ td->last_rate_check_blocks[ddir] = current_rate_check_blocks;
+ memcpy(&td->last_rate_check_time[ddir], now, sizeof(*now));
return false;
}
td_clear_error(td);
*retptr = 0;
return false;
- } else if (td->o.fill_device && err == ENOSPC) {
+ } else if (td->o.fill_device && (err == ENOSPC || err == EDQUOT)) {
/*
* We expect to hit this error if
* fill_device option is set.
if ((full && !min_evts) || !td->o.iodepth_batch_complete_min)
min_evts = 1;
- if (time && __should_check_rate(td))
+ if (time && should_check_rate(td))
fio_gettime(time, NULL);
do {
if (!from_verify)
unlog_io_piece(td, io_u);
td_verror(td, EIO, "full resid");
- put_io_u(td, io_u);
+ clear_io_u(td, io_u);
break;
}
requeue_io_u(td, &io_u);
} else {
sync_done:
- if (comp_time && __should_check_rate(td))
+ if (comp_time && should_check_rate(td))
fio_gettime(comp_time, NULL);
*ret = io_u_sync_complete(td, io_u);
if (td->error)
return;
- /*
- * verify_state needs to be reset before verification
- * proceeds so that expected random seeds match actual
- * random seeds in headers. The main loop will reset
- * all random number generators if randrepeat is set.
- */
- if (!td->o.rand_repeatable)
- td_fill_verify_state_seed(td);
-
td_set_runstate(td, TD_VERIFYING);
io_u = NULL;
break;
}
} else {
- if (ddir_rw_sum(td->bytes_done) + td->o.rw_min_bs > verify_bytes)
+ if (td->bytes_verified + td->o.rw_min_bs > verify_bytes)
break;
while ((io_u = get_io_u(td)) != NULL) {
break;
} else if (io_u->ddir == DDIR_WRITE) {
io_u->ddir = DDIR_READ;
+ io_u->numberio = td->verify_read_issues;
+ td->verify_read_issues++;
populate_verify_io_u(td, io_u);
break;
} else {
if (td->o.rate_process == RATE_PROCESS_POISSON) {
uint64_t val, iops;
- iops = bps / td->o.bs[ddir];
+ iops = bps / td->o.min_bs[ddir];
val = (int64_t) (1000000 / iops) *
-logf(__rand_0_1(&td->poisson_state[ddir]));
if (val) {
return 0;
}
-static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir)
+/*
+ * Reset the thinktime bookkeeping of a job.
+ *
+ * Points thinktime_blocks_counter at td->io_blocks when thinktime_blocks_type
+ * is THINKTIME_BLOCKS_TYPE_COMPLETE and at td->io_issues otherwise, then
+ * restarts the thinktime clock from the job epoch and zeroes the block
+ * baseline.
+ */
+static void init_thinktime(struct thread_data *td)
+{
+	const bool count_completions =
+		td->o.thinktime_blocks_type == THINKTIME_BLOCKS_TYPE_COMPLETE;
+
+	td->thinktime_blocks_counter = count_completions ? td->io_blocks
+							 : td->io_issues;
+	td->last_thinktime = td->epoch;
+	td->last_thinktime_blocks = 0;
+}
+
+static void handle_thinktime(struct thread_data *td, enum fio_ddir ddir,
+ struct timespec *time)
{
unsigned long long b;
+ unsigned long long runtime_left;
uint64_t total;
int left;
+ struct timespec now;
+ bool stall = false;
+
+ if (td->o.thinktime_iotime) {
+ fio_gettime(&now, NULL);
+ if (utime_since(&td->last_thinktime, &now)
+ >= td->o.thinktime_iotime) {
+ stall = true;
+ } else if (!fio_option_is_set(&td->o, thinktime_blocks)) {
+ /*
+ * When thinktime_iotime is set and thinktime_blocks is
+ * not set, skip the thinktime_blocks check, since
+ * thinktime_blocks default value 1 does not work
+ * together with thinktime_iotime.
+ */
+ return;
+ }
+
+ }
+
+ b = ddir_rw_sum(td->thinktime_blocks_counter);
+ if (b >= td->last_thinktime_blocks + td->o.thinktime_blocks)
+ stall = true;
- b = ddir_rw_sum(td->io_blocks);
- if (b % td->o.thinktime_blocks)
+ if (!stall)
return;
io_u_quiesce(td);
+ left = td->o.thinktime_spin;
+ if (td->o.timeout) {
+ runtime_left = td->o.timeout - utime_since_now(&td->epoch);
+ if (runtime_left < (unsigned long long)left)
+ left = runtime_left;
+ }
+
total = 0;
- if (td->o.thinktime_spin)
- total = usec_spin(td->o.thinktime_spin);
+ if (left)
+ total = usec_spin(left);
+
+ /*
+ * usec_spin() might run for slightly longer than intended in a VM
+ * where the vCPU could get descheduled or the hypervisor could steal
+ * CPU time. Ensure "left" doesn't become negative.
+ */
+ if (total < td->o.thinktime)
+ left = td->o.thinktime - total;
+ else
+ left = 0;
+
+ if (td->o.timeout) {
+ runtime_left = td->o.timeout - utime_since_now(&td->epoch);
+ if (runtime_left < (unsigned long long)left)
+ left = runtime_left;
+ }
- left = td->o.thinktime - total;
if (left)
total += usec_sleep(td, left);
/* adjust for rate_process=poisson */
td->last_usec[ddir] += total;
}
+
+ if (time && should_check_rate(td))
+ fio_gettime(time, NULL);
+
+ td->last_thinktime_blocks = b;
+ if (td->o.thinktime_iotime) {
+ fio_gettime(&now, NULL);
+ td->last_thinktime = now;
+ }
}
/*
*/
if (td_write(td) && td_random(td) && td->o.norandommap)
total_bytes = max(total_bytes, (uint64_t) td->o.io_size);
+
+ /* Don't break too early if io_size > size */
+ if (td_rw(td) && !td_random(td))
+ total_bytes = max(total_bytes, (uint64_t)td->o.io_size);
+
/*
* If verify_backlog is enabled, we'll run the verify in this
* handler as well. For that case, we may need up to twice the
total_bytes += td->o.size;
/* In trimwrite mode, each byte is trimmed and then written, so
- * allow total_bytes to be twice as big */
- if (td_trimwrite(td))
+ * allow total_bytes or number of ios to be twice as big */
+ if (td_trimwrite(td)) {
total_bytes += td->total_io_size;
+ td->o.number_ios *= 2;
+ }
while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
(!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) ||
break;
}
- if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY)
- populate_verify_io_u(td, io_u);
+ if (io_u->ddir == DDIR_WRITE && td->flags & TD_F_DO_VERIFY) {
+ if (!(io_u->flags & IO_U_F_PATTERN_DONE)) {
+ io_u_set(td, io_u, IO_U_F_PATTERN_DONE);
+ io_u->numberio = td->io_issues[io_u->ddir];
+ populate_verify_io_u(td, io_u);
+ }
+ }
ddir = io_u->ddir;
td->rate_io_issue_bytes[__ddir] += blen;
}
- if (should_check_rate(td))
+ if (should_check_rate(td)) {
td->rate_next_io_time[__ddir] = usec_for_io(td, __ddir);
+ fio_gettime(&comp_time, NULL);
+ }
} else {
ret = io_u_submit(td, io_u);
}
if (ret < 0)
break;
+
+ if (ddir_rw(ddir) && td->o.thinkcycles)
+ cycles_spin(td->o.thinkcycles);
+
+ if (ddir_rw(ddir) && td->o.thinktime)
+ handle_thinktime(td, ddir, &comp_time);
+
if (!ddir_rw_sum(td->bytes_done) &&
!td_ioengine_flagged(td, FIO_NOIO))
continue;
}
if (!in_ramp_time(td) && td->o.latency_target)
lat_target_check(td);
-
- if (ddir_rw(ddir) && td->o.thinktime)
- handle_thinktime(td, ddir);
}
check_update_rusage(td);
if (td->trim_entries)
log_err("fio: %lu trim entries leaked?\n", td->trim_entries);
- if (td->o.fill_device && td->error == ENOSPC) {
+ if (td->o.fill_device && (td->error == ENOSPC || td->error == EDQUOT)) {
td->error = 0;
fio_mark_td_terminate(td);
}
if (i) {
ret = io_u_queued_complete(td, i);
- if (td->o.fill_device && td->error == ENOSPC)
+ if (td->o.fill_device &&
+ (td->error == ENOSPC || td->error == EDQUOT))
td->error = 0;
}
f->file_name);
}
}
- } else
+ } else {
+ if (td->o.io_submit_mode == IO_MODE_OFFLOAD)
+ workqueue_flush(&td->io_wq);
cleanup_pending_aio(td);
+ }
/*
* stop job if we failed doing any IO
}
}
- init_io_u_buffers(td);
+ if (init_io_u_buffers(td))
+ return 1;
if (init_file_completion_logging(td, max_units))
return 1;
int init_io_u_buffers(struct thread_data *td)
{
struct io_u *io_u;
- unsigned long long max_bs, min_write;
+ unsigned long long max_bs, min_write, trim_bs = 0;
int i, max_units;
int data_xfer = 1;
char *p;
td->orig_buffer_size = (unsigned long long) max_bs
* (unsigned long long) max_units;
- if (td_ioengine_flagged(td, FIO_NOIO) || !(td_read(td) || td_write(td)))
+ if (td_trim(td) && td->o.num_range > 1) {
+ trim_bs = td->o.num_range * sizeof(struct trim_range);
+ td->orig_buffer_size = trim_bs
+ * (unsigned long long) max_units;
+ }
+
+ /*
+ * For reads, writes, and multi-range trim operations we need a
+ * data buffer
+ */
+ if (td_ioengine_flagged(td, FIO_NOIO) ||
+ !(td_read(td) || td_write(td) || (td_trim(td) && td->o.num_range > 1)))
data_xfer = 0;
/*
* overflow later. this adjustment may be too much if we get
* lucky and the allocator gives us an aligned address.
*/
- if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
+ if (td->o.odirect || td->o.mem_align ||
td_ioengine_flagged(td, FIO_RAWIO))
td->orig_buffer_size += page_mask + td->o.mem_align;
if (data_xfer && allocate_io_mem(td))
return 1;
- if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
+ if (td->o.odirect || td->o.mem_align ||
td_ioengine_flagged(td, FIO_RAWIO))
p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align;
else
fill_verify_pattern(td, io_u->buf, max_bs, io_u, 0, 0);
}
}
- p += max_bs;
+ if (td_trim(td) && td->o.num_range > 1)
+ p += trim_bs;
+ else
+ p += max_bs;
}
return 0;
}
+#ifdef FIO_HAVE_IOSCHED_SWITCH
/*
- * This function is Linux specific.
+ * These functions are Linux specific.
* FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux.
*/
-static int switch_ioscheduler(struct thread_data *td)
+static int set_ioscheduler(struct thread_data *td, struct fio_file *file)
{
-#ifdef FIO_HAVE_IOSCHED_SWITCH
char tmp[256], tmp2[128], *p;
FILE *f;
int ret;
- if (td_ioengine_flagged(td, FIO_DISKLESSIO))
- return 0;
-
- assert(td->files && td->files[0]);
- sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root);
+ assert(file->du && file->du->sysfs_root);
+ sprintf(tmp, "%s/queue/scheduler", file->du->sysfs_root);
f = fopen(tmp, "r+");
if (!f) {
sprintf(tmp2, "[%s]", td->o.ioscheduler);
if (!strstr(tmp, tmp2)) {
- log_err("fio: io scheduler %s not found\n", td->o.ioscheduler);
+ log_err("fio: unable to set io scheduler to %s\n", td->o.ioscheduler);
td_verror(td, EINVAL, "iosched_switch");
fclose(f);
return 1;
fclose(f);
return 0;
+}
+
+/*
+ * Apply the configured I/O scheduler to the devices backing a job's files.
+ *
+ * Does nothing for ioengines flagged FIO_DISKLESSIO. Otherwise every file of
+ * type FIO_TYPE_FILE or FIO_TYPE_BLOCK whose hosting device is known
+ * (f->du set) is passed to set_ioscheduler(); all other files are skipped.
+ *
+ * Returns 0 on success, or the first non-zero value returned by
+ * set_ioscheduler().
+ */
+static int switch_ioscheduler(struct thread_data *td)
+{
+	struct fio_file *file;
+	unsigned int idx;
+
+	if (td_ioengine_flagged(td, FIO_DISKLESSIO))
+		return 0;
+
+	assert(td->files && td->files[0]);
+
+	for_each_file(td, file, idx) {
+		int err;
+
+		/* Only regular files and block device files are considered */
+		if (file->filetype != FIO_TYPE_FILE &&
+		    file->filetype != FIO_TYPE_BLOCK)
+			continue;
+
+		/* Skip files whose hosting device could not be determined */
+		if (!file->du)
+			continue;
+
+		err = set_ioscheduler(td, file);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
#else
+
+static int switch_ioscheduler(struct thread_data *td)
+{
return 0;
-#endif
}
+#endif /* FIO_HAVE_IOSCHED_SWITCH */
+
static bool keep_running(struct thread_data *td)
{
unsigned long long limit;
uint64_t bytes_done[DDIR_RWDIR_CNT];
int deadlock_loop_cnt;
bool clear_state;
- int res, ret;
+ int ret;
sk_out_assign(sk_out);
free(fd);
if (!init_iolog(td))
goto err;
+ /* ioprio_set() has to be done before td_io_init() */
+ if (fio_option_is_set(o, ioprio) ||
+ fio_option_is_set(o, ioprio_class) ||
+ fio_option_is_set(o, ioprio_hint)) {
+ ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class,
+ o->ioprio, o->ioprio_hint);
+ if (ret == -1) {
+ td_verror(td, errno, "ioprio_set");
+ goto err;
+ }
+ td->ioprio = ioprio_value(o->ioprio_class, o->ioprio,
+ o->ioprio_hint);
+ td->ts.ioprio = td->ioprio;
+ }
+
if (td_io_init(td))
goto err;
+ if (td_ioengine_flagged(td, FIO_SYNCIO) && td->o.iodepth > 1 && td->o.io_submit_mode != IO_MODE_OFFLOAD) {
+ log_info("note: both iodepth >= 1 and synchronous I/O engine "
+ "are selected, queue depth will be capped at 1\n");
+ }
+
if (init_io_u(td))
goto err;
if (o->verify_async && verify_async_init(td))
goto err;
- if (fio_option_is_set(o, ioprio) ||
- fio_option_is_set(o, ioprio_class)) {
- ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
- if (ret == -1) {
- td_verror(td, errno, "ioprio_set");
- goto err;
- }
- }
-
if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
goto err;
if (rate_submit_init(td, sk_out))
goto err;
- set_epoch_time(td, o->log_unix_epoch);
+ set_epoch_time(td, o->log_alternate_epoch_clock_id, o->job_start_clock_id);
fio_getrusage(&td->ru_start);
memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
+ init_thinktime(td);
+
if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
o->ratemin[DDIR_TRIM]) {
- memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
+ memcpy(&td->last_rate_check_time[DDIR_READ], &td->bw_sample_time,
sizeof(td->bw_sample_time));
- memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
+ memcpy(&td->last_rate_check_time[DDIR_WRITE], &td->bw_sample_time,
sizeof(td->bw_sample_time));
- memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time,
+ memcpy(&td->last_rate_check_time[DDIR_TRIM], &td->bw_sample_time,
sizeof(td->bw_sample_time));
}
if (td->o.verify_only && td_write(td))
verify_bytes = do_dry_run(td);
else {
+ if (!td->o.rand_repeatable)
+ /* save verify rand state to replay hdr seeds later at verify */
+ frand_copy(&td->verify_state_last_do_io, &td->verify_state);
do_io(td, bytes_done);
-
+ if (!td->o.rand_repeatable)
+ frand_copy(&td->verify_state, &td->verify_state_last_do_io);
if (!ddir_rw_sum(bytes_done)) {
fio_mark_td_terminate(td);
verify_bytes = 0;
}
} while (1);
- if (td_read(td) && td->io_bytes[DDIR_READ])
+ if (td->io_bytes[DDIR_READ] && (td_read(td) ||
+ ((td->flags & TD_F_VER_BACKLOG) && td_write(td))))
update_runtime(td, elapsed_us, DDIR_READ);
if (td_write(td) && td->io_bytes[DDIR_WRITE])
update_runtime(td, elapsed_us, DDIR_WRITE);
* another thread is checking its io_u's for overlap
*/
if (td_offload_overlap(td)) {
- int res = pthread_mutex_lock(&overlap_check);
- assert(res == 0);
+ int res;
+
+ res = pthread_mutex_lock(&overlap_check);
+ if (res) {
+ td->error = errno;
+ goto err;
+ }
}
td_set_runstate(td, TD_FINISHING);
if (td_offload_overlap(td)) {
+ int res;
+
res = pthread_mutex_unlock(&overlap_check);
- assert(res == 0);
+ if (res) {
+ td->error = errno;
+ goto err;
+ }
}
update_rusage_stat(td);
static void reap_threads(unsigned int *nr_running, uint64_t *t_rate,
uint64_t *m_rate)
{
- struct thread_data *td;
unsigned int cputhreads, realthreads, pending;
- int i, status, ret;
+ int ret;
/*
* reap exited threads (TD_EXITED -> TD_REAPED)
*/
realthreads = pending = cputhreads = 0;
- for_each_td(td, i) {
- int flags = 0;
+ for_each_td(td) {
+ int flags = 0, status;
- if (!strcmp(td->o.ioengine, "cpuio"))
+ if (!strcmp(td->o.ioengine, "cpuio"))
cputhreads++;
else
realthreads++;
done_secs += mtime_since_now(&td->epoch) / 1000;
profile_td_exit(td);
flow_exit_job(td);
- }
+ } end_for_each();
if (*nr_running == cputhreads && !pending && realthreads)
fio_terminate_threads(TERMINATE_ALL, TERMINATE_ALL);
{
const char *waitee = me->o.wait_for;
const char *self = me->o.name;
- struct thread_data *td;
- int i;
if (!waitee)
return false;
- for_each_td(td, i) {
+ for_each_td(td) {
if (!strcmp(td->o.name, self) || strcmp(td->o.name, waitee))
continue;
runstate_to_name(td->runstate));
return true;
}
- }
+ } end_for_each();
dprint(FD_PROCESS, "%s: %s completed, can run\n", self, waitee);
return false;
set_sig_handlers();
nr_thread = nr_process = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
if (check_mount_writes(td))
return;
if (td->o.use_thread)
nr_thread++;
else
nr_process++;
- }
+ } end_for_each();
if (output_format & FIO_OUTPUT_NORMAL) {
struct buf_output out;
nr_started = 0;
m_rate = t_rate = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
print_status_init(td->thread_number - 1);
if (!td->o.create_serialize)
td_io_close_file(td, f);
}
}
- }
+ } end_for_each();
/* start idle threads before io threads start to run */
fio_idle_prof_start();
/*
* create threads (TD_NOT_CREATED -> TD_CREATED)
*/
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->runstate != TD_NOT_CREATED)
continue;
strerror(ret));
} else {
pid_t pid;
+ void *eo;
dprint(FD_PROCESS, "will fork\n");
+ eo = td->eo;
+ read_barrier();
pid = fork();
if (!pid) {
int ret;
ret = (int)(uintptr_t)thread_main(fd);
_exit(ret);
- } else if (i == fio_debug_jobno)
+ } else if (__td_index == fio_debug_jobno)
*fio_debug_jobp = pid;
+ free(eo);
+ free(fd);
+ fd = NULL;
}
dprint(FD_MUTEX, "wait on startup_sem\n");
if (fio_sem_down_timeout(startup_sem, 10000)) {
break;
}
dprint(FD_MUTEX, "done waiting on startup_sem\n");
- }
+ } end_for_each();
/*
* Wait for the started threads to transition to
/*
* start created threads (TD_INITIALIZED -> TD_RUNNING).
*/
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->runstate != TD_INITIALIZED)
continue;
t_rate += ddir_rw_sum(td->o.rate);
todo--;
fio_sem_up(td->sem);
- }
+ } end_for_each();
reap_threads(&nr_running, &t_rate, &m_rate);
int fio_backend(struct sk_out *sk_out)
{
- struct thread_data *td;
int i;
-
if (exec_profile) {
if (load_profile(exec_profile))
return 1;
setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log");
}
+ if (init_global_dedupe_working_set_seeds()) {
+ log_err("fio: failed to initialize global dedupe working set\n");
+ return 1;
+ }
+
startup_sem = fio_sem_init(FIO_SEM_LOCKED);
if (!sk_out)
is_local_backend = true;
}
}
- for_each_td(td, i) {
+ for_each_td(td) {
+ struct thread_stat *ts = &td->ts;
+
+ free_clat_prio_stats(ts);
steadystate_free(td);
fio_options_free(td);
+ fio_dump_options_free(td);
if (td->rusage_sem) {
fio_sem_remove(td->rusage_sem);
td->rusage_sem = NULL;
}
fio_sem_remove(td->sem);
td->sem = NULL;
- }
+ } end_for_each();
free_disk_util();
if (cgroup_list) {
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#include <errno.h>
+#include <sys/sysmacros.h>
#include "flist.h"
#include "fio.h"
+#include "iolog.h"
#include "blktrace.h"
#include "blktrace_api.h"
#include "oslib/linux-dev-lookup.h"
-#define TRACE_FIFO_SIZE 8192
-
-/*
- * fifo refill frontend, to avoid reading data in trace sized bites
- */
-static int refill_fifo(struct thread_data *td, struct fifo *fifo, int fd)
-{
- char buf[TRACE_FIFO_SIZE];
- unsigned int total;
- int ret;
-
- total = sizeof(buf);
- if (total > fifo_room(fifo))
- total = fifo_room(fifo);
-
- ret = read(fd, buf, total);
- if (ret < 0) {
- int read_err = errno;
-
- assert(read_err > 0);
- td_verror(td, read_err, "read blktrace file");
- return -read_err;
- }
-
- if (ret > 0)
- ret = fifo_put(fifo, buf, ret);
-
- dprint(FD_BLKTRACE, "refill: filled %d bytes\n", ret);
- return ret;
-}
-
-/*
- * Retrieve 'len' bytes from the fifo, refilling if necessary.
- */
-static int trace_fifo_get(struct thread_data *td, struct fifo *fifo, int fd,
- void *buf, unsigned int len)
-{
- if (fifo_len(fifo) < len) {
- int ret = refill_fifo(td, fifo, fd);
-
- if (ret < 0)
- return ret;
- }
-
- return fifo_get(fifo, buf, len);
-}
+/*
+ * One-entry lookup cache passed to trace_add_file(): remembers the device
+ * major/minor pair that was looked up last and the fio file number that
+ * was resolved for it, so repeated entries for the same device can return
+ * early.
+ */
+struct file_cache {
+ /* major number of the cached device */
+ unsigned int maj;
+ /* minor number of the cached device */
+ unsigned int min;
+ /* fio file number returned for that major/minor pair */
+ unsigned int fileno;
+};
/*
* Just discard the pdu by seeking past it.
*/
-static int discard_pdu(struct thread_data *td, struct fifo *fifo, int fd,
- struct blk_io_trace *t)
+static int discard_pdu(FILE* f, struct blk_io_trace *t)
{
if (t->pdu_len == 0)
return 0;
dprint(FD_BLKTRACE, "discard pdu len %u\n", t->pdu_len);
- return trace_fifo_get(td, fifo, fd, NULL, t->pdu_len);
+ if (fseek(f, t->pdu_len, SEEK_CUR) < 0)
+ return -errno;
+
+ return t->pdu_len;
}
/*
flist_add_tail(&ipo->list, &td->io_log_list);
}
-static int trace_add_file(struct thread_data *td, __u32 device)
+static int trace_add_file(struct thread_data *td, __u32 device,
+ struct file_cache *cache)
{
- static unsigned int last_maj, last_min, last_fileno;
unsigned int maj = FMAJOR(device);
unsigned int min = FMINOR(device);
struct fio_file *f;
char dev[256];
unsigned int i;
- if (last_maj == maj && last_min == min)
- return last_fileno;
+ if (cache->maj == maj && cache->min == min)
+ return cache->fileno;
- last_maj = maj;
- last_min = min;
+ cache->maj = maj;
+ cache->min = min;
/*
* check for this file in our list
*/
for_each_file(td, f, i)
if (f->major == maj && f->minor == min) {
- last_fileno = f->fileno;
- return last_fileno;
+ cache->fileno = f->fileno;
+ return cache->fileno;
}
strcpy(dev, "/dev");
td->files[fileno]->major = maj;
td->files[fileno]->minor = min;
trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
- last_fileno = fileno;
+ cache->fileno = fileno;
}
- return last_fileno;
+ return cache->fileno;
}
static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
queue_io_piece(td, ipo);
}
-static void handle_trace_notify(struct blk_io_trace *t)
+static bool handle_trace_notify(struct blk_io_trace *t)
{
switch (t->action) {
case BLK_TN_PROCESS:
dprint(FD_BLKTRACE, "unknown trace act %x\n", t->action);
break;
}
+ return false;
}
-static void handle_trace_discard(struct thread_data *td,
+static bool handle_trace_discard(struct thread_data *td,
struct blk_io_trace *t,
unsigned long long ttime,
- unsigned long *ios, unsigned int *bs)
+ unsigned long *ios, unsigned long long *bs,
+ struct file_cache *cache)
{
struct io_piece *ipo;
int fileno;
if (td->o.replay_skip & (1u << DDIR_TRIM))
- return;
+ return false;
ipo = calloc(1, sizeof(*ipo));
init_ipo(ipo);
- fileno = trace_add_file(td, t->device);
+ fileno = trace_add_file(td, t->device, cache);
ios[DDIR_TRIM]++;
if (t->bytes > bs[DDIR_TRIM])
ipo->offset, ipo->len,
ipo->delay);
queue_io_piece(td, ipo);
+ return true;
}
static void dump_trace(struct blk_io_trace *t)
log_err("blktrace: ignoring zero byte trace: action=%x\n", t->action);
}
-static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
+static bool handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
unsigned long long ttime, unsigned long *ios,
- unsigned int *bs)
+ unsigned long long *bs, struct file_cache *cache)
{
int rw;
int fileno;
- fileno = trace_add_file(td, t->device);
+ fileno = trace_add_file(td, t->device, cache);
rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
if (rw) {
if (td->o.replay_skip & (1u << DDIR_WRITE))
- return;
+ return false;
} else {
if (td->o.replay_skip & (1u << DDIR_READ))
- return;
+ return false;
}
if (!t->bytes) {
if (!fio_did_warn(FIO_WARN_BTRACE_ZERO))
dump_trace(t);
- return;
+ return false;
}
if (t->bytes > bs[rw])
ios[rw]++;
td->o.size += t->bytes;
store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
+ return true;
}
-static void handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
- unsigned long long ttime, unsigned long *ios)
+static bool handle_trace_flush(struct thread_data *td, struct blk_io_trace *t,
+ unsigned long long ttime, unsigned long *ios,
+ struct file_cache *cache)
{
struct io_piece *ipo;
int fileno;
if (td->o.replay_skip & (1u << DDIR_SYNC))
- return;
+ return false;
ipo = calloc(1, sizeof(*ipo));
init_ipo(ipo);
- fileno = trace_add_file(td, t->device);
+ fileno = trace_add_file(td, t->device, cache);
ipo->delay = ttime / 1000;
ipo->ddir = DDIR_SYNC;
ios[DDIR_SYNC]++;
dprint(FD_BLKTRACE, "store flush delay=%lu\n", ipo->delay);
+
+ if (!(td->flags & TD_F_SYNCS))
+ td->flags |= TD_F_SYNCS;
+
queue_io_piece(td, ipo);
+ return true;
}
/*
* We only care for queue traces, most of the others are side effects
* due to internal workings of the block layer.
*/
-static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
- unsigned long *ios, unsigned int *bs)
+static bool queue_trace(struct thread_data *td, struct blk_io_trace *t,
+ unsigned long *ios, unsigned long long *bs,
+ struct file_cache *cache)
{
- static unsigned long long last_ttime;
+ unsigned long long *last_ttime = &td->io_log_last_ttime;
unsigned long long delay = 0;
if ((t->action & 0xffff) != __BLK_TA_QUEUE)
- return;
+ return false;
if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
- if (!last_ttime || td->o.no_stall)
- delay = 0;
- else if (td->o.replay_time_scale == 100)
- delay = t->time - last_ttime;
- else {
- double tmp = t->time - last_ttime;
- double scale;
-
- scale = (double) 100.0 / (double) td->o.replay_time_scale;
- tmp *= scale;
- delay = tmp;
- }
- last_ttime = t->time;
+ delay = delay_since_ttime(td, t->time);
+ *last_ttime = t->time;
}
t_bytes_align(&td->o, t);
if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
- handle_trace_notify(t);
+ return handle_trace_notify(t);
else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
- handle_trace_discard(td, t, delay, ios, bs);
+ return handle_trace_discard(td, t, delay, ios, bs, cache);
else if (t->action & BLK_TC_ACT(BLK_TC_FLUSH))
- handle_trace_flush(td, t, delay, ios);
+ return handle_trace_flush(td, t, delay, ios, cache);
else
- handle_trace_fs(td, t, delay, ios, bs);
+ return handle_trace_fs(td, t, delay, ios, bs, cache);
}
static void byteswap_trace(struct blk_io_trace *t)
* Load a blktrace file by reading all the blk_io_trace entries, and storing
* them as io_pieces like the fio text version would do.
*/
-bool load_blktrace(struct thread_data *td, const char *filename, int need_swap)
+/*
+ * Open a blktrace file for replay and read its first batch of entries.
+ *
+ * Opens 'filename' into td->io_log_rfile, records whether entries need
+ * byte swapping, resets the last trace timestamp and td->o.size, and then
+ * calls read_blktrace() with the run state bumped to TD_SETTING_UP.
+ *
+ * Returns true on success. Returns false if the file cannot be opened, if
+ * read_blktrace() fails, or if no replay file ended up attached to the
+ * job. On every failure path the bumped run state has been restored and a
+ * stream that is still referenced by td->io_log_rfile is closed.
+ */
+bool init_blktrace_read(struct thread_data *td, const char *filename, int need_swap)
+{
+	int old_state;
+	bool loaded;
+
+	td->io_log_rfile = fopen(filename, "rb");
+	if (!td->io_log_rfile) {
+		td_verror(td, errno, "open blktrace file");
+		goto err;
+	}
+	td->io_log_blktrace_swap = need_swap;
+	td->io_log_last_ttime = 0;
+	td->o.size = 0;
+
+	free_release_files(td);
+
+	old_state = td_bump_runstate(td, TD_SETTING_UP);
+	loaded = read_blktrace(td);
+	/* Undo the bump on failure as well as on success */
+	td_restore_runstate(td, old_state);
+
+	if (!loaded)
+		goto err;
+
+	if (!td->files_index) {
+		log_err("fio: did not find replay device(s)\n");
+		/* The stream can still be open here; close it via err */
+		goto err;
+	}
+
+	return true;
+
+err:
+	/*
+	 * NOTE(review): this relies on read_blktrace() clearing
+	 * td->io_log_rfile whenever it closes the stream itself; if it does
+	 * not, the fclose() below closes the same stream a second time.
+	 */
+	if (td->io_log_rfile) {
+		fclose(td->io_log_rfile);
+		td->io_log_rfile = NULL;
+	}
+	return false;
+}
+
+bool read_blktrace(struct thread_data* td)
{
struct blk_io_trace t;
+ struct file_cache cache = {
+ .maj = ~0U,
+ .min = ~0U,
+ };
unsigned long ios[DDIR_RWDIR_SYNC_CNT] = { };
- unsigned int rw_bs[DDIR_RWDIR_CNT] = { };
+ unsigned long long rw_bs[DDIR_RWDIR_CNT] = { };
unsigned long skipped_writes;
- struct fifo *fifo;
- int fd, i, old_state, max_depth;
- struct fio_file *f;
+ FILE *f = td->io_log_rfile;
+ int i, max_depth;
+ struct fio_file *fiof;
int this_depth[DDIR_RWDIR_CNT] = { };
int depth[DDIR_RWDIR_CNT] = { };
+ int64_t items_to_fetch = 0;
- fd = open(filename, O_RDONLY);
- if (fd < 0) {
- td_verror(td, errno, "open blktrace file");
- return false;
+ if (td->o.read_iolog_chunked) {
+ items_to_fetch = iolog_items_to_fetch(td);
+ if (!items_to_fetch)
+ return true;
}
- fifo = fifo_alloc(TRACE_FIFO_SIZE);
-
- old_state = td_bump_runstate(td, TD_SETTING_UP);
-
- td->o.size = 0;
skipped_writes = 0;
do {
- int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
+ int ret = fread(&t, 1, sizeof(t), f);
- if (ret < 0)
+ if (ferror(f)) {
+ td_verror(td, errno, "read blktrace file");
goto err;
- else if (!ret)
+ } else if (feof(f)) {
break;
- else if (ret < (int) sizeof(t)) {
- log_err("fio: short fifo get\n");
+ } else if (ret < (int) sizeof(t)) {
+ log_err("fio: iolog short read\n");
break;
}
- if (need_swap)
+ if (td->io_log_blktrace_swap)
byteswap_trace(&t);
if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
t.magic & 0xff);
goto err;
}
- ret = discard_pdu(td, fifo, fd, &t);
+ ret = discard_pdu(f, &t);
if (ret < 0) {
td_verror(td, -ret, "blktrace lseek");
goto err;
- } else if (t.pdu_len != ret) {
- log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
- goto err;
}
if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
if ((t.action & 0xffff) == __BLK_TA_QUEUE)
}
}
- handle_trace(td, &t, ios, rw_bs);
- } while (1);
+ if (!queue_trace(td, &t, ios, rw_bs, &cache))
+ continue;
- for_each_file(td, f, i)
- trace_add_open_close_event(td, f->fileno, FIO_LOG_CLOSE_FILE);
+ if (td->o.read_iolog_chunked) {
+ td->io_log_current++;
+ items_to_fetch--;
+ if (items_to_fetch == 0)
+ break;
+ }
+ } while (1);
- fifo_free(fifo);
- close(fd);
+ if (td->o.read_iolog_chunked) {
+ td->io_log_highmark = td->io_log_current;
+ td->io_log_checkmark = (td->io_log_highmark + 1) / 2;
+ fio_gettime(&td->io_log_highmark_time, NULL);
+ }
- td_restore_runstate(td, old_state);
+ if (skipped_writes)
+ log_err("fio: %s skips replay of %lu writes due to read-only\n",
+ td->o.name, skipped_writes);
- if (!td->files_index) {
- log_err("fio: did not find replay device(s)\n");
- return false;
+ if (td->o.read_iolog_chunked) {
+ if (td->io_log_current == 0) {
+ return false;
+ }
+ td->o.td_ddir = TD_DDIR_RW;
+ if ((rw_bs[DDIR_READ] > td->o.max_bs[DDIR_READ] ||
+ rw_bs[DDIR_WRITE] > td->o.max_bs[DDIR_WRITE] ||
+ rw_bs[DDIR_TRIM] > td->o.max_bs[DDIR_TRIM]) &&
+ td->orig_buffer)
+ {
+ td->o.max_bs[DDIR_READ] = max(td->o.max_bs[DDIR_READ], rw_bs[DDIR_READ]);
+ td->o.max_bs[DDIR_WRITE] = max(td->o.max_bs[DDIR_WRITE], rw_bs[DDIR_WRITE]);
+ td->o.max_bs[DDIR_TRIM] = max(td->o.max_bs[DDIR_TRIM], rw_bs[DDIR_TRIM]);
+ io_u_quiesce(td);
+ free_io_mem(td);
+ if (init_io_u_buffers(td))
+ return false;
+ }
+ return true;
}
+ for_each_file(td, fiof, i)
+ trace_add_open_close_event(td, fiof->fileno, FIO_LOG_CLOSE_FILE);
+
+ fclose(td->io_log_rfile);
+ td->io_log_rfile = NULL;
+
/*
* For stacked devices, we don't always get a COMPLETE event so
* the depth grows to insane values. Limit it to something sane(r).
max_depth = max(depth[i], max_depth);
}
- if (skipped_writes)
- log_err("fio: %s skips replay of %lu writes due to read-only\n",
- td->o.name, skipped_writes);
-
if (!ios[DDIR_READ] && !ios[DDIR_WRITE] && !ios[DDIR_TRIM] &&
!ios[DDIR_SYNC]) {
log_err("fio: found no ios in blktrace data\n");
td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
}
- /*
- * We need to do direct/raw ios to the device, to avoid getting
- * read-ahead in our way. But only do so if the minimum block size
- * is a multiple of 4k, otherwise we don't know if it's safe to do so.
- */
- if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
- td->o.odirect = 1;
-
/*
* If depth wasn't manually set, use probed depth
*/
return true;
err:
- close(fd);
- fifo_free(fifo);
+	/*
+	 * f is td->io_log_rfile. Clear the stored handle once the stream is
+	 * closed so that code which checks td->io_log_rfile after a failure
+	 * does not fclose() the same stream again.
+	 */
+	fclose(f);
+	td->io_log_rfile = NULL;
return false;
}
{
bcs[i].iter++;
if (bcs[i].iter < bcs[i].nr_iter) {
- lseek(bcs[i].fd, 0, SEEK_SET);
+ fseek(bcs[i].f, 0, SEEK_SET);
return;
}
*nr_logs -= 1;
/* close file */
- fifo_free(bcs[i].fifo);
- close(bcs[i].fd);
+ fclose(bcs[i].f);
/* keep active files contiguous */
memmove(&bcs[i], &bcs[*nr_logs], sizeof(bcs[i]));
read_skip:
/* read an io trace */
- ret = trace_fifo_get(td, bc->fifo, bc->fd, t, sizeof(*t));
- if (ret < 0) {
+	/*
+	 * t is a pointer to the trace record (it is dereferenced with ->
+	 * further down), so read into the record and size the read by the
+	 * record. Passing &t and sizeof(t) would overwrite the pointer itself
+	 * with a pointer-sized chunk of file data and never fill the record.
+	 */
+	ret = fread(t, 1, sizeof(*t), bc->f);
+ if (ferror(bc->f)) {
+ td_verror(td, errno, "read blktrace file");
return ret;
- } else if (!ret) {
+ } else if (feof(bc->f)) {
if (!bc->length)
bc->length = bc->t.time;
return ret;
} else if (ret < (int) sizeof(*t)) {
- log_err("fio: short fifo get\n");
+ log_err("fio: iolog short read\n");
return -1;
}
/* skip over actions that fio does not care about */
if ((t->action & 0xffff) != __BLK_TA_QUEUE ||
t_get_ddir(t) == DDIR_INVAL) {
- ret = discard_pdu(td, bc->fifo, bc->fd, t);
+ ret = discard_pdu(bc->f, t);
if (ret < 0) {
td_verror(td, -ret, "blktrace lseek");
return ret;
- } else if (t->pdu_len != ret) {
- log_err("fio: discarded %d of %d\n", ret,
- t->pdu_len);
- return -1;
}
goto read_skip;
}
str = ptr = strdup(td->o.read_iolog_file);
nr_logs = 0;
for (i = 0; (name = get_next_str(&ptr)) != NULL; i++) {
- bcs[i].fd = open(name, O_RDONLY);
- if (bcs[i].fd < 0) {
+ bcs[i].f = fopen(name, "rb");
+ if (!bcs[i].f) {
log_err("fio: could not open file: %s\n", name);
- ret = bcs[i].fd;
+ ret = -errno;
free(str);
goto err_file;
}
- bcs[i].fifo = fifo_alloc(TRACE_FIFO_SIZE);
nr_logs++;
if (!is_blktrace(name, &bcs[i].swap)) {
i = find_earliest_io(bcs, nr_logs);
bc = &bcs[i];
/* skip over the pdu */
- ret = discard_pdu(td, bc->fifo, bc->fd, &bc->t);
+ ret = discard_pdu(bc->f, &bc->t);
if (ret < 0) {
td_verror(td, -ret, "blktrace lseek");
goto err_file;
- } else if (bc->t.pdu_len != ret) {
- log_err("fio: discarded %d of %d\n", ret,
- bc->t.pdu_len);
- goto err_file;
}
ret = write_trace(merge_fp, &bc->t);
err_file:
/* cleanup */
for (i = 0; i < nr_logs; i++) {
- fifo_free(bcs[i].fifo);
- close(bcs[i].fd);
+ fclose(bcs[i].f);
}
err_merge_buf:
free(merge_buf);
struct blktrace_cursor {
struct fifo *fifo; // fifo queue for reading
- int fd; // blktrace file
+ FILE *f; // blktrace file
__u64 length; // length of trace
struct blk_io_trace t; // current io trace
int swap; // bitwise reverse required
};
bool is_blktrace(const char *, int *);
-bool load_blktrace(struct thread_data *, const char *, int);
+bool init_blktrace_read(struct thread_data *, const char *, int);
+bool read_blktrace(struct thread_data* td);
+
int merge_blktrace_iologs(struct thread_data *td);
#else
return false;
}
-static inline bool load_blktrace(struct thread_data *td, const char *fname,
+static inline bool init_blktrace_read(struct thread_data *td, const char *fname,
int need_swap)
{
return false;
}
+static inline bool read_blktrace(struct thread_data* td)
+{
+ return false;
+}
+
+
static inline int merge_blktrace_iologs(struct thread_data *td)
{
return false;
+#include "cairo_text_helpers.h"
+
#include <cairo.h>
#include <gtk/gtk.h>
#include <math.h>
#ifndef CAIRO_TEXT_HELPERS_H
#define CAIRO_TEXT_HELPERS_H
+#include <cairo.h>
+
void draw_centered_text(cairo_t *cr, const char *font, double x, double y,
double fontsize, const char *text);
free(o->profile);
free(o->cgroup);
+ free(o->verify_pattern);
+ free(o->buffer_pattern);
+
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
free(o->bssplit[i]);
free(o->zone_split[i]);
}
}
-void convert_thread_options_to_cpu(struct thread_options *o,
- struct thread_options_pack *top)
+/*
+ * Size in bytes of the packed form of 'o': the fixed
+ * struct thread_options_pack followed by the verify pattern bytes and the
+ * buffer pattern bytes.
+ */
+size_t thread_options_pack_size(struct thread_options *o)
+{
+	size_t total = sizeof(struct thread_options_pack);
+
+	total += o->verify_pattern_bytes;
+	total += o->buffer_pattern_bytes;
+
+	return total;
+}
+
+int convert_thread_options_to_cpu(struct thread_options *o,
+ struct thread_options_pack *top,
+ size_t top_sz)
{
int i, j;
o->serialize_overlap = le32_to_cpu(top->serialize_overlap);
o->size = le64_to_cpu(top->size);
o->io_size = le64_to_cpu(top->io_size);
+ o->num_range = le32_to_cpu(top->num_range);
o->size_percent = le32_to_cpu(top->size_percent);
o->io_size_percent = le32_to_cpu(top->io_size_percent);
o->fill_device = le32_to_cpu(top->fill_device);
o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]);
o->perc_rand[i] = le32_to_cpu(top->perc_rand[i]);
+
+ o->max_latency[i] = le64_to_cpu(top->max_latency[i]);
}
o->ratecycle = le32_to_cpu(top->ratecycle);
o->verify_interval = le32_to_cpu(top->verify_interval);
o->verify_offset = le32_to_cpu(top->verify_offset);
- memcpy(o->verify_pattern, top->verify_pattern, MAX_PATTERN_SIZE);
- memcpy(o->buffer_pattern, top->buffer_pattern, MAX_PATTERN_SIZE);
-
o->verify_pattern_bytes = le32_to_cpu(top->verify_pattern_bytes);
+ o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes);
+ if (o->verify_pattern_bytes >= MAX_PATTERN_SIZE ||
+ o->buffer_pattern_bytes >= MAX_PATTERN_SIZE ||
+ thread_options_pack_size(o) > top_sz)
+ return -EINVAL;
+
+	/*
+	 * Resize each pattern buffer through a temporary: a failed realloc()
+	 * is reported as -ENOMEM instead of being handed to memcpy(), and the
+	 * previous buffer stays owned by 'o'. A zero-length pattern simply
+	 * releases the buffer rather than calling realloc() with size 0.
+	 */
+	if (o->verify_pattern_bytes) {
+		void *vp = realloc(o->verify_pattern, o->verify_pattern_bytes);
+
+		if (!vp)
+			return -ENOMEM;
+		o->verify_pattern = vp;
+		memcpy(o->verify_pattern, top->patterns,
+		       o->verify_pattern_bytes);
+	} else {
+		free(o->verify_pattern);
+		o->verify_pattern = NULL;
+	}
+
+	/* The buffer pattern is stored right after the verify pattern */
+	if (o->buffer_pattern_bytes) {
+		void *bp = realloc(o->buffer_pattern, o->buffer_pattern_bytes);
+
+		if (!bp)
+			return -ENOMEM;
+		o->buffer_pattern = bp;
+		memcpy(o->buffer_pattern,
+		       &top->patterns[o->verify_pattern_bytes],
+		       o->buffer_pattern_bytes);
+	} else {
+		free(o->buffer_pattern);
+		o->buffer_pattern = NULL;
+	}
+
o->verify_fatal = le32_to_cpu(top->verify_fatal);
o->verify_dump = le32_to_cpu(top->verify_dump);
o->verify_async = le32_to_cpu(top->verify_async);
o->do_disk_util = le32_to_cpu(top->do_disk_util);
o->override_sync = le32_to_cpu(top->override_sync);
o->rand_repeatable = le32_to_cpu(top->rand_repeatable);
- o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
o->rand_seed = le64_to_cpu(top->rand_seed);
+ o->log_entries = le32_to_cpu(top->log_entries);
o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
o->log_hist_msec = le32_to_cpu(top->log_hist_msec);
o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness);
o->log_max = le32_to_cpu(top->log_max);
o->log_offset = le32_to_cpu(top->log_offset);
+ o->log_prio = le32_to_cpu(top->log_prio);
o->log_gz = le32_to_cpu(top->log_gz);
o->log_gz_store = le32_to_cpu(top->log_gz_store);
- o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch);
+ o->log_alternate_epoch = le32_to_cpu(top->log_alternate_epoch);
+ o->log_alternate_epoch_clock_id = le32_to_cpu(top->log_alternate_epoch_clock_id);
+ o->job_start_clock_id = le32_to_cpu(top->job_start_clock_id);
o->norandommap = le32_to_cpu(top->norandommap);
o->softrandommap = le32_to_cpu(top->softrandommap);
o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i));
o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i));
o->gauss_dev.u.f = fio_uint64_to_double(le64_to_cpu(top->gauss_dev.u.i));
+ o->random_center.u.f = fio_uint64_to_double(le64_to_cpu(top->random_center.u.i));
o->random_generator = le32_to_cpu(top->random_generator);
o->hugepage_size = le32_to_cpu(top->hugepage_size);
o->rw_min_bs = le64_to_cpu(top->rw_min_bs);
+ o->thinkcycles = le32_to_cpu(top->thinkcycles);
o->thinktime = le32_to_cpu(top->thinktime);
o->thinktime_spin = le32_to_cpu(top->thinktime_spin);
o->thinktime_blocks = le32_to_cpu(top->thinktime_blocks);
+ o->thinktime_blocks_type = le32_to_cpu(top->thinktime_blocks_type);
+ o->thinktime_iotime = le32_to_cpu(top->thinktime_iotime);
o->fsync_blocks = le32_to_cpu(top->fsync_blocks);
o->fdatasync_blocks = le32_to_cpu(top->fdatasync_blocks);
o->barrier_blocks = le32_to_cpu(top->barrier_blocks);
o->ss_ramp_time = le64_to_cpu(top->ss_ramp_time);
o->ss_state = le32_to_cpu(top->ss_state);
o->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(top->ss_limit.u.i));
+ o->ss_check_interval = le64_to_cpu(top->ss_check_interval);
o->zone_range = le64_to_cpu(top->zone_range);
o->zone_size = le64_to_cpu(top->zone_size);
o->zone_capacity = le64_to_cpu(top->zone_capacity);
o->zone_skip = le64_to_cpu(top->zone_skip);
o->zone_mode = le32_to_cpu(top->zone_mode);
+ o->max_open_zones = __le32_to_cpu(top->max_open_zones);
+ o->ignore_zone_limits = le32_to_cpu(top->ignore_zone_limits);
o->lockmem = le64_to_cpu(top->lockmem);
o->offset_increment_percent = le32_to_cpu(top->offset_increment_percent);
o->offset_increment = le64_to_cpu(top->offset_increment);
o->nice = le32_to_cpu(top->nice);
o->ioprio = le32_to_cpu(top->ioprio);
o->ioprio_class = le32_to_cpu(top->ioprio_class);
+ o->ioprio_hint = le32_to_cpu(top->ioprio_hint);
o->file_service_type = le32_to_cpu(top->file_service_type);
o->group_reporting = le32_to_cpu(top->group_reporting);
o->stats = le32_to_cpu(top->stats);
o->zero_buffers = le32_to_cpu(top->zero_buffers);
o->refill_buffers = le32_to_cpu(top->refill_buffers);
o->scramble_buffers = le32_to_cpu(top->scramble_buffers);
- o->buffer_pattern_bytes = le32_to_cpu(top->buffer_pattern_bytes);
o->time_based = le32_to_cpu(top->time_based);
o->disable_lat = le32_to_cpu(top->disable_lat);
o->disable_clat = le32_to_cpu(top->disable_clat);
o->sync_file_range = le32_to_cpu(top->sync_file_range);
o->latency_target = le64_to_cpu(top->latency_target);
o->latency_window = le64_to_cpu(top->latency_window);
- o->max_latency = le64_to_cpu(top->max_latency);
o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i));
o->latency_run = le32_to_cpu(top->latency_run);
o->compress_percentage = le32_to_cpu(top->compress_percentage);
o->compress_chunk = le32_to_cpu(top->compress_chunk);
o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
+ o->dedupe_mode = le32_to_cpu(top->dedupe_mode);
+ o->dedupe_working_set_percentage = le32_to_cpu(top->dedupe_working_set_percentage);
+ o->dedupe_global = le32_to_cpu(top->dedupe_global);
o->block_error_hist = le32_to_cpu(top->block_error_hist);
o->replay_align = le32_to_cpu(top->replay_align);
o->replay_scale = le32_to_cpu(top->replay_scale);
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
o->merge_blktrace_iters[i].u.f = fio_uint64_to_double(le64_to_cpu(top->merge_blktrace_iters[i].u.i));
+
+ o->fdp = le32_to_cpu(top->fdp);
+ o->dp_type = le32_to_cpu(top->dp_type);
+ o->dp_id_select = le32_to_cpu(top->dp_id_select);
+ o->dp_nr_ids = le32_to_cpu(top->dp_nr_ids);
+ for (i = 0; i < o->dp_nr_ids; i++)
+ o->dp_ids[i] = le32_to_cpu(top->dp_ids[i]);
#if 0
uint8_t cpumask[FIO_TOP_STR_MAX];
uint8_t verify_cpumask[FIO_TOP_STR_MAX];
uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
#endif
+
+ return 0;
}
void convert_thread_options_to_net(struct thread_options_pack *top,
top->do_disk_util = cpu_to_le32(o->do_disk_util);
top->override_sync = cpu_to_le32(o->override_sync);
top->rand_repeatable = cpu_to_le32(o->rand_repeatable);
- top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
top->rand_seed = __cpu_to_le64(o->rand_seed);
+ top->log_entries = cpu_to_le32(o->log_entries);
top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
top->log_max = cpu_to_le32(o->log_max);
top->log_offset = cpu_to_le32(o->log_offset);
+ top->log_prio = cpu_to_le32(o->log_prio);
top->log_gz = cpu_to_le32(o->log_gz);
top->log_gz_store = cpu_to_le32(o->log_gz_store);
- top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch);
+ top->log_alternate_epoch = cpu_to_le32(o->log_alternate_epoch);
+ top->log_alternate_epoch_clock_id = cpu_to_le32(o->log_alternate_epoch_clock_id);
+ top->job_start_clock_id = cpu_to_le32(o->job_start_clock_id);
top->norandommap = cpu_to_le32(o->norandommap);
top->softrandommap = cpu_to_le32(o->softrandommap);
top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f));
top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f));
top->gauss_dev.u.i = __cpu_to_le64(fio_double_to_uint64(o->gauss_dev.u.f));
+ top->random_center.u.i = __cpu_to_le64(fio_double_to_uint64(o->random_center.u.f));
top->random_generator = cpu_to_le32(o->random_generator);
top->hugepage_size = cpu_to_le32(o->hugepage_size);
top->rw_min_bs = __cpu_to_le64(o->rw_min_bs);
+ top->thinkcycles = cpu_to_le32(o->thinkcycles);
top->thinktime = cpu_to_le32(o->thinktime);
top->thinktime_spin = cpu_to_le32(o->thinktime_spin);
top->thinktime_blocks = cpu_to_le32(o->thinktime_blocks);
+ top->thinktime_blocks_type = __cpu_to_le32(o->thinktime_blocks_type);
+ top->thinktime_iotime = __cpu_to_le32(o->thinktime_iotime);
top->fsync_blocks = cpu_to_le32(o->fsync_blocks);
top->fdatasync_blocks = cpu_to_le32(o->fdatasync_blocks);
top->barrier_blocks = cpu_to_le32(o->barrier_blocks);
top->nice = cpu_to_le32(o->nice);
top->ioprio = cpu_to_le32(o->ioprio);
top->ioprio_class = cpu_to_le32(o->ioprio_class);
+ top->ioprio_hint = cpu_to_le32(o->ioprio_hint);
top->file_service_type = cpu_to_le32(o->file_service_type);
top->group_reporting = cpu_to_le32(o->group_reporting);
top->stats = cpu_to_le32(o->stats);
top->sync_file_range = cpu_to_le32(o->sync_file_range);
top->latency_target = __cpu_to_le64(o->latency_target);
top->latency_window = __cpu_to_le64(o->latency_window);
- top->max_latency = __cpu_to_le64(o->max_latency);
top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
top->latency_run = __cpu_to_le32(o->latency_run);
top->compress_percentage = cpu_to_le32(o->compress_percentage);
top->compress_chunk = cpu_to_le32(o->compress_chunk);
top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+ top->dedupe_mode = cpu_to_le32(o->dedupe_mode);
+ top->dedupe_working_set_percentage = cpu_to_le32(o->dedupe_working_set_percentage);
+ top->dedupe_global = cpu_to_le32(o->dedupe_global);
top->block_error_hist = cpu_to_le32(o->block_error_hist);
top->replay_align = cpu_to_le32(o->replay_align);
top->replay_scale = cpu_to_le32(o->replay_scale);
top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]);
top->perc_rand[i] = cpu_to_le32(o->perc_rand[i]);
+
+ top->max_latency[i] = __cpu_to_le64(o->max_latency[i]);
}
- memcpy(top->verify_pattern, o->verify_pattern, MAX_PATTERN_SIZE);
- memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE);
+ memcpy(top->patterns, o->verify_pattern, o->verify_pattern_bytes);
+ memcpy(&top->patterns[o->verify_pattern_bytes], o->buffer_pattern,
+ o->buffer_pattern_bytes);
top->size = __cpu_to_le64(o->size);
top->io_size = __cpu_to_le64(o->io_size);
+ top->num_range = __cpu_to_le32(o->num_range);
top->verify_backlog = __cpu_to_le64(o->verify_backlog);
top->start_delay = __cpu_to_le64(o->start_delay);
top->start_delay_high = __cpu_to_le64(o->start_delay_high);
-	top->ss_ramp_time = __cpu_to_le64(top->ss_ramp_time);
-	top->ss_state = cpu_to_le32(top->ss_state);
+	/*
+	 * The steady-state settings have to be taken from the CPU-side
+	 * options (o). Converting top's own fields would only byte-swap
+	 * whatever the pack buffer already contained.
+	 */
+	top->ss_ramp_time = __cpu_to_le64(o->ss_ramp_time);
+	top->ss_state = cpu_to_le32(o->ss_state);
top->ss_limit.u.i = __cpu_to_le64(fio_double_to_uint64(o->ss_limit.u.f));
+	top->ss_check_interval = __cpu_to_le64(o->ss_check_interval);
top->zone_range = __cpu_to_le64(o->zone_range);
top->zone_size = __cpu_to_le64(o->zone_size);
top->zone_capacity = __cpu_to_le64(o->zone_capacity);
top->zone_skip = __cpu_to_le64(o->zone_skip);
top->zone_mode = __cpu_to_le32(o->zone_mode);
+ top->max_open_zones = __cpu_to_le32(o->max_open_zones);
+ top->ignore_zone_limits = cpu_to_le32(o->ignore_zone_limits);
top->lockmem = __cpu_to_le64(o->lockmem);
top->ddir_seq_add = __cpu_to_le64(o->ddir_seq_add);
top->file_size_low = __cpu_to_le64(o->file_size_low);
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
top->merge_blktrace_iters[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->merge_blktrace_iters[i].u.f));
+
+ top->fdp = cpu_to_le32(o->fdp);
+ top->dp_type = cpu_to_le32(o->dp_type);
+ top->dp_id_select = cpu_to_le32(o->dp_id_select);
+ top->dp_nr_ids = cpu_to_le32(o->dp_nr_ids);
+ for (i = 0; i < o->dp_nr_ids; i++)
+ top->dp_ids[i] = cpu_to_le32(o->dp_ids[i]);
#if 0
uint8_t cpumask[FIO_TOP_STR_MAX];
uint8_t verify_cpumask[FIO_TOP_STR_MAX];
uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
#endif
-
}
/*
*/
int fio_test_cconv(struct thread_options *__o)
{
-	struct thread_options o;
-	struct thread_options_pack top1, top2;
-
-	memset(&top1, 0, sizeof(top1));
-	memset(&top2, 0, sizeof(top2));
-
-	convert_thread_options_to_net(&top1, __o);
-	memset(&o, 0, sizeof(o));
-	convert_thread_options_to_cpu(&o, &top1);
-	convert_thread_options_to_net(&top2, &o);
-
-	free_thread_options_to_cpu(&o);
-
-	return memcmp(&top1, &top2, sizeof(top1));
+	/*
+	 * Round-trip test: pack a copy of *__o (o1) into top1, unpack top1
+	 * into o2, pack o2 into top2 and compare the two packs byte for byte.
+	 * Returns 0 when they match, non-zero on a mismatch, a conversion
+	 * error or an allocation failure.
+	 */
+	struct thread_options o1 = *__o, o2;
+	struct thread_options_pack *top1 = NULL, *top2 = NULL;
+	size_t top_sz;
+	int ret = 1;
+
+	/*
+	 * Give o1 its own verify ('V' x 61) and buffer ('B' x 15) patterns so
+	 * the variable-length pattern area of the pack is exercised. Both
+	 * pointers are replaced before the NULL check so the cleanup below
+	 * can never free the buffers owned by *__o.
+	 */
+	o1.verify_pattern_bytes = 61;
+	o1.verify_pattern = malloc(o1.verify_pattern_bytes);
+	o1.buffer_pattern_bytes = 15;
+	o1.buffer_pattern = malloc(o1.buffer_pattern_bytes);
+	if (!o1.verify_pattern || !o1.buffer_pattern)
+		goto out;
+	memset(o1.verify_pattern, 'V', o1.verify_pattern_bytes);
+	memset(o1.buffer_pattern, 'B', o1.buffer_pattern_bytes);
+
+	top_sz = thread_options_pack_size(&o1);
+	top1 = calloc(1, top_sz);
+	top2 = calloc(1, top_sz);
+	if (!top1 || !top2)
+		goto out;
+
+	convert_thread_options_to_net(top1, &o1);
+	memset(&o2, 0, sizeof(o2));
+	ret = convert_thread_options_to_cpu(&o2, top1, top_sz);
+	if (ret)
+		goto out_free_o2;
+
+	convert_thread_options_to_net(top2, &o2);
+	ret = memcmp(top1, top2, top_sz);
+
+out_free_o2:
+	free_thread_options_to_cpu(&o2);
+out:
+	free(top2);
+	free(top1);
+	free(o1.buffer_pattern);
+	free(o1.verify_pattern);
+	return ret;
}
--- /dev/null
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+SCRIPT_DIR=$(dirname "$0")
+# shellcheck disable=SC1091
+. "${SCRIPT_DIR}/common.sh"
+
+# Configure and build fio for the CI target selected by CI_TARGET_BUILD,
+# CI_TARGET_OS and CI_TARGET_ARCH (the latter two are filled in by
+# set_ci_target_os when empty).
+main() {
+    local extra_cflags="-Werror"
+    local configure_flags=()
+
+    set_ci_target_os
+    # CI_TARGET_BUILD is optional; default it so "set -u" does not abort.
+    case "${CI_TARGET_BUILD:-}/${CI_TARGET_OS}" in
+        android*/*)
+            export UNAME=Android
+            if [ -z "${CI_TARGET_ARCH}" ]; then
+                echo "Error: CI_TARGET_ARCH has not been set"
+                return 1
+            fi
+            NDK=$PWD/android-ndk-r24/toolchains/llvm/prebuilt/linux-x86_64/bin
+            export PATH="${NDK}:${PATH}"
+            if [ "${CI_TARGET_BUILD}" = "android" ]; then
+                export LIBS="-landroid"
+            fi
+            # Export CC so that ./configure sees the NDK compiler even when
+            # CC was not already present in the environment.
+            export CC="${NDK}/${CI_TARGET_ARCH}-clang"
+            if [ ! -e "${CC}" ]; then
+                echo "Error: could not find ${CC}"
+                return 1
+            fi
+            ;;
+        */linux)
+            case "${CI_TARGET_ARCH}" in
+                "i686")
+                    extra_cflags="${extra_cflags} -m32"
+                    export LDFLAGS="-m32"
+                    ;;
+                "x86_64")
+                    configure_flags+=(
+                        "--enable-cuda"
+                        "--enable-libiscsi"
+                        "--enable-libnbd"
+                    )
+                    ;;
+            esac
+            ;;
+        */windows)
+            configure_flags+=("--disable-native")
+            case "${CI_TARGET_ARCH}" in
+                "i686")
+                    configure_flags+=("--build-32bit-win")
+                    ;;
+                "x86_64")
+                    ;;
+            esac
+            if [ "${CI_TARGET_BUILD:-}" = "windows-msys2-64" ]; then
+                configure_flags+=("--disable-tls")
+            fi
+            ;;
+    esac
+    configure_flags+=(--extra-cflags="${extra_cflags}")
+
+    ./configure "${configure_flags[@]}"
+    # macOS does not have nproc, so we have to use sysctl to obtain the
+    # number of logical CPUs.
+    make -j "$(nproc 2>/dev/null || sysctl -n hw.logicalcpu)"
+}
+
+main
--- /dev/null
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+# Run the long-running Python test suite and build the HTML documentation.
+# Nothing is run for android* builds.
+main() {
+    local test_args=(--skip 6 1007 1008 --debug)
+
+    # This script does not call set_ci_target_os, so default the CI_*
+    # variables to keep "set -u" from aborting when they are unset.
+    case "${CI_TARGET_BUILD:-}" in
+        android*)
+            return 0;;
+    esac
+
+    echo "Running long running tests..."
+    export PYTHONUNBUFFERED="TRUE"
+    if [[ "${CI_TARGET_ARCH:-}" == "arm64" ]]; then
+        # Extra per-test argument for test 1010 on arm64.
+        test_args+=(-p 1010:"--skip 15 16 17 18 19 20")
+    fi
+    python3 t/run-fio-tests.py "${test_args[@]}"
+    make -C doc html
+}
+
+main
--- /dev/null
+#!/bin/bash
+# Download the librpma ${LIBRPMA_VERSION} sources, build them and install
+# the library under /usr. The downloaded archive and the build tree are
+# removed afterwards.
+#
+# "set -e" is used instead of a shebang flag so that it also applies when
+# the script is started as "bash <script>".
+set -e
+
+LIBRPMA_VERSION="1.0.0"
+ZIP_FILE=rpma.zip
+
+WORKDIR=$(pwd)
+
+# install librpma
+wget -O "$ZIP_FILE" "https://github.com/pmem/rpma/archive/${LIBRPMA_VERSION}.zip"
+unzip "$ZIP_FILE"
+mkdir -p "rpma-${LIBRPMA_VERSION}/build"
+cd "rpma-${LIBRPMA_VERSION}/build"
+cmake .. -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_INSTALL_PREFIX=/usr \
+    -DBUILD_DOC=OFF \
+    -DBUILD_EXAMPLES=OFF \
+    -DBUILD_TESTS=OFF
+make -j"$(nproc)"
+sudo make -j"$(nproc)" install
+cd "$WORKDIR"
+rm -rf "$ZIP_FILE" "rpma-${LIBRPMA_VERSION}"
--- /dev/null
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+SCRIPT_DIR=$(dirname "$0")
+# shellcheck disable=SC1091
+. "${SCRIPT_DIR}/common.sh"
+
+# Install the Ubuntu packages needed to build and test fio for
+# CI_TARGET_ARCH (i686 or x86_64 add architecture-specific packages).
+# On x86_64, librpma is additionally built from source.
+install_ubuntu() {
+    local pkgs
+
+    # Drop-in dpkg configuration written before any package is installed.
+    cat <<DPKGCFG | sudo tee /etc/dpkg/dpkg.cfg.d/dpkg-speedup > /dev/null
+# Skip fsync
+force-unsafe-io
+# Don't install documentation
+path-exclude=/usr/share/man/*
+path-exclude=/usr/share/locale/*/LC_MESSAGES/*.mo
+path-exclude=/usr/share/doc/*
+DPKGCFG
+    # Packages available on i686 and x86_64
+    pkgs=(
+        libaio-dev
+        libcunit1-dev
+        libcurl4-openssl-dev
+        libfl-dev
+        libnuma-dev
+        libnfs-dev
+        valgrind
+    )
+    case "${CI_TARGET_ARCH}" in
+        "i686")
+            sudo dpkg --add-architecture i386
+            # Append ":i386" to every package collected so far.
+            pkgs=("${pkgs[@]/%/:i386}")
+            pkgs+=(
+                gcc-multilib
+                pkg-config:i386
+                zlib1g-dev:i386
+                libc6:i386
+                libgcc-s1:i386
+            )
+            ;;
+        "x86_64")
+            pkgs+=(
+                libglusterfs-dev
+                libgoogle-perftools-dev
+                libiscsi-dev
+                libnbd-dev
+                libpmem-dev
+                libpmem2-dev
+                libprotobuf-c-dev
+                librbd-dev
+                libtcmalloc-minimal4
+                nvidia-cuda-dev
+                libibverbs-dev
+                librdmacm-dev
+            )
+            echo "Removing libunwind-14-dev because of conflicts with libunwind-dev"
+            sudo apt remove -y libunwind-14-dev
+            ;;
+    esac
+
+    # Architecture-independent packages and packages for which we don't
+    # care about the architecture.
+    pkgs+=(
+        python3-scipy
+        python3-sphinx
+        python3-statsmodels
+    )
+
+    echo "Updating APT..."
+    sudo apt-get -qq update
+    # [*] joins the array into one word inside the string; the printed
+    # text is the same as before.
+    echo "Installing packages... ${pkgs[*]}"
+    sudo apt-get install -o APT::Immediate-Configure=false --no-install-recommends -qq -y "${pkgs[@]}"
+    if [ "${CI_TARGET_ARCH}" == "x86_64" ]; then
+        # install librpma from sources
+        ci/actions-install-librpma.sh
+    fi
+}
+
+# Linux targets are handled by the Ubuntu installer.
+install_linux() {
+ install_ubuntu
+}
+
+# Install the macOS dependencies: cunit and libnfs through Homebrew (with
+# Homebrew's auto-update turned off) and the Python packages through pip3.
+install_macos() {
+ # Assumes homebrew and python3 are already installed
+ #echo "Updating homebrew..."
+ #brew update >/dev/null 2>&1
+ echo "Installing packages..."
+ HOMEBREW_NO_AUTO_UPDATE=1 brew install cunit libnfs
+ pip3 install scipy six statsmodels sphinx
+}
+
+# Windows targets only install the Python packages through pip3.
+install_windows() {
+ pip3 install scipy six statsmodels sphinx
+}
+
+# Install the dependencies for the current CI target. android* builds only
+# download and unpack the Android NDK; every other target dispatches to the
+# install_<CI_TARGET_OS> function defined in this file.
+main() {
+    local install_function
+
+    # CI_TARGET_BUILD is optional; default it so "set -u" does not abort.
+    case "${CI_TARGET_BUILD:-}" in
+        android*)
+            echo "Installing Android NDK..."
+            wget --quiet https://dl.google.com/android/repository/android-ndk-r24-linux.zip
+            unzip -q android-ndk-r24-linux.zip
+            return 0
+            ;;
+    esac
+
+    set_ci_target_os
+
+    install_function="install_${CI_TARGET_OS}"
+    # Fail with a clear message instead of "command not found" when the OS
+    # could not be detected or has no installer here.
+    if ! declare -F "${install_function}" > /dev/null; then
+        echo "Error: no install function for CI_TARGET_OS='${CI_TARGET_OS}'" >&2
+        return 1
+    fi
+    ${install_function}
+
+    echo "Python3 path: $(type -p python3 2>&1)"
+    echo "Python3 version: $(python3 -V 2>&1)"
+}
+
+main
--- /dev/null
+#!/bin/bash
+# This script expects to be invoked from the base fio directory.
+set -eu
+
+# Run the quick "make test" smoke tests. Nothing is run for android* builds.
+main() {
+    # CI_TARGET_BUILD is optional; default it so "set -u" does not abort.
+    case "${CI_TARGET_BUILD:-}" in
+        android*)
+            return 0;;
+    esac
+
+    echo "Running smoke tests..."
+    make test
+}
+
+main
+++ /dev/null
-#!/bin/bash
-# The PATH to appropriate distro commands must already be set before invoking
-# this script
-# The following environment variables must be set:
-# PLATFORM={i686,x64}
-# DISTRO={cygwin,msys2}
-# The following environment can optionally be set:
-# CYG_MIRROR=<URL>
-set -eu
-
-case "${ARCHITECTURE}" in
- "x64")
- PACKAGE_ARCH="x86_64"
- ;;
- "x86")
- PACKAGE_ARCH="i686"
- ;;
-esac
-
-echo "Installing packages..."
-case "${DISTRO}" in
- "cygwin")
- CYG_MIRROR=${CYG_MIRROR:-"http://cygwin.mirror.constant.com"}
- setup-x86_64.exe --quiet-mode --no-shortcuts --only-site \
- --site "${CYG_MIRROR}" --packages \
- "mingw64-${PACKAGE_ARCH}-CUnit,mingw64-${PACKAGE_ARCH}-zlib"
- ;;
- "msys2")
- #pacman --noconfirm -Syuu # MSYS2 core update
- #pacman --noconfirm -Syuu # MSYS2 normal update
- pacman.exe --noconfirm -S \
- mingw-w64-${PACKAGE_ARCH}-clang \
- mingw-w64-${PACKAGE_ARCH}-cunit \
- mingw-w64-${PACKAGE_ARCH}-lld
- ;;
-esac
-
-python.exe -m pip install scipy six
-
-echo "Python3 path: $(type -p python3 2>&1)"
-echo "Python3 version: $(python3 -V 2>&1)"
--- /dev/null
+# shellcheck shell=bash
+
+function set_ci_target_os {
+ # Sets CI_TARGET_OS (derived from bash's OSTYPE) and CI_TARGET_ARCH
+ # (from "uname -m") unless they already hold a non-empty value.
+ # CI_TARGET_OS is left empty when OSTYPE matches none of the patterns.
+ # The variables are only assigned here; this function does not itself
+ # run "export".
+
+ # Don't override CI_TARGET_OS if already set
+ CI_TARGET_OS=${CI_TARGET_OS:-}
+ if [[ -z ${CI_TARGET_OS} ]]; then
+ # Detect operating system
+ case "${OSTYPE}" in
+ linux*)
+ CI_TARGET_OS="linux"
+ ;;
+ darwin*)
+ CI_TARGET_OS="macos"
+ ;;
+ cygwin|msys*)
+ CI_TARGET_OS="windows"
+ ;;
+ bsd*)
+ CI_TARGET_OS="bsd"
+ ;;
+ *)
+ CI_TARGET_OS=""
+ esac
+ fi
+
+ # Don't override CI_TARGET_ARCH if already set
+ CI_TARGET_ARCH=${CI_TARGET_ARCH:-}
+ if [[ -z ${CI_TARGET_ARCH} ]]; then
+ CI_TARGET_ARCH="$(uname -m)"
+ fi
+}
+++ /dev/null
-#!/bin/bash
-set -eu
-
-CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
-EXTRA_CFLAGS="-Werror"
-export PYTHONUNBUFFERED=TRUE
-CONFIGURE_FLAGS=()
-
-case "$TRAVIS_OS_NAME" in
- "linux")
- CONFIGURE_FLAGS+=(--enable-libiscsi)
- case "$CI_TARGET_ARCH" in
- "x86")
- EXTRA_CFLAGS="${EXTRA_CFLAGS} -m32"
- export LDFLAGS="-m32"
- ;;
- "amd64")
- CONFIGURE_FLAGS+=(--enable-cuda)
- ;;
- esac
- ;;
-esac
-CONFIGURE_FLAGS+=(--extra-cflags="${EXTRA_CFLAGS}")
-
-./configure "${CONFIGURE_FLAGS[@]}" &&
- make &&
- make test &&
- if [[ "$CI_TARGET_ARCH" == "arm64" ]]; then
- sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug -p 1010:"--skip 15 16 17 18 19 20"
- else
- sudo python3 t/run-fio-tests.py --skip 6 1007 1008 --debug
- fi
+++ /dev/null
-#!/bin/bash
-set -eu
-
-CI_TARGET_ARCH="${BUILD_ARCH:-$TRAVIS_CPU_ARCH}"
-case "$TRAVIS_OS_NAME" in
- "linux")
- # Architecture-dependent packages.
- pkgs=(
- libaio-dev
- libcunit1-dev
- libfl-dev
- libgoogle-perftools-dev
- libibverbs-dev
- libiscsi-dev
- libnuma-dev
- librbd-dev
- librdmacm-dev
- libz-dev
- )
- case "$CI_TARGET_ARCH" in
- "x86")
- pkgs=("${pkgs[@]/%/:i386}")
- pkgs+=(
- gcc-multilib
- pkg-config:i386
- )
- ;;
- "amd64")
- pkgs+=(nvidia-cuda-dev)
- ;;
- esac
- if [[ $CI_TARGET_ARCH != "x86" ]]; then
- pkgs+=(glusterfs-common)
- fi
- # Architecture-independent packages and packages for which we don't
- # care about the architecture.
- pkgs+=(
- bison
- flex
- python3
- python3-scipy
- python3-six
- )
- sudo apt-get -qq update
- sudo apt-get install --no-install-recommends -qq -y "${pkgs[@]}"
- ;;
- "osx")
- brew update >/dev/null 2>&1
- brew install cunit
- pip3 install scipy six
- ;;
-esac
-
-echo "Python3 path: $(type -p python3 2>&1)"
-echo "Python3 version: $(python3 -V 2>&1)"
static void convert_text(struct fio_net_cmd *cmd);
static void client_display_thread_status(struct jobs_eta *je);
-struct client_ops fio_client_ops = {
+struct client_ops const fio_client_ops = {
.text = handle_text,
.disk_util = handle_du,
.thread_status = handle_ts,
static void fio_drain_client_text(struct fio_client *client)
{
do {
- struct fio_net_cmd *cmd;
+ struct fio_net_cmd *cmd = NULL;
- cmd = fio_net_recv_cmd(client->fd, false);
+ if (fio_server_poll_fd(client->fd, POLLIN, 0))
+ cmd = fio_net_recv_cmd(client->fd, false);
if (!cmd)
break;
{
struct fio_client *client;
- client = malloc(sizeof(*client));
- memset(client, 0, sizeof(*client));
+ client = calloc(1, sizeof(*client));
INIT_FLIST_HEAD(&client->list);
INIT_FLIST_HEAD(&client->hash_list);
return 0;
}
-int fio_client_add(struct client_ops *ops, const char *hostname, void **cookie)
+int fio_client_add(struct client_ops const *ops, const char *hostname, void **cookie)
{
struct fio_client *existing = *cookie;
struct fio_client *client;
dprint(FD_NET, "send remote ini %s to %s\n", filename, client->hostname);
p_size = sizeof(*pdu) + strlen(filename) + 1;
- pdu = malloc(p_size);
- memset(pdu, 0, p_size);
+ pdu = calloc(1, p_size);
pdu->name_len = strlen(filename);
strcpy((char *) pdu->file, filename);
pdu->client_type = cpu_to_le16((uint16_t) client->type);
int fio_client_update_options(struct fio_client *client,
			      struct thread_options *o, uint64_t *tag)
{
-	struct cmd_add_job_pdu pdu;
+	/*
+	 * Send the options in o to the client as a FIO_NET_CMD_UPDATE_JOB
+	 * command. The PDU is sized for the packed options, so it is built
+	 * on the heap. Returns the result of fio_net_send_cmd(), or 1 when
+	 * the PDU cannot be allocated.
+	 */
+	size_t cmd_sz = offsetof(struct cmd_add_job_pdu, top) +
+		thread_options_pack_size(o);
+	struct cmd_add_job_pdu *pdu;
+	int ret;
-	pdu.thread_number = cpu_to_le32(client->thread_number);
-	pdu.groupid = cpu_to_le32(client->groupid);
-	convert_thread_options_to_net(&pdu.top, o);
+	/*
+	 * Zeroed allocation: any byte the conversion does not write goes out
+	 * as zero instead of as stale heap contents.
+	 */
+	pdu = calloc(1, cmd_sz);
+	if (!pdu)
+		return 1;
+	pdu->thread_number = cpu_to_le32(client->thread_number);
+	pdu->groupid = cpu_to_le32(client->groupid);
+	convert_thread_options_to_net(&pdu->top, o);
-	return fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, &pdu, sizeof(pdu), tag, &client->cmd_list);
+	ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_UPDATE_JOB, pdu,
+				cmd_sz, tag, &client->cmd_list);
+	free(pdu);
+	return ret;
}
static void convert_io_stat(struct io_stat *dst, struct io_stat *src)
dst->error = le32_to_cpu(src->error);
dst->thread_number = le32_to_cpu(src->thread_number);
dst->groupid = le32_to_cpu(src->groupid);
+ dst->job_start = le64_to_cpu(src->job_start);
dst->pid = le32_to_cpu(src->pid);
dst->members = le32_to_cpu(src->members);
dst->unified_rw_rep = le32_to_cpu(src->unified_rw_rep);
+ dst->ioprio = le32_to_cpu(src->ioprio);
+ dst->disable_prio_stat = le32_to_cpu(src->disable_prio_stat);
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
convert_io_stat(&dst->clat_stat[i], &src->clat_stat[i]);
dst->nr_block_infos = le64_to_cpu(src->nr_block_infos);
for (i = 0; i < dst->nr_block_infos; i++)
dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
- for (i = 0; i < DDIR_RWDIR_CNT; i++) {
- for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
- dst->io_u_plat_high_prio[i][j] = le64_to_cpu(src->io_u_plat_high_prio[i][j]);
- dst->io_u_plat_low_prio[i][j] = le64_to_cpu(src->io_u_plat_low_prio[i][j]);
- }
- convert_io_stat(&dst->clat_high_prio_stat[i], &src->clat_high_prio_stat[i]);
- convert_io_stat(&dst->clat_low_prio_stat[i], &src->clat_low_prio_stat[i]);
- }
dst->ss_dur = le64_to_cpu(src->ss_dur);
dst->ss_state = le32_to_cpu(src->ss_state);
dst->ss_deviation.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i));
dst->ss_criterion.u.f = fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i));
+	/*
+	 * Convert the per-priority completion latency stats: for every data
+	 * direction, nr_clat_prio[i] entries, each holding a latency
+	 * histogram, a clat io_stat and the ioprio value it belongs to.
+	 */
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		dst->nr_clat_prio[i] = le32_to_cpu(src->nr_clat_prio[i]);
+		for (j = 0; j < dst->nr_clat_prio[i]; j++) {
+			for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+				dst->clat_prio[i][j].io_u_plat[k] =
+					le64_to_cpu(src->clat_prio[i][j].io_u_plat[k]);
+			convert_io_stat(&dst->clat_prio[i][j].clat_stat,
+					&src->clat_prio[i][j].clat_stat);
+			/* read from src like every other field, not from dst */
+			dst->clat_prio[i][j].ioprio =
+				le32_to_cpu(src->clat_prio[i][j].ioprio);
+		}
+	}
+
if (dst->ss_state & FIO_SS_DATA) {
for (i = 0; i < dst->ss_dur; i++ ) {
dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]);
if (sum_stat_clients <= 1)
return;
- sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+ sum_thread_stats(&client_ts, &p->ts);
sum_group_stats(&client_gs, &p->rs);
client_ts.members++;
if (store_direct) {
ssize_t wrote;
size_t sz;
- int fd;
+ int fd, flags;
- fd = open((const char *) log_pathname,
- O_WRONLY | O_CREAT | O_TRUNC, 0644);
+ if (pdu->per_job_logs)
+ flags = O_WRONLY | O_CREAT | O_TRUNC;
+ else
+ flags = O_WRONLY | O_CREAT | O_APPEND;
+ fd = open((const char *) log_pathname, flags, 0644);
if (fd < 0) {
log_err("fio: open log %s: %s\n",
log_pathname, strerror(errno));
ret = 0;
} else {
FILE *f;
- f = fopen((const char *) log_pathname, "w");
+ const char *mode;
+
+ if (pdu->per_job_logs)
+ mode = "w";
+ else
+ mode = "a";
+ f = fopen((const char *) log_pathname, mode);
if (!f) {
log_err("fio: fopen log %s : %s\n",
log_pathname, strerror(errno));
ret->log_type = le32_to_cpu(ret->log_type);
ret->compressed = le32_to_cpu(ret->compressed);
ret->log_offset = le32_to_cpu(ret->log_offset);
+ ret->log_prio = le32_to_cpu(ret->log_prio);
ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness);
+ ret->per_job_logs = le32_to_cpu(ret->per_job_logs);
if (*store_direct)
return ret;
s = (struct io_sample *)((char *)s + sizeof(struct io_u_plat_entry) * i);
s->time = le64_to_cpu(s->time);
- s->data.val = le64_to_cpu(s->data.val);
+ if (ret->log_type != IO_LOG_TYPE_HIST) {
+ s->data.val.val0 = le64_to_cpu(s->data.val.val0);
+ s->data.val.val1 = le64_to_cpu(s->data.val.val1);
+ }
s->__ddir = __le32_to_cpu(s->__ddir);
s->bs = le64_to_cpu(s->bs);
+ s->priority = le16_to_cpu(s->priority);
if (ret->log_offset) {
struct io_sample_offset *so = (void *) s;
int fio_handle_client(struct fio_client *client)
{
- struct client_ops *ops = client->ops;
+ struct client_ops const *ops = client->ops;
struct fio_net_cmd *cmd;
- int size;
dprint(FD_NET, "client: handle %s\n", client->hostname);
}
case FIO_NET_CMD_TS: {
struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
+ uint64_t offset;
+ int i;
+
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ if (le32_to_cpu(p->ts.nr_clat_prio[i])) {
+ offset = le64_to_cpu(p->ts.clat_prio_offset[i]);
+ p->ts.clat_prio[i] =
+ (struct clat_prio_stat *)((char *)p + offset);
+ }
+ }
dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state));
if (le32_to_cpu(p->ts.ss_state) & FIO_SS_DATA) {
dprint(FD_NET, "client: received steadystate ring buffers\n");
- size = le64_to_cpu(p->ts.ss_dur);
- p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1);
- p->ts.ss_bw_data = p->ts.ss_iops_data + size;
+ offset = le64_to_cpu(p->ts.ss_iops_data_offset);
+ p->ts.ss_iops_data = (uint64_t *)((char *)p + offset);
+
+ offset = le64_to_cpu(p->ts.ss_bw_data_offset);
+ p->ts.ss_bw_data = (uint64_t *)((char *)p + offset);
}
convert_ts(&p->ts, &p->ts);
return 0;
}
-static void request_client_etas(struct client_ops *ops)
+static void request_client_etas(struct client_ops const *ops)
{
struct fio_client *client;
struct flist_head *entry;
return ret;
}
-int fio_handle_clients(struct client_ops *ops)
+int fio_handle_clients(struct client_ops const *ops)
{
struct pollfd *pfds;
int i, ret = 0, retval = 0;
fio_client_json_fini();
+ free_clat_prio_stats(&client_ts);
free(pfds);
return retval || error_clients;
}
uint16_t argc;
char **argv;
- struct client_ops *ops;
+ struct client_ops const *ops;
void *client_data;
struct client_file *files;
typedef void (client_timed_out_op)(struct fio_client *);
typedef void (client_jobs_eta_op)(struct fio_client *client, struct jobs_eta *je);
-extern struct client_ops fio_client_ops;
+extern struct client_ops const fio_client_ops;
struct client_ops {
client_cmd_op *text;
extern int fio_start_all_clients(void);
extern int fio_clients_send_ini(const char *);
extern int fio_client_send_ini(struct fio_client *, const char *, bool);
-extern int fio_handle_clients(struct client_ops *);
-extern int fio_client_add(struct client_ops *, const char *, void **);
+extern int fio_handle_clients(struct client_ops const*);
+extern int fio_client_add(struct client_ops const*, const char *, void **);
extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int);
extern void fio_client_add_cmd_option(void *, const char *);
extern int fio_client_add_ini_file(void *, const char *, bool);
#endif
#ifdef FIO_INTERNAL
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-#define FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f))
+#define FIO_ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
+#define FIO_FIELD_SIZE(s, f) (sizeof(((__typeof__(s))0)->f))
#endif
#ifndef __has_attribute
+#define __has_attribute(x) __GCC4_has_attribute_##x
#define __GCC4_has_attribute___fallthrough__ 0
#endif
#if __has_attribute(__fallthrough__)
-#define fallthrough __attribute__((__fallthrough__))
+#define fio_fallthrough __attribute__((__fallthrough__))
#else
-#define fallthrough do {} while (0) /* fallthrough */
+#define fio_fallthrough do {} while (0) /* fallthrough */
#endif
#endif
# Default CFLAGS
CFLAGS="-D_GNU_SOURCE -include config-host.h $CFLAGS"
+CONFIGURE_CFLAGS="-Werror-implicit-function-declaration"
BUILD_CFLAGS=""
# Print a helpful header at the top of config.log
}
+# Compile $TMPC into the object file $TMPO (-c, no link step) with $CFLAGS
+# plus the configure-only flags kept in $CONFIGURE_CFLAGS.
compile_object() {
- do_cc $CFLAGS -Werror-implicit-function-declaration -c -o $TMPO $TMPC
+ do_cc $CFLAGS $CONFIGURE_CFLAGS -c -o $TMPO $TMPC
}
+# Compile and link $TMPC into $TMPE.
+# $1: extra compiler flags
+# $2: extra linker flags ($LIBS is appended to them)
+# $3: test-case name, written to config.log
compile_prog() {
local_cflags="$1"
local_ldflags="$2 $LIBS"
echo "Compiling test case $3" >> config.log
- do_cc $CFLAGS -Werror-implicit-function-declaration $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
+ do_cc $CFLAGS $CONFIGURE_CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags
}
feature_not_found() {
type "$1" >/dev/null 2>&1
}
+# Succeeds (exit status 0) when $1 consists only of decimal digits.
+num() {
+ echo "$1" | grep -E -q "^[0-9]+$"
+}
+
check_define() {
cat > $TMPC <<EOF
#if !defined($1)
compile_object
}
+# Succeeds when the preprocessor evaluates "$1 == $2" as true. The test
+# file written to $TMPC hits #error otherwise, which makes compile_object
+# fail.
+check_val() {
+ cat > $TMPC <<EOF
+#if $1 == $2
+int main(void)
+{
+ return 0;
+}
+#else
+#error $1 is not equal $2
+#endif
+EOF
+ compile_object
+}
+
output_sym() {
echo "$1=y" >> $config_host_mak
echo "#define $1" >> $config_host_h
check_min_lib_version() {
_feature=$3
- if "${cross_prefix}"pkg-config --atleast-version="$2" "$1" > /dev/null 2>&1; then
+ if pkg-config --atleast-version="$2" "$1" > /dev/null 2>&1; then
return 0
fi
: "${_feature:=${1}}"
- if "${cross_prefix}"pkg-config --version > /dev/null 2>&1; then
- if eval "echo \$$_feature" = "yes" ; then
+ if pkg-config --version > /dev/null 2>&1; then
+ if test "$(eval echo \"\$$_feature\")" = "yes" ; then
feature_not_found "$_feature" "$1 >= $2"
fi
else
exit_val=0
gfio_check="no"
libhdfs="no"
-pmemblk="no"
devdax="no"
pmem="no"
cuda="no"
+libcufile="no"
disable_lex=""
disable_pmem="no"
disable_native="no"
march_set="no"
libiscsi="no"
libnbd="no"
-libaio_uring="no"
+libnfs=""
+xnvme=""
+isal=""
+libblkio=""
libzbc=""
+dfs=""
+seed_buckets=""
dynamic_engines="no"
prefix=/usr/local
;;
--enable-cuda) cuda="yes"
;;
+ --enable-libcufile) libcufile="yes"
+ ;;
--disable-native) disable_native="yes"
;;
--with-ime=*) ime_path="$optarg"
;;
--disable-libzbc) libzbc="no"
;;
+ --disable-xnvme) xnvme="no"
+ ;;
+ --disable-isal) isal="no"
+ ;;
+ --disable-libblkio) libblkio="no"
+ ;;
--disable-tcmalloc) disable_tcmalloc="yes"
;;
- --enable-libaio-uring) libaio_uring="yes"
+ --disable-libnfs) libnfs="no"
+ ;;
+ --enable-libnfs) libnfs="yes"
;;
--dynamic-libengines) dynamic_engines="yes"
;;
+ --disable-dfs) dfs="no"
+ ;;
+ --enable-asan) asan="yes"
+ ;;
+ --seed-buckets=*) seed_buckets="$optarg"
+ ;;
+ --disable-tls) tls_check="no"
+ ;;
--help)
show_help="yes"
;;
echo "--cc= Specify compiler to use"
echo "--extra-cflags= Specify extra CFLAGS to pass to compiler"
echo "--build-32bit-win Enable 32-bit build on Windows"
- echo "--target-win-ver= Minimum version of Windows to target (XP or 7)"
+ echo "--target-win-ver= Minimum version of Windows to target (only accepts 7)"
echo "--enable-pdb Enable Windows PDB symbols generation (needs clang/lld)"
echo "--build-static Build a static fio"
echo "--esx Configure build options for esx"
echo "--disable-http Disable HTTP support even if found"
echo "--disable-gfapi Disable gfapi"
echo "--enable-libhdfs Enable hdfs support"
+ echo "--enable-libnfs Enable nfs support"
+ echo "--disable-libnfs Disable nfs support"
echo "--disable-lex Disable use of lex/yacc for math"
echo "--disable-pmem Disable pmem based engines even if found"
echo "--enable-lex Enable use of lex/yacc for math"
echo "--disable-shm Disable SHM support"
echo "--disable-optimizations Don't enable compiler optimizations"
echo "--enable-cuda Enable GPUDirect RDMA support"
+ echo "--enable-libcufile Enable GPUDirect Storage cuFile support"
echo "--disable-native Don't build for native host"
echo "--with-ime= Install path for DDN's Infinite Memory Engine"
echo "--enable-libiscsi Enable iscsi support"
echo "--enable-libnbd Enable libnbd (NBD engine) support"
+ echo "--disable-xnvme Disable xnvme support even if found"
+ echo "--disable-isal Disable isal support even if found"
+ echo "--disable-libblkio Disable libblkio support even if found"
echo "--disable-libzbc Disable libzbc even if found"
- echo "--disable-tcmalloc Disable tcmalloc support"
- echo "--enable-libaio-uring Enable libaio emulated over io_uring"
- echo "--dynamic-libengines Lib-based ioengines as dynamic libraries"
+ echo "--disable-tcmalloc Disable tcmalloc support"
+ echo "--dynamic-libengines Lib-based ioengines as dynamic libraries"
+ echo "--disable-dfs Disable DAOS File System support even if found"
+ echo "--enable-asan Enable address sanitizer"
+ echo "--seed-buckets= Number of seed buckets for the refill-buffer"
+ echo "--disable-tls Disable __thread local storage"
exit $exit_val
fi
cross_prefix=${cross_prefix-${CROSS_COMPILE}}
-# Preferred compiler (can be overriden later after we know the platform):
+# Preferred compiler (can be overridden later after we know the platform):
# ${CC} (if set)
# ${cross_prefix}gcc (if cross-prefix specified)
# gcc if available
if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then
cpu="x86_64"
fi
- # Error at compile time linking of weak/partial symbols if possible...
+ # Avoid configure feature detection of features provided by weak symbols
cat > $TMPC <<EOF
int main(void)
{
return 0;
}
EOF
- if compile_prog "" "-Wl,-no_weak_imports" "disable weak symbols"; then
- echo "Disabling weak symbols"
- LDFLAGS="$LDFLAGS -Wl,-no_weak_imports"
+ if compile_prog "" "-Werror=partial-availability" "error on weak symbols"; then
+ CONFIGURE_CFLAGS="$CONFIGURE_CFLAGS -Werror=partial-availability"
fi
;;
SunOS)
# Default Windows API target
target_win_ver="7"
fi
- if test "$target_win_ver" = "XP"; then
- output_sym "CONFIG_WINDOWS_XP"
- # Technically the below is targeting 2003
- CFLAGS="$CFLAGS -D_WIN32_WINNT=0x0502"
- elif test "$target_win_ver" = "7"; then
+ if test "$target_win_ver" = "7"; then
output_sym "CONFIG_WINDOWS_7"
CFLAGS="$CFLAGS -D_WIN32_WINNT=0x0601"
else
clock_gettime="yes" # clock_monotonic probe has dependency on this
clock_monotonic="yes"
sched_idle="yes"
+ pthread_condattr_setclock="no"
+ pthread_affinity="no"
;;
esac
cpu="aarch64"
elif check_define __hppa__ ; then
cpu="hppa"
+elif check_define __loongarch64 ; then
+ cpu="loongarch64"
+elif check_define __riscv ; then
+ if check_val __riscv_xlen 32 ; then
+ cpu="riscv32"
+ elif check_val __riscv_xlen 64 ; then
+ cpu="riscv64"
+ elif check_val __riscv_xlen 128 ; then
+ cpu="riscv128"
+ fi
else
cpu=`uname -m`
fi
# Normalise host CPU name and set ARCH.
case "$cpu" in
- ia64|ppc|ppc64|s390|s390x|sparc64)
+ ia64|ppc|ppc64|s390|s390x|sparc64|loongarch64|riscv64)
cpu="$cpu"
;;
i386|i486|i586|i686|i86pc|BePC)
fi
print_config "zlib" "$zlib"
+##########################################
+# fcntl(F_FULLFSYNC) support
+if test "$fcntl_sync" != "yes" ; then
+ fcntl_sync="no"
+fi
+cat > $TMPC << EOF
+#include <unistd.h>
+#include <fcntl.h>
+
+int main(int argc, char **argv)
+{
+ return fcntl(0, F_FULLFSYNC);
+}
+EOF
+if compile_prog "" "" "fcntl(F_FULLFSYNC)" ; then
+ fcntl_sync="yes"
+fi
+print_config "fcntl(F_FULLFSYNC)" "$fcntl_sync"
+
##########################################
# linux-aio probe
if test "$libaio" != "yes" ; then
return 0;
}
EOF
- if test "$libaio_uring" = "yes"; then
- if compile_prog "" "-luring" "libaio io_uring" ; then
- libaio=yes
- LIBS="-luring $LIBS"
- else
- feature_not_found "libaio io_uring" ""
- fi
- elif compile_prog "" "-laio" "libaio" ; then
+ if compile_prog "" "-laio" "libaio" ; then
libaio=yes
- libaio_uring=no
else
if test "$libaio" = "yes" ; then
feature_not_found "linux AIO" "libaio-dev or libaio-devel"
fi
libaio=no
- libaio_uring=no
fi
cat > $TMPC <<EOF
fi
print_config "Linux AIO support" "$libaio"
print_config "Linux AIO support rw flags" "$libaio_rw_flags"
-print_config "Linux AIO over io_uring" "$libaio_uring"
##########################################
# posix aio probe
##########################################
# POSIX pthread_condattr_setclock() probe
-if test "$pthread_condattr_setclock" != "yes" ; then
- pthread_condattr_setclock="no"
-fi
-cat > $TMPC <<EOF
+if test "$pthread_condattr_setclock" != "no" ; then
+ cat > $TMPC <<EOF
#include <pthread.h>
int main(void)
{
return 0;
}
EOF
-if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
- pthread_condattr_setclock=yes
-elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
- pthread_condattr_setclock=yes
- LIBS="$LIBS -lpthread"
+ if compile_prog "" "$LIBS" "pthread_condattr_setclock" ; then
+ pthread_condattr_setclock=yes
+ elif compile_prog "" "$LIBS -lpthread" "pthread_condattr_setclock" ; then
+ pthread_condattr_setclock=yes
+ LIBS="$LIBS -lpthread"
+ fi
fi
print_config "pthread_condattr_setclock()" "$pthread_condattr_setclock"
#include <signal.h> /* pthread_sigmask() */
int main(void)
{
- return pthread_sigmask(0, NULL, NULL);
+ sigset_t sigmask;
+ return pthread_sigmask(0, NULL, &sigmask);
}
EOF
if compile_prog "" "$LIBS" "pthread_sigmask" ; then
fi
print_config "pthread_sigmask()" "$pthread_sigmask"
+##########################################
+# pthread_getaffinity_np() probe
+if test "$pthread_getaffinity" != "yes" ; then
+ pthread_getaffinity="no"
+fi
+cat > $TMPC <<EOF
+#include <stddef.h> /* NULL */
+#include <signal.h> /* pthread_sigmask() */
+#include <pthread.h>
+int main(void)
+{
+ cpu_set_t set;
+ return pthread_getaffinity_np(pthread_self(), sizeof(set), &set);
+}
+EOF
+if compile_prog "" "$LIBS" "pthread_getaffinity" ; then
+ pthread_getaffinity="yes"
+elif compile_prog "" "$LIBS -lpthread" "pthread_getaffinity" ; then
+ pthread_getaffinity="yes"
+ LIBS="$LIBS -lpthread"
+fi
+print_config "pthread_getaffinity_np()" "$pthread_getaffinity"
+
##########################################
# solaris aio probe
if test "$solaris_aio" != "yes" ; then
fi
print_config "rdmacm" "$rdmacm"
+##########################################
+# librpma probe
+# The librpma engines require librpma>=v0.11.0 with rpma_cq_get_wc().
+if test "$librpma" != "yes" ; then
+ librpma="no"
+fi
+cat > $TMPC << EOF
+#include <librpma.h>
+int main(void)
+{
+ void *ptr = rpma_cq_get_wc;
+ (void) ptr; /* unused */
+ return 0;
+}
+EOF
+if test "$disable_rdma" != "yes" && compile_prog "" "-lrpma" "rpma"; then
+ librpma="yes"
+fi
+print_config "librpma" "$librpma"
+
+##########################################
+# libprotobuf-c probe
+if test "$libprotobuf_c" != "yes" ; then
+ libprotobuf_c="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <protobuf-c/protobuf-c.h>
+#if !defined(PROTOBUF_C_VERSION_NUMBER)
+# error PROTOBUF_C_VERSION_NUMBER is not defined!
+#endif
+int main(int argc, char **argv)
+{
+ (void)protobuf_c_message_check(NULL);
+ return 0;
+}
+EOF
+if compile_prog "" "-lprotobuf-c" "protobuf_c"; then
+ libprotobuf_c="yes"
+fi
+print_config "libprotobuf_c" "$libprotobuf_c"
+
##########################################
# asprintf() and vasprintf() probes
if test "$have_asprintf" != "yes" ; then
#include <sched.h>
int main(int argc, char **argv)
{
- cpu_set_t mask;
+ cpu_set_t mask = { };
+
return sched_setaffinity(0, sizeof(mask), &mask);
}
EOF
#include <sched.h>
int main(int argc, char **argv)
{
- cpu_set_t mask;
+ cpu_set_t mask = { };
+
return sched_setaffinity(0, &mask);
}
EOF
#include <time.h>
int main(int argc, char **argv)
{
- return clock_gettime(0, NULL);
+ struct timespec ts;
+
+ return clock_gettime(0, &ts);
}
EOF
if compile_prog "" "" "clock_gettime"; then
#include <time.h>
int main(int argc, char **argv)
{
- return clock_gettime(CLOCK_MONOTONIC, NULL);
+ struct timespec ts;
+
+ return clock_gettime(CLOCK_MONOTONIC, &ts);
}
EOF
if compile_prog "" "$LIBS" "clock monotonic"; then
fi
print_config "sync_file_range" "$sync_file_range"
+##########################################
+# ASharedMemory_create() probe
+if test "$ASharedMemory_create" != "yes" ; then
+ ASharedMemory_create="no"
+fi
+cat > $TMPC << EOF
+#include <android/sharedmem.h>
+int main(int argc, char **argv)
+{
+ return ASharedMemory_create("", 0);
+}
+EOF
+if compile_prog "" "" "ASharedMemory_create"; then
+ ASharedMemory_create="yes"
+fi
+print_config "ASharedMemory_create" "$ASharedMemory_create"
+
##########################################
# ext4 move extent probe
if test "$ext4_me" != "yes" ; then
if test "$tls_thread" != "yes" ; then
tls_thread="no"
fi
-cat > $TMPC << EOF
+if test "$tls_check" != "no"; then
+ cat > $TMPC << EOF
#include <stdio.h>
static __thread int ret;
int main(int argc, char **argv)
if compile_prog "" "" "__thread"; then
tls_thread="yes"
fi
+fi
print_config "__thread" "$tls_thread"
##########################################
return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */
}
EOF
-GTK_CFLAGS=$(${cross_prefix}pkg-config --cflags gtk+-2.0 gthread-2.0)
+GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0)
ORG_LDFLAGS=$LDFLAGS
LDFLAGS=$(echo $LDFLAGS | sed s/"-static"//g)
if test "$?" != "0" ; then
echo "configure: gtk and gthread not found"
exit 1
fi
-GTK_LIBS=$(${cross_prefix}pkg-config --libs gtk+-2.0 gthread-2.0)
+GTK_LIBS=$(pkg-config --libs gtk+-2.0 gthread-2.0)
if test "$?" != "0" ; then
echo "configure: gtk and gthread not found"
exit 1
#include <sched.h>
int main(int argc, char **argv)
{
- struct sched_param p;
+ struct sched_param p = { };
+
return sched_setscheduler(0, SCHED_IDLE, &p);
}
EOF
fi
print_config "TCP_NODELAY" "$tcp_nodelay"
+##########################################
+# Check whether we have vsock
+if test "$vsock" != "yes" ; then
+ vsock="no"
+fi
+cat > $TMPC << EOF
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/vm_sockets.h>
+int main(int argc, char **argv)
+{
+ return socket(AF_VSOCK, SOCK_STREAM, 0);
+}
+EOF
+if compile_prog "" "" "vsock"; then
+ vsock="yes"
+fi
+print_config "vsock" "$vsock"
+
##########################################
# Check whether we have SO_SNDBUF
if test "$window_size" != "yes" ; then
#include <sys/uio.h>
int main(int argc, char **argv)
{
- return pwritev(0, NULL, 1, 0) + preadv(0, NULL, 1, 0);
+ struct iovec iov[1] = { };
+
+ return pwritev(0, iov, 1, 0) + preadv(0, iov, 1, 0);
}
EOF
if compile_prog "" "" "pwritev"; then
#include <sys/uio.h>
int main(int argc, char **argv)
{
- return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0);
+ struct iovec iov[1] = { };
+
+ return pwritev2(0, iov, 1, 0, 0) + preadv2(0, iov, 1, 0, 0);
}
EOF
if compile_prog "" "" "pwritev2"; then
#include <stdio.h>
int main(int argc, char **argv)
{
- struct addrinfo hints;
- struct in6_addr addr;
+ struct addrinfo hints = { };
+ struct in6_addr addr = in6addr_any;
int ret;
ret = getaddrinfo(NULL, NULL, &hints, NULL);
freeaddrinfo(NULL);
- printf("%s\n", gai_strerror(ret));
- addr = in6addr_any;
+ printf("%s %d\n", gai_strerror(ret), addr.s6_addr[0]);
+
return 0;
}
EOF
hdfs_conf_error=1
fi
if test "$FIO_LIBHDFS_INCLUDE" = "" ; then
- echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path"
+ echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs include path"
hdfs_conf_error=1
fi
if test "$FIO_LIBHDFS_LIB" = "" ; then
#include <stdlib.h>
int main(int argc, char **argv)
{
- int rc;
- rc = pmem_is_pmem(NULL, NULL);
- return 0;
+ return pmem_is_pmem(NULL, 0);
}
EOF
if compile_prog "" "-lpmem" "libpmem"; then
#include <stdlib.h>
int main(int argc, char **argv)
{
- pmem_memcpy(NULL, NULL, NULL, NULL);
+ pmem_memcpy(NULL, NULL, 0, 0);
return 0;
}
EOF
print_config "libpmem1_5" "$libpmem1_5"
##########################################
-# Check whether we have libpmemblk
-# libpmem is a prerequisite
-if test "$libpmemblk" != "yes" ; then
- libpmemblk="no"
+# Check whether we have libpmem2
+if test "$libpmem2" != "yes" ; then
+ libpmem2="no"
fi
-if test "$libpmem" = "yes"; then
- cat > $TMPC << EOF
-#include <libpmemblk.h>
+cat > $TMPC << EOF
+#include <libpmem2.h>
int main(int argc, char **argv)
{
- PMEMblkpool *pbp;
- pbp = pmemblk_open("", 0);
+ struct pmem2_config *cfg;
+ pmem2_config_new(&cfg);
+ pmem2_config_delete(&cfg);
return 0;
}
EOF
- if compile_prog "" "-lpmemblk" "libpmemblk"; then
- libpmemblk="yes"
- fi
+if compile_prog "" "-lpmem2" "libpmem2"; then
+ libpmem2="yes"
fi
-print_config "libpmemblk" "$libpmemblk"
+print_config "libpmem2" "$libpmem2"
# Choose libpmem-based ioengines
if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
if test "$libpmem1_5" = "yes"; then
pmem="yes"
fi
- if test "$libpmemblk" = "yes"; then
- pmemblk="yes"
- fi
fi
-##########################################
-# Report whether pmemblk engine is enabled
-print_config "PMDK pmemblk engine" "$pmemblk"
-
##########################################
# Report whether dev-dax engine is enabled
print_config "PMDK dev-dax engine" "$devdax"
fi
print_config "NBD engine" "$libnbd"
+##########################################
+# check for dfs (DAOS File System)
+if test "$dfs" != "no" ; then
+ cat > $TMPC << EOF
+#include <fcntl.h>
+#include <daos.h>
+#include <daos_fs.h>
+
+int main(int argc, char **argv)
+{
+ daos_handle_t poh;
+ daos_handle_t coh;
+ dfs_t *dfs;
+
+ (void) dfs_mount(poh, coh, O_RDWR, &dfs);
+
+ return 0;
+}
+EOF
+ if compile_prog "" "-luuid -ldfs -ldaos" "dfs"; then
+ dfs="yes"
+ else
+ dfs="no"
+ fi
+fi
+print_config "DAOS File System (dfs) Engine" "$dfs"
+
+##########################################
+# Check if we have libnfs (for userspace nfs support).
+if test "$libnfs" != "no" ; then
+ if $(pkg-config libnfs > /dev/null 2>&1); then
+ libnfs="yes"
+ libnfs_cflags=$(pkg-config --cflags libnfs)
+ libnfs_libs=$(pkg-config --libs libnfs)
+ else
+ if test "$libnfs" = "yes" ; then
+ feature_not_found "libnfs" "libnfs"
+ fi
+ libnfs="no"
+ fi
+fi
+print_config "NFS engine" "$libnfs"
+
##########################################
# Check if we have lex/yacc available
yacc="no"
FILE *mtab = setmntent(NULL, "r");
struct mntent *mnt = getmntent(mtab);
endmntent(mtab);
- return 0;
+ return mnt != NULL;
}
EOF
if compile_prog "" "" "getmntent"; then
fi
print_config "Valgrind headers" "$valgrind_dev"
-if test "$targetos" = "Linux" ; then
+if test "$targetos" = "Linux" || test "$targetos" = "Android"; then
##########################################
# <linux/blkzoned.h> probe
if test "$linux_blkzoned" != "yes" ; then
}
EOF
if test "$libzbc" != "no" ; then
+ if [ -e /usr/include/libzbc/libzbc ]; then
+ # SUSE Linux.
+ CFLAGS="$CFLAGS -I/usr/include/libzbc"
+ fi
if compile_prog "" "-lzbc" "libzbc"; then
libzbc="yes"
if ! check_min_lib_version libzbc 5; then
fi
print_config "libzbc engine" "$libzbc"
+if test "$targetos" = "Linux" || test "$targetos" = "Android"; then
##########################################
-# check march=armv8-a+crc+crypto
-if test "$march_armv8_a_crc_crypto" != "yes" ; then
- march_armv8_a_crc_crypto="no"
+# Check NVME_URING_CMD support
+cat > $TMPC << EOF
+#include <linux/nvme_ioctl.h>
+int main(void)
+{
+ return sizeof(struct nvme_uring_cmd);
+}
+EOF
+if compile_prog "" "" "nvme uring cmd"; then
+ output_sym "CONFIG_NVME_URING_CMD"
+ nvme_uring_cmd="yes"
+else
+ nvme_uring_cmd="no"
+fi
+print_config "NVMe uring command support" "$nvme_uring_cmd"
+fi
+
+##########################################
+# Check if we have xnvme
+if test "$xnvme" != "no" ; then
+ if check_min_lib_version xnvme 0.7.4; then
+ xnvme="yes"
+ xnvme_cflags=$(pkg-config --cflags xnvme)
+ xnvme_libs=$(pkg-config --libs xnvme)
+ else
+ xnvme="no"
+ fi
+fi
+print_config "xnvme engine" "$xnvme"
+
+if test "$targetos" = "Linux" ; then
+##########################################
+# Check ISA-L support
+cat > $TMPC << EOF
+#include <isa-l/crc.h>
+#include <stddef.h>
+int main(void)
+{
+ return crc16_t10dif(0, NULL, 4096);
+}
+EOF
+if test "$isal" != "no" ; then
+ if compile_prog "" "-lisal" "ISAL"; then
+ isal="yes"
+ LIBS="-lisal $LIBS"
+ else
+ isal="no"
+ fi
fi
+print_config "isal" "$isal"
+fi
+
+##########################################
+# Check if we have libblkio
+if test "$libblkio" != "no" ; then
+ if check_min_lib_version blkio 1.0.0; then
+ libblkio="yes"
+ libblkio_cflags=$(pkg-config --cflags blkio)
+ libblkio_libs=$(pkg-config --libs blkio)
+ else
+ if test "$libblkio" = "yes" ; then
+ feature_not_found "libblkio" "libblkio-dev or libblkio-devel"
+ fi
+ libblkio="no"
+ fi
+fi
+print_config "libblkio engine" "$libblkio"
+
+##########################################
+# check march=armv8-a+crc+crypto
+march_armv8_a_crc_crypto="no"
if test "$cpu" = "arm64" ; then
cat > $TMPC <<EOF
+#if __linux__
#include <arm_acle.h>
#include <arm_neon.h>
#include <sys/auxv.h>
+#endif
int main(void)
{
/* Can we also do a runtime probe? */
#if __linux__
return getauxval(AT_HWCAP);
+#elif defined(__APPLE__)
+ return 0;
#else
# error "Don't know how to do runtime probe for ARM CRC32c"
#endif
fi
print_config "cuda" "$cuda"
+##########################################
+# libcufile probe
+if test "$libcufile" != "no" ; then
+cat > $TMPC << EOF
+#include <cufile.h>
+
+int main(int argc, char* argv[]) {
+ cuFileDriverOpen();
+ return 0;
+}
+EOF
+ if compile_prog "" "-lcuda -lcudart -lcufile -ldl" "libcufile"; then
+ libcufile="yes"
+ LIBS="-lcuda -lcudart -lcufile -ldl $LIBS"
+ else
+ if test "$libcufile" = "yes" ; then
+ feature_not_found "libcufile" ""
+ fi
+ libcufile="no"
+ fi
+fi
+print_config "libcufile" "$libcufile"
+
##########################################
# check for cc -march=native
build_native="no"
fi
print_config "-Wimplicit-fallthrough=2" "$fallthrough"
+##########################################
+# check if the compiler has -Wno-stringop-truncation
+no_stringop="no"
+cat > $TMPC << EOF
+#include <stdio.h>
+
+int main(int argc, char **argv)
+{
+ return printf("%s\n", argv[0]);
+}
+EOF
+if compile_prog "-Wno-stringop-truncation -Werror" "" "no_stringop"; then
+ no_stringop="yes"
+fi
+print_config "-Wno-stringop-truncation" "$no_stringop"
+
##########################################
# check for MADV_HUGEPAGE support
if test "$thp" != "yes" ; then
##########################################
# check for timerfd support
timerfd_create="no"
+if test "$esx" != "yes" ; then
cat > $TMPC << EOF
#include <sys/time.h>
#include <sys/timerfd.h>
return timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
}
EOF
-if compile_prog "" "" "timerfd_create"; then
- timerfd_create="yes"
+ if compile_prog "" "" "timerfd_create"; then
+ timerfd_create="yes"
+ fi
fi
print_config "timerfd_create" "$timerfd_create"
if test "$libaio_rw_flags" = "yes" ; then
output_sym "CONFIG_LIBAIO_RW_FLAGS"
fi
- if test "$libaio_uring" = "yes" ; then
- output_sym "CONFIG_LIBAIO_URING"
- fi
fi
if test "$posix_aio" = "yes" ; then
output_sym "CONFIG_POSIXAIO"
if test "$pthread_sigmask" = "yes" ; then
output_sym "CONFIG_PTHREAD_SIGMASK"
fi
+if test "$pthread_getaffinity" = "yes" ; then
+ output_sym "CONFIG_PTHREAD_GETAFFINITY"
+fi
if test "$have_asprintf" = "yes" ; then
output_sym "CONFIG_HAVE_ASPRINTF"
fi
if test "$sync_file_range" = "yes" ; then
output_sym "CONFIG_SYNC_FILE_RANGE"
fi
+if test "$ASharedMemory_create" = "yes" ; then
+ output_sym "CONFIG_ASHAREDMEMORY_CREATE"
+fi
if test "$sfaa" = "yes" ; then
output_sym "CONFIG_SFAA"
fi
if test "$libverbs" = "yes" -a "$rdmacm" = "yes" ; then
output_sym "CONFIG_RDMA"
fi
+# librpma is supported on the 'x86_64' architecture for now
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+ -a "$librpma" = "yes" \
+ && test "$libpmem" = "yes" -o "$libpmem2" = "yes" ; then
+ output_sym "CONFIG_LIBRPMA_APM"
+fi
+if test "$cpu" = "x86_64" -a "$libverbs" = "yes" -a "$rdmacm" = "yes" \
+ -a "$librpma" = "yes" -a "$libprotobuf_c" = "yes" \
+ && test "$libpmem" = "yes" -o "$libpmem2" = "yes" ; then
+ output_sym "CONFIG_LIBRPMA_GPSPM"
+fi
if test "$clock_gettime" = "yes" ; then
output_sym "CONFIG_CLOCK_GETTIME"
fi
if test "$clock_monotonic" = "yes" ; then
output_sym "CONFIG_CLOCK_MONOTONIC"
fi
-if test "$clock_monotonic_raw" = "yes" ; then
- output_sym "CONFIG_CLOCK_MONOTONIC_RAW"
-fi
-if test "$clock_monotonic_precise" = "yes" ; then
- output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE"
-fi
if test "$clockid_t" = "yes"; then
output_sym "CONFIG_CLOCKID_T"
fi
if test "$ipv6" = "yes" ; then
output_sym "CONFIG_IPV6"
fi
+if test "$vsock" = "yes"; then
+ output_sym "CONFIG_VSOCK"
+fi
if test "$http" = "yes" ; then
output_sym "CONFIG_HTTP"
fi
if test "$mtd" = "yes" ; then
output_sym "CONFIG_MTD"
fi
-if test "$pmemblk" = "yes" ; then
- output_sym "CONFIG_PMEMBLK"
-fi
if test "$devdax" = "yes" ; then
output_sym "CONFIG_LINUX_DEVDAX"
fi
if test "$pmem" = "yes" ; then
output_sym "CONFIG_LIBPMEM"
fi
+if test "$libpmem2" = "yes" ; then
+ output_sym "CONFIG_LIBPMEM2_INSTALLED"
+fi
if test "$libime" = "yes" ; then
output_sym "CONFIG_IME"
fi
output_sym "CONFIG_LIBZBC"
fi
if test "$zlib" = "no" ; then
- echo "Consider installing zlib-dev (zlib-devel, some fio features depend on it."
+ echo "Consider installing zlib1g-dev (zlib-devel) as some fio features depend on it."
if test "$build_static" = "yes"; then
echo "Note that some distros have separate packages for static libraries."
fi
if test "$cuda" = "yes" ; then
output_sym "CONFIG_CUDA"
fi
+if test "$libcufile" = "yes" ; then
+ output_sym "CONFIG_LIBCUFILE"
+fi
+if test "$dfs" = "yes" ; then
+ output_sym "CONFIG_DFS"
+fi
if test "$march_set" = "no" && test "$build_native" = "yes" ; then
output_sym "CONFIG_BUILD_NATIVE"
fi
if test "$fallthrough" = "yes"; then
CFLAGS="$CFLAGS -Wimplicit-fallthrough"
fi
+if test "$no_stringop" = "yes"; then
+ output_sym "CONFIG_HAVE_NO_STRINGOP"
+fi
if test "$thp" = "yes" ; then
output_sym "CONFIG_HAVE_THP"
fi
echo "LIBNBD_CFLAGS=$libnbd_cflags" >> $config_host_mak
echo "LIBNBD_LIBS=$libnbd_libs" >> $config_host_mak
fi
+if test "$libnfs" = "yes" ; then
+ output_sym "CONFIG_LIBNFS"
+ echo "LIBNFS_CFLAGS=$libnfs_cflags" >> $config_host_mak
+ echo "LIBNFS_LIBS=$libnfs_libs" >> $config_host_mak
+fi
+if test "$xnvme" = "yes" ; then
+ output_sym "CONFIG_LIBXNVME"
+ echo "LIBXNVME_CFLAGS=$xnvme_cflags" >> $config_host_mak
+ echo "LIBXNVME_LIBS=$xnvme_libs" >> $config_host_mak
+fi
+if test "$isal" = "yes" ; then
+ output_sym "CONFIG_LIBISAL"
+fi
+if test "$libblkio" = "yes" ; then
+ output_sym "CONFIG_LIBBLKIO"
+ echo "LIBBLKIO_CFLAGS=$libblkio_cflags" >> $config_host_mak
+ echo "LIBBLKIO_LIBS=$libblkio_libs" >> $config_host_mak
+fi
if test "$dynamic_engines" = "yes" ; then
output_sym "CONFIG_DYNAMIC_ENGINES"
fi
if test "$pdb" = yes; then
output_sym "CONFIG_PDB"
fi
-
+if test "$fcntl_sync" = "yes" ; then
+ output_sym "CONFIG_FCNTL_SYNC"
+fi
+if test "$asan" = "yes"; then
+ CFLAGS="$CFLAGS -fsanitize=address"
+ LDFLAGS="$LDFLAGS -fsanitize=address"
+fi
print_config "Lib-based ioengines dynamic" "$dynamic_engines"
cat > $TMPC << EOF
int main(int argc, char **argv)
fi
fi
print_config "TCMalloc support" "$tcmalloc"
+if ! num "$seed_buckets"; then
+ seed_buckets=4
+elif test "$seed_buckets" -lt 2; then
+ seed_buckets=2
+elif test "$seed_buckets" -gt 16; then
+ seed_buckets=16
+fi
+echo "#define CONFIG_SEED_BUCKETS $seed_buckets" >> $config_host_h
+print_config "seed_buckets" "$seed_buckets"
echo "LIBS+=$LIBS" >> $config_host_mak
echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CRC_T10DIF_H
+#define __CRC_T10DIF_H
+
+extern unsigned short fio_crc_t10dif(unsigned short crc,
+ const unsigned char *buffer,
+ unsigned int len);
+
+#endif
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The crc64nvmetable[256] lookup table is generated from the polynomial
+ * specified for the NVMe 64b CRC, which is defined as:
+ *
+ * x^64 + x^63 + x^61 + x^59 + x^58 + x^56 + x^55 + x^52 + x^49 + x^48 + x^47 +
+ * x^46 + x^44 + x^41 + x^37 + x^36 + x^34 + x^32 + x^31 + x^28 + x^26 + x^23 +
+ * x^22 + x^19 + x^16 + x^13 + x^12 + x^10 + x^9 + x^6 + x^4 + x^3 + 1
+ *
+ */
+
#include "crc64.h"
+#include "crc64table.h"
/*
* poly 0x95AC9329AC4BC9B5ULL and init 0xFFFFFFFFFFFFFFFFULL
return crc;
}
+/**
+ * fio_crc64_nvme - Calculate bitwise NVMe CRC64
+ * @crc: seed value for computation. 0 for a new CRC calculation, or the
+ * previous crc64 value if computing incrementally.
+ * @p: pointer to buffer over which CRC64 is run
+ * @len: length of buffer @p, in bytes
+ *
+ * The buffer is consumed one byte at a time through the crc64nvmetable
+ * lookup table. The seed is inverted on entry and the result is inverted
+ * again on exit, which is why a previously returned value can be passed
+ * back in as @crc to continue the calculation over a following buffer.
+ *
+ * Returns the updated CRC64 value.
+ */
+unsigned long long fio_crc64_nvme(unsigned long long crc, const void *p,
+ unsigned int len)
+{
+ const unsigned char *_p = p;
+ unsigned int i;
+
+ crc = ~crc;
+
+ for (i = 0; i < len; i++)
+ crc = (crc >> 8) ^ crc64nvmetable[(crc & 0xff) ^ *_p++];
+
+ return ~crc;
+}
unsigned long long fio_crc64(const unsigned char *, unsigned long);
+unsigned long long fio_crc64_nvme(unsigned long long crc, const void *p,
+ unsigned int len);
+
#endif
--- /dev/null
+static const unsigned long long crc64nvmetable[256] = {
+ 0x0000000000000000ULL, 0x7f6ef0c830358979ULL,
+ 0xfedde190606b12f2ULL, 0x81b31158505e9b8bULL,
+ 0xc962e5739841b68fULL, 0xb60c15bba8743ff6ULL,
+ 0x37bf04e3f82aa47dULL, 0x48d1f42bc81f2d04ULL,
+ 0xa61cecb46814fe75ULL, 0xd9721c7c5821770cULL,
+ 0x58c10d24087fec87ULL, 0x27affdec384a65feULL,
+ 0x6f7e09c7f05548faULL, 0x1010f90fc060c183ULL,
+ 0x91a3e857903e5a08ULL, 0xeecd189fa00bd371ULL,
+ 0x78e0ff3b88be6f81ULL, 0x078e0ff3b88be6f8ULL,
+ 0x863d1eabe8d57d73ULL, 0xf953ee63d8e0f40aULL,
+ 0xb1821a4810ffd90eULL, 0xceecea8020ca5077ULL,
+ 0x4f5ffbd87094cbfcULL, 0x30310b1040a14285ULL,
+ 0xdefc138fe0aa91f4ULL, 0xa192e347d09f188dULL,
+ 0x2021f21f80c18306ULL, 0x5f4f02d7b0f40a7fULL,
+ 0x179ef6fc78eb277bULL, 0x68f0063448deae02ULL,
+ 0xe943176c18803589ULL, 0x962de7a428b5bcf0ULL,
+ 0xf1c1fe77117cdf02ULL, 0x8eaf0ebf2149567bULL,
+ 0x0f1c1fe77117cdf0ULL, 0x7072ef2f41224489ULL,
+ 0x38a31b04893d698dULL, 0x47cdebccb908e0f4ULL,
+ 0xc67efa94e9567b7fULL, 0xb9100a5cd963f206ULL,
+ 0x57dd12c379682177ULL, 0x28b3e20b495da80eULL,
+ 0xa900f35319033385ULL, 0xd66e039b2936bafcULL,
+ 0x9ebff7b0e12997f8ULL, 0xe1d10778d11c1e81ULL,
+ 0x606216208142850aULL, 0x1f0ce6e8b1770c73ULL,
+ 0x8921014c99c2b083ULL, 0xf64ff184a9f739faULL,
+ 0x77fce0dcf9a9a271ULL, 0x08921014c99c2b08ULL,
+ 0x4043e43f0183060cULL, 0x3f2d14f731b68f75ULL,
+ 0xbe9e05af61e814feULL, 0xc1f0f56751dd9d87ULL,
+ 0x2f3dedf8f1d64ef6ULL, 0x50531d30c1e3c78fULL,
+ 0xd1e00c6891bd5c04ULL, 0xae8efca0a188d57dULL,
+ 0xe65f088b6997f879ULL, 0x9931f84359a27100ULL,
+ 0x1882e91b09fcea8bULL, 0x67ec19d339c963f2ULL,
+ 0xd75adabd7a6e2d6fULL, 0xa8342a754a5ba416ULL,
+ 0x29873b2d1a053f9dULL, 0x56e9cbe52a30b6e4ULL,
+ 0x1e383fcee22f9be0ULL, 0x6156cf06d21a1299ULL,
+ 0xe0e5de5e82448912ULL, 0x9f8b2e96b271006bULL,
+ 0x71463609127ad31aULL, 0x0e28c6c1224f5a63ULL,
+ 0x8f9bd7997211c1e8ULL, 0xf0f5275142244891ULL,
+ 0xb824d37a8a3b6595ULL, 0xc74a23b2ba0eececULL,
+ 0x46f932eaea507767ULL, 0x3997c222da65fe1eULL,
+ 0xafba2586f2d042eeULL, 0xd0d4d54ec2e5cb97ULL,
+ 0x5167c41692bb501cULL, 0x2e0934dea28ed965ULL,
+ 0x66d8c0f56a91f461ULL, 0x19b6303d5aa47d18ULL,
+ 0x980521650afae693ULL, 0xe76bd1ad3acf6feaULL,
+ 0x09a6c9329ac4bc9bULL, 0x76c839faaaf135e2ULL,
+ 0xf77b28a2faafae69ULL, 0x8815d86aca9a2710ULL,
+ 0xc0c42c4102850a14ULL, 0xbfaadc8932b0836dULL,
+ 0x3e19cdd162ee18e6ULL, 0x41773d1952db919fULL,
+ 0x269b24ca6b12f26dULL, 0x59f5d4025b277b14ULL,
+ 0xd846c55a0b79e09fULL, 0xa72835923b4c69e6ULL,
+ 0xeff9c1b9f35344e2ULL, 0x90973171c366cd9bULL,
+ 0x1124202993385610ULL, 0x6e4ad0e1a30ddf69ULL,
+ 0x8087c87e03060c18ULL, 0xffe938b633338561ULL,
+ 0x7e5a29ee636d1eeaULL, 0x0134d92653589793ULL,
+ 0x49e52d0d9b47ba97ULL, 0x368bddc5ab7233eeULL,
+ 0xb738cc9dfb2ca865ULL, 0xc8563c55cb19211cULL,
+ 0x5e7bdbf1e3ac9decULL, 0x21152b39d3991495ULL,
+ 0xa0a63a6183c78f1eULL, 0xdfc8caa9b3f20667ULL,
+ 0x97193e827bed2b63ULL, 0xe877ce4a4bd8a21aULL,
+ 0x69c4df121b863991ULL, 0x16aa2fda2bb3b0e8ULL,
+ 0xf86737458bb86399ULL, 0x8709c78dbb8deae0ULL,
+ 0x06bad6d5ebd3716bULL, 0x79d4261ddbe6f812ULL,
+ 0x3105d23613f9d516ULL, 0x4e6b22fe23cc5c6fULL,
+ 0xcfd833a67392c7e4ULL, 0xb0b6c36e43a74e9dULL,
+ 0x9a6c9329ac4bc9b5ULL, 0xe50263e19c7e40ccULL,
+ 0x64b172b9cc20db47ULL, 0x1bdf8271fc15523eULL,
+ 0x530e765a340a7f3aULL, 0x2c608692043ff643ULL,
+ 0xadd397ca54616dc8ULL, 0xd2bd67026454e4b1ULL,
+ 0x3c707f9dc45f37c0ULL, 0x431e8f55f46abeb9ULL,
+ 0xc2ad9e0da4342532ULL, 0xbdc36ec59401ac4bULL,
+ 0xf5129aee5c1e814fULL, 0x8a7c6a266c2b0836ULL,
+ 0x0bcf7b7e3c7593bdULL, 0x74a18bb60c401ac4ULL,
+ 0xe28c6c1224f5a634ULL, 0x9de29cda14c02f4dULL,
+ 0x1c518d82449eb4c6ULL, 0x633f7d4a74ab3dbfULL,
+ 0x2bee8961bcb410bbULL, 0x548079a98c8199c2ULL,
+ 0xd53368f1dcdf0249ULL, 0xaa5d9839ecea8b30ULL,
+ 0x449080a64ce15841ULL, 0x3bfe706e7cd4d138ULL,
+ 0xba4d61362c8a4ab3ULL, 0xc52391fe1cbfc3caULL,
+ 0x8df265d5d4a0eeceULL, 0xf29c951de49567b7ULL,
+ 0x732f8445b4cbfc3cULL, 0x0c41748d84fe7545ULL,
+ 0x6bad6d5ebd3716b7ULL, 0x14c39d968d029fceULL,
+ 0x95708ccedd5c0445ULL, 0xea1e7c06ed698d3cULL,
+ 0xa2cf882d2576a038ULL, 0xdda178e515432941ULL,
+ 0x5c1269bd451db2caULL, 0x237c997575283bb3ULL,
+ 0xcdb181ead523e8c2ULL, 0xb2df7122e51661bbULL,
+ 0x336c607ab548fa30ULL, 0x4c0290b2857d7349ULL,
+ 0x04d364994d625e4dULL, 0x7bbd94517d57d734ULL,
+ 0xfa0e85092d094cbfULL, 0x856075c11d3cc5c6ULL,
+ 0x134d926535897936ULL, 0x6c2362ad05bcf04fULL,
+ 0xed9073f555e26bc4ULL, 0x92fe833d65d7e2bdULL,
+ 0xda2f7716adc8cfb9ULL, 0xa54187de9dfd46c0ULL,
+ 0x24f29686cda3dd4bULL, 0x5b9c664efd965432ULL,
+ 0xb5517ed15d9d8743ULL, 0xca3f8e196da80e3aULL,
+ 0x4b8c9f413df695b1ULL, 0x34e26f890dc31cc8ULL,
+ 0x7c339ba2c5dc31ccULL, 0x035d6b6af5e9b8b5ULL,
+ 0x82ee7a32a5b7233eULL, 0xfd808afa9582aa47ULL,
+ 0x4d364994d625e4daULL, 0x3258b95ce6106da3ULL,
+ 0xb3eba804b64ef628ULL, 0xcc8558cc867b7f51ULL,
+ 0x8454ace74e645255ULL, 0xfb3a5c2f7e51db2cULL,
+ 0x7a894d772e0f40a7ULL, 0x05e7bdbf1e3ac9deULL,
+ 0xeb2aa520be311aafULL, 0x944455e88e0493d6ULL,
+ 0x15f744b0de5a085dULL, 0x6a99b478ee6f8124ULL,
+ 0x224840532670ac20ULL, 0x5d26b09b16452559ULL,
+ 0xdc95a1c3461bbed2ULL, 0xa3fb510b762e37abULL,
+ 0x35d6b6af5e9b8b5bULL, 0x4ab846676eae0222ULL,
+ 0xcb0b573f3ef099a9ULL, 0xb465a7f70ec510d0ULL,
+ 0xfcb453dcc6da3dd4ULL, 0x83daa314f6efb4adULL,
+ 0x0269b24ca6b12f26ULL, 0x7d0742849684a65fULL,
+ 0x93ca5a1b368f752eULL, 0xeca4aad306bafc57ULL,
+ 0x6d17bb8b56e467dcULL, 0x12794b4366d1eea5ULL,
+ 0x5aa8bf68aecec3a1ULL, 0x25c64fa09efb4ad8ULL,
+ 0xa4755ef8cea5d153ULL, 0xdb1bae30fe90582aULL,
+ 0xbcf7b7e3c7593bd8ULL, 0xc399472bf76cb2a1ULL,
+ 0x422a5673a732292aULL, 0x3d44a6bb9707a053ULL,
+ 0x759552905f188d57ULL, 0x0afba2586f2d042eULL,
+ 0x8b48b3003f739fa5ULL, 0xf42643c80f4616dcULL,
+ 0x1aeb5b57af4dc5adULL, 0x6585ab9f9f784cd4ULL,
+ 0xe436bac7cf26d75fULL, 0x9b584a0fff135e26ULL,
+ 0xd389be24370c7322ULL, 0xace74eec0739fa5bULL,
+ 0x2d545fb4576761d0ULL, 0x523aaf7c6752e8a9ULL,
+ 0xc41748d84fe75459ULL, 0xbb79b8107fd2dd20ULL,
+ 0x3acaa9482f8c46abULL, 0x45a459801fb9cfd2ULL,
+ 0x0d75adabd7a6e2d6ULL, 0x721b5d63e7936bafULL,
+ 0xf3a84c3bb7cdf024ULL, 0x8cc6bcf387f8795dULL,
+ 0x620ba46c27f3aa2cULL, 0x1d6554a417c62355ULL,
+ 0x9cd645fc4798b8deULL, 0xe3b8b53477ad31a7ULL,
+ 0xab69411fbfb21ca3ULL, 0xd407b1d78f8795daULL,
+ 0x55b4a08fdfd90e51ULL, 0x2ada5047efec8728ULL,
+};
--- /dev/null
+/*
+ * Cryptographic API.
+ *
+ * T10 Data Integrity Field CRC16 Crypto Transform
+ *
+ * Copyright (c) 2007 Oracle Corporation. All rights reserved.
+ * Written by Martin K. Petersen <martin.petersen@oracle.com>
+ * Copyright (C) 2013 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifdef CONFIG_LIBISAL
+#include <isa-l/crc.h>
+/*
+ * CONFIG_LIBISAL build: fio_crc_t10dif() forwards its arguments unchanged
+ * to crc16_t10dif() from <isa-l/crc.h> and returns that function's result.
+ */
+extern unsigned short fio_crc_t10dif(unsigned short crc,
+ const unsigned char *buffer,
+ unsigned int len)
+{
+ return crc16_t10dif(crc, buffer, len);
+}
+
+#else
+#include "crc-t10dif.h"
+
+/* Table generated using the following polynomial:
+ * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1
+ * gt: 0x8bb7
+ */
+static const unsigned short t10_dif_crc_table[256] = {
+ 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+ 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+ 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+ 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+ 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+ 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+ 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+ 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+ 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+ 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+ 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+ 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+ 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+ 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+ 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+ 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+ 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+ 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+ 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+ 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+ 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+ 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+ 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+ 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+ 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+ 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+ 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+ 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+ 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+ 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+ 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+ 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+/*
+ * Table-driven T10 DIF CRC16, built when CONFIG_LIBISAL is not defined.
+ *
+ * Folds each of the len bytes of buffer into crc, most significant byte
+ * first, using t10_dif_crc_table. The crc argument is used as the starting
+ * value as-is (no inversion on entry or exit), so the result of one call
+ * can be fed into the next to process data in pieces.
+ *
+ * Returns the updated CRC16 value; crc is returned unchanged if len is 0.
+ */
+extern unsigned short fio_crc_t10dif(unsigned short crc,
+ const unsigned char *buffer,
+ unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0 ; i < len ; i++)
+ crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff];
+
+ return crc;
+}
+
+#endif
switch (len & 3) {
case 3:
k1 ^= tail[2] << 16;
- fallthrough;
+ fio_fallthrough;
case 2:
k1 ^= tail[1] << 8;
- fallthrough;
+ fio_fallthrough;
case 1:
k1 ^= tail[0];
k1 *= c1;
//#define XXH_ACCEPT_NULL_INPUT_POINTER 1
// XXH_FORCE_NATIVE_FORMAT :
-// By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
+// By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
// Results are therefore identical for little-endian and big-endian CPU.
// This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
-// Should endian-independance be of no importance for your application, you may set the #define below to 1.
+// Should endian-independence be of no importance for your application, you may set the #define below to 1.
// It will improve speed for Big-endian CPU.
// This option has no impact on Little_Endian CPU.
#define XXH_FORCE_NATIVE_FORMAT 0
--- /dev/null
+/*
+ * Note: This is similar to a very basic setup
+ * of ZBD devices
+ *
+ * Specify fdp=1 (With char devices /dev/ng0n1)
+ */
+
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "fio.h"
+#include "file.h"
+
+#include "pshared.h"
+#include "dataplacement.h"
+
+/*
+ * Ask the I/O engine of @td to fill @ruhs for file @f.
+ *
+ * Returns -EINVAL when no engine is set or the engine has no fdp_fetch_ruhs
+ * hook, otherwise whatever the hook returned. A negative hook result is
+ * logged and recorded on @td using the current errno.
+ */
+static int fdp_ruh_info(struct thread_data *td, struct fio_file *f,
+			struct fio_ruhs_info *ruhs)
+{
+	int rc;
+
+	if (!td->io_ops) {
+		log_err("fio: no ops set in fdp init?!\n");
+		return -EINVAL;
+	}
+
+	if (!td->io_ops->fdp_fetch_ruhs) {
+		log_err("%s: engine (%s) lacks fetch ruhs\n",
+			f->file_name, td->io_ops->name);
+		return -EINVAL;
+	}
+
+	rc = td->io_ops->fdp_fetch_ruhs(td, f, ruhs);
+	if (rc < 0) {
+		td_verror(td, errno, "fdp fetch ruhs failed");
+		log_err("%s: fdp fetch ruhs failed (%d)\n",
+			f->file_name, errno);
+	}
+
+	return rc;
+}
+
+/*
+ * Build the placement identifier table of file @f and store it in
+ * f->ruhs_info.
+ *
+ * With dataplacement=streams the table is a copy of the user supplied stream
+ * IDs. Otherwise the table is fetched from the I/O engine and, when the user
+ * listed placement ID indexes, reduced to the entries those indexes select.
+ *
+ * Returns 0 on success and non-zero on failure (-ENOMEM / -EINVAL for the
+ * errors detected here, otherwise the result of the engine fetch). The
+ * scratch table is released on every path that does not hand it to @f.
+ */
+static int init_ruh_info(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_ruhs_info *ruhs, *tmp;
+	int i, ret;
+
+	ruhs = scalloc(1, sizeof(*ruhs) + FDP_MAX_RUHS * sizeof(*ruhs->plis));
+	if (!ruhs)
+		return -ENOMEM;
+
+	/* set up the data structure used for FDP to work with the supplied stream IDs */
+	if (td->o.dp_type == FIO_DP_STREAMS) {
+		if (!td->o.dp_nr_ids) {
+			log_err("fio: stream IDs must be provided for dataplacement=streams\n");
+			/* go through the common exit so that ruhs is not leaked */
+			ret = -EINVAL;
+			goto out;
+		}
+		ruhs->nr_ruhs = td->o.dp_nr_ids;
+		for (i = 0; i < ruhs->nr_ruhs; i++)
+			ruhs->plis[i] = td->o.dp_ids[i];
+
+		/* ownership of ruhs moves to the file */
+		f->ruhs_info = ruhs;
+		return 0;
+	}
+
+	ret = fdp_ruh_info(td, f, ruhs);
+	if (ret) {
+		log_info("fio: ruh info failed for %s (%d)\n",
+			 f->file_name, -ret);
+		goto out;
+	}
+
+	if (ruhs->nr_ruhs > FDP_MAX_RUHS)
+		ruhs->nr_ruhs = FDP_MAX_RUHS;
+
+	/* no user selection: use the complete table as fetched */
+	if (td->o.dp_nr_ids == 0) {
+		f->ruhs_info = ruhs;
+		return 0;
+	}
+
+	/* every requested index must refer to a fetched entry */
+	for (i = 0; i < td->o.dp_nr_ids; i++) {
+		if (td->o.dp_ids[i] >= ruhs->nr_ruhs) {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	/*
+	 * The reduced table holds one entry per requested index. Size it by
+	 * the number of requested indexes rather than by the number of
+	 * fetched entries: the same index may be listed more than once, so
+	 * the request can be longer than the fetched table.
+	 */
+	tmp = scalloc(1, sizeof(*tmp) + td->o.dp_nr_ids * sizeof(*tmp->plis));
+	if (!tmp) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tmp->nr_ruhs = td->o.dp_nr_ids;
+	for (i = 0; i < td->o.dp_nr_ids; i++)
+		tmp->plis[i] = ruhs->plis[td->o.dp_ids[i]];
+	f->ruhs_info = tmp;
+out:
+	sfree(ruhs);
+	return ret;
+}
+
+/*
+ * Set up the placement identifier table of every file of @td. Stops at the
+ * first file that fails and returns its error; returns 0 otherwise.
+ */
+int dp_init(struct thread_data *td)
+{
+	struct fio_file *f;
+	int i;
+
+	for_each_file(td, f, i) {
+		int err = init_ruh_info(td, f);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Release the placement identifier table of @f, if it has one, and clear the
+ * pointer so the table cannot be freed twice.
+ */
+void fdp_free_ruhs_info(struct fio_file *f)
+{
+	if (f->ruhs_info) {
+		sfree(f->ruhs_info);
+		f->ruhs_info = NULL;
+	}
+}
+
+/*
+ * Pick the placement identifier for @io_u and store it, together with the
+ * directive type, in the io_u. Only writes to a file that has a placement
+ * table are tagged; for anything else dtype and dspec are cleared.
+ */
+void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_ruhs_info *ruhs = io_u->file->ruhs_info;
+	int dspec;
+
+	if (!ruhs || io_u->ddir != DDIR_WRITE) {
+		io_u->dtype = 0;
+		io_u->dspec = 0;
+		return;
+	}
+
+	if (td->o.dp_id_select != FIO_DP_RR) {
+		/* random selection: any entry of the table */
+		ruhs->pli_loc = rand_between(&td->fdp_state, 0, ruhs->nr_ruhs - 1);
+		dspec = ruhs->plis[ruhs->pli_loc];
+	} else {
+		/* round robin: wrap, take the current entry, then advance */
+		if (ruhs->pli_loc >= ruhs->nr_ruhs)
+			ruhs->pli_loc = 0;
+
+		dspec = ruhs->plis[ruhs->pli_loc];
+		ruhs->pli_loc++;
+	}
+
+	if (td->o.dp_type == FIO_DP_FDP)
+		io_u->dtype = FDP_DIR_DTYPE;
+	else
+		io_u->dtype = STREAMS_DIR_DTYPE;
+	io_u->dspec = dspec;
+	dprint(FD_IO, "dtype set to 0x%x, dspec set to 0x%x\n", io_u->dtype, io_u->dspec);
+}
--- /dev/null
+#ifndef FIO_DATAPLACEMENT_H
+#define FIO_DATAPLACEMENT_H
+
+#include "io_u.h"
+
+#define STREAMS_DIR_DTYPE 1
+#define FDP_DIR_DTYPE 2
+#define FDP_MAX_RUHS 128
+#define FIO_MAX_DP_IDS 16
+
+/*
+ * How fio chooses what placement identifier to use next. Choice of
+ * uniformly random, or roundrobin.
+ */
+enum {
+ FIO_DP_RANDOM = 0x1, /* draw a placement ID at random for each write */
+ FIO_DP_RR = 0x2, /* walk the placement IDs in order, wrapping at the end */
+};
+
+
+/*
+ * Data placement scheme of a job (compared against td->o.dp_type).
+ */
+enum {
+ FIO_DP_NONE = 0x0, /* NOTE(review): presumably "no data placement" - confirm against the option parser */
+ FIO_DP_FDP = 0x1, /* writes are tagged with FDP_DIR_DTYPE */
+ FIO_DP_STREAMS = 0x2, /* placement IDs are the user supplied stream IDs */
+};
+
+/*
+ * Per-file table of placement identifiers, plus the position used when
+ * picking the identifier for the next write.
+ */
+struct fio_ruhs_info {
+ uint32_t nr_ruhs; /* number of valid entries in plis[] */
+ uint32_t pli_loc; /* index into plis[] maintained by dp_fill_dspec_data() */
+ uint16_t plis[]; /* the placement identifiers (flexible array member) */
+};
+
+int dp_init(struct thread_data *td);
+void fdp_free_ruhs_info(struct fio_file *f);
+void dp_fill_dspec_data(struct thread_data *td, struct io_u *io_u);
+
+#endif /* FIO_DATAPLACEMENT_H */
--- /dev/null
+#include "fio.h"
+
+/**
+ * Set up the dedupe working set seeds of every job that has dedupe_global
+ * enabled. This needs to be called after all jobs' seeds have been
+ * initialized.
+ *
+ * Returns 0 on success and 1 as soon as one job fails to initialize.
+ */
+int init_global_dedupe_working_set_seeds(void)
+{
+	for_each_td(td) {
+		if (td->o.dedupe_global &&
+		    init_dedupe_working_set_seeds(td, true))
+			return 1;
+	} end_for_each();
+
+	return 0;
+}
+
+/*
+ * Allocate and fill td->dedupe_working_set_states, one random generator
+ * state per unique page of the dedupe working set.
+ *
+ * Does nothing and returns 0 unless dedupe_percentage is non-zero and the
+ * dedupe mode is DEDUPE_MODE_WORKING_SET. Element 0 is a copy of
+ * td->buf_state; each following element is the previous state advanced
+ * num_seed_advancements times. With @global_dedup set, the running state is
+ * replaced by the buf_state of the next job every pages_per_seed pages.
+ *
+ * Returns 0 on success and 1 when the allocation fails.
+ *
+ * NOTE(review): element 0 is written unconditionally, so a computed
+ * num_unique_pages of 0 would write into a zero sized allocation - confirm
+ * that option validation rules this out.
+ */
+int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedup)
+{
+ int tindex;
+ struct thread_data *td_seed;
+ unsigned long long i, j, num_seed_advancements, pages_per_seed;
+ struct frand_state dedupe_working_set_state = {0};
+
+ if (!td->o.dedupe_percentage || !(td->o.dedupe_mode == DEDUPE_MODE_WORKING_SET))
+ return 0;
+
+ tindex = td->thread_number - 1;
+ /* how many seed steps one minimum-size write block consumes */
+ num_seed_advancements = td->o.min_bs[DDIR_WRITE] /
+ min_not_zero(td->o.min_bs[DDIR_WRITE], (unsigned long long) td->o.compress_chunk);
+ /*
+ * The dedupe working set keeps seeds of unique data (generated by buf_state).
+ * Dedupe-ed pages will be generated using those seeds.
+ */
+ td->num_unique_pages = (td->o.size * (unsigned long long)td->o.dedupe_working_set_percentage / 100) / td->o.min_bs[DDIR_WRITE];
+ td->dedupe_working_set_states = malloc(sizeof(struct frand_state) * td->num_unique_pages);
+ if (!td->dedupe_working_set_states) {
+ log_err("fio: could not allocate dedupe working set\n");
+ return 1;
+ }
+
+ frand_copy(&dedupe_working_set_state, &td->buf_state);
+ frand_copy(&td->dedupe_working_set_states[0], &dedupe_working_set_state);
+ /* at least one page per seed, so the modulo below never divides by zero */
+ pages_per_seed = max(td->num_unique_pages / thread_number, 1ull);
+ for (i = 1; i < td->num_unique_pages; i++) {
+ /*
+ * When compression is used the seed is advanced multiple times to
+ * generate the buffer. We want to regenerate the same buffer when
+ * deduping against this page
+ */
+ for (j = 0; j < num_seed_advancements; j++)
+ __get_next_seed(&dedupe_working_set_state);
+
+ /*
+ * When global dedup is used, we rotate the seeds to allow
+ * generating same buffers across different jobs. Deduplication buffers
+ * are spread evenly across jobs participating in global dedupe
+ */
+ if (global_dedup && i % pages_per_seed == 0) {
+ td_seed = tnumber_to_td(++tindex % thread_number);
+ frand_copy(&dedupe_working_set_state, &td_seed->buf_state);
+ }
+
+ frand_copy(&td->dedupe_working_set_states[i], &dedupe_working_set_state);
+ }
+
+ return 0;
+}
--- /dev/null
+#ifndef DEDUPE_H
+#define DEDUPE_H
+
+int init_dedupe_working_set_seeds(struct thread_data *td, bool global_dedupe);
+int init_global_dedupe_working_set_seeds(void);
+
+#endif
+#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
static int get_io_ticks(struct disk_util *du, struct disk_util_stat *dus)
{
- unsigned in_flight;
- unsigned long long sectors[2];
char line[256];
FILE *f;
char *p;
dprint(FD_DISKUTIL, "%s: %s", du->path, p);
- ret = sscanf(p, "%llu %llu %llu %llu %llu %llu %llu %llu %u %llu %llu\n",
- (unsigned long long *) &dus->s.ios[0],
- (unsigned long long *) &dus->s.merges[0],
- §ors[0],
- (unsigned long long *) &dus->s.ticks[0],
- (unsigned long long *) &dus->s.ios[1],
- (unsigned long long *) &dus->s.merges[1],
- §ors[1],
- (unsigned long long *) &dus->s.ticks[1],
- &in_flight,
- (unsigned long long *) &dus->s.io_ticks,
- (unsigned long long *) &dus->s.time_in_queue);
+ ret = sscanf(p, "%"SCNu64" %"SCNu64" %"SCNu64" %"SCNu64" "
+ "%"SCNu64" %"SCNu64" %"SCNu64" %"SCNu64" "
+ "%*u %"SCNu64" %"SCNu64"\n",
+ &dus->s.ios[0], &dus->s.merges[0], &dus->s.sectors[0],
+ &dus->s.ticks[0],
+ &dus->s.ios[1], &dus->s.merges[1], &dus->s.sectors[1],
+ &dus->s.ticks[1],
+ &dus->s.io_ticks, &dus->s.time_in_queue);
fclose(f);
- dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 1);
- dus->s.sectors[0] = sectors[0];
- dus->s.sectors[1] = sectors[1];
- return ret != 11;
+ dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 10);
+ return ret != 10;
+}
+
+/*
+ * Return how far a counter moved from @oval to @nval, compensating for a
+ * single wrap at the unsigned 32-bit boundary.
+ *
+ * Linux kernel prints some of the stat fields as 32-bit integers, so a new
+ * value can be smaller than the old one. A plain unsigned 64-bit subtraction
+ * would then yield a huge bogus value. The correction is applied only when
+ * the old value itself fits in 32 bits, so that nothing changes should the
+ * kernel start reporting 64-bit values. Overflow is assumed to happen at
+ * most once between successive polls.
+ */
+static uint64_t safe_32bit_diff(uint64_t nval, uint64_t oval)
+{
+	const uint64_t wrap = 1ull << 32;
+	uint64_t delta = nval - oval;
+
+	if (oval > nval && oval < wrap)
+		delta += wrap;
+
+	return delta;
 }
static void update_io_tick_disk(struct disk_util *du)
dus->s.ios[1] += (__dus.s.ios[1] - ldus->s.ios[1]);
dus->s.merges[0] += (__dus.s.merges[0] - ldus->s.merges[0]);
dus->s.merges[1] += (__dus.s.merges[1] - ldus->s.merges[1]);
- dus->s.ticks[0] += (__dus.s.ticks[0] - ldus->s.ticks[0]);
- dus->s.ticks[1] += (__dus.s.ticks[1] - ldus->s.ticks[1]);
- dus->s.io_ticks += (__dus.s.io_ticks - ldus->s.io_ticks);
- dus->s.time_in_queue += (__dus.s.time_in_queue - ldus->s.time_in_queue);
+ dus->s.ticks[0] += safe_32bit_diff(__dus.s.ticks[0], ldus->s.ticks[0]);
+ dus->s.ticks[1] += safe_32bit_diff(__dus.s.ticks[1], ldus->s.ticks[1]);
+ dus->s.io_ticks += safe_32bit_diff(__dus.s.io_ticks, ldus->s.io_ticks);
+ dus->s.time_in_queue +=
+ safe_32bit_diff(__dus.s.time_in_queue, ldus->s.time_in_queue);
fio_gettime(&t, NULL);
dus->s.msec += mtime_since(&du->time, &t);
- memcpy(&du->time, &t, sizeof(t));
- memcpy(&ldus->s, &__dus.s, sizeof(__dus.s));
+ du->time = t;
+ ldus->s = __dus.s;
}
int update_io_ticks(void)
if (S_ISBLK(st.st_mode)) {
majdev = major(st.st_rdev);
mindev = minor(st.st_rdev);
- } else if (S_ISCHR(st.st_mode)) {
- majdev = major(st.st_rdev);
- mindev = minor(st.st_rdev);
- if (fio_lookup_raw(st.st_rdev, &majdev, &mindev))
- return -1;
- } else if (S_ISFIFO(st.st_mode))
+ } else if (S_ISCHR(st.st_mode) ||
+ S_ISFIFO(st.st_mode)) {
return -1;
- else {
+ } else {
majdev = major(st.st_dev);
mindev = minor(st.st_dev);
}
/*
* must be a file, open "." in that path
*/
- snprintf(tempname, ARRAY_SIZE(tempname), "%s", file_name);
+ snprintf(tempname, FIO_ARRAY_SIZE(tempname), "%s", file_name);
p = dirname(tempname);
if (stat(p, &st)) {
perror("disk util stat");
sfree(du);
return NULL;
}
- snprintf((char *) du->dus.name, ARRAY_SIZE(du->dus.name), "%s",
+ snprintf((char *) du->dus.name, FIO_ARRAY_SIZE(du->dus.name), "%s",
basename(path));
du->sysfs_root = strdup(path);
du->major = majdev;
log_err("unknown sysfs layout\n");
return NULL;
}
- snprintf(tmp, ARRAY_SIZE(tmp), "%s", p);
+ snprintf(tmp, FIO_ARRAY_SIZE(tmp), "%s", p);
sprintf(path, "%s", tmp);
}
#define FIO_DISKUTIL_H
#define FIO_DU_NAME_SZ 64
+#include <stdint.h>
+#include <limits.h>
+
#include "helper_thread.h"
#include "fio_sem.h"
-
+#include "flist.h"
+#include "lib/ieee754.h"
+
+/**
+ * @ios: Number of I/O operations that have been completed successfully.
+ * @merges: Number of I/O operations that have been merged.
+ * @sectors: I/O size in 512-byte units.
+ * @ticks: Time spent on I/O in milliseconds.
+ * @io_ticks: CPU time spent on I/O in milliseconds.
+ * @time_in_queue: Weighted time spent doing I/O in milliseconds.
+ *
+ * For the array members, index 0 refers to reads and index 1 refers to writes.
+ */
struct disk_util_stats {
uint64_t ios[2];
uint64_t merges[2];
};
/*
- * Disk utils as read in /sys/block/<dev>/stat
+ * Disk utilization as read from /sys/block/<dev>/stat
*/
struct disk_util_stat {
uint8_t name[FIO_DU_NAME_SZ];
#
# You can set these variables from the command line.
-SPHINXOPTS =
+SPHINXOPTS = -W --keep-going
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = output
version, release = fio_version()
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- ('fio_man', 'fio', 'flexible I/O tester',
+ ('fio_doc', 'fio', 'flexible I/O tester',
[author], 1)
]
========================================
-.. include:: ../README
+.. include:: ../README.rst
-.. include:: ../HOWTO
+.. include:: ../HOWTO.rst
+.. only:: not man
+ Examples
+ ========
-Examples
-========
+ .. include:: fio_examples.rst
-.. include:: fio_examples.rst
+ TODO
+ ====
-TODO
-====
+ GFIO TODO
+ ---------
-GFIO TODO
----------
+ .. include:: ../GFIO-TODO
-.. include:: ../GFIO-TODO
+ Server TODO
+ -----------
-Server TODO
------------
+ .. include:: ../SERVER-TODO
-.. include:: ../SERVER-TODO
+ Steady State TODO
+ -----------------
-Steady State TODO
------------------
+ .. include:: ../STEADYSTATE-TODO
-.. include:: ../STEADYSTATE-TODO
+ Moral License
+ =============
-Moral License
-=============
+ .. include:: ../MORAL-LICENSE
-.. include:: ../MORAL-LICENSE
+ License
+ =======
-License
-=======
-
-.. literalinclude:: ../COPYING
+ .. literalinclude:: ../COPYING
+++ /dev/null
-:orphan:
-
-Fio Manpage
-===========
-
-(rev. |release|)
-
-
-.. include:: ../README
-
-
-.. include:: ../HOWTO
--- /dev/null
+/*
+ * IO priority handling helper functions common to the libaio and io_uring
+ * engines.
+ */
+
+#include "cmdprio.h"
+
+/*
+ * Temporary array used during parsing. Will be freed after the corresponding
+ * struct bsprio_desc has been generated and saved in cmdprio->bsprio_desc.
+ */
+struct cmdprio_parse_result {
+ struct split_prio *entries; /* parsed entries; freed by fio_cmdprio_parse_and_gen_bssplit() */
+ int nr_entries; /* number of elements in entries */
+};
+
+/*
+ * Temporary array used during init. Will be freed after the corresponding
+ * struct clat_prio_stat array has been saved in td->ts.clat_prio and the
+ * matching clat_prio_indexes have been saved in each struct cmdprio_prio.
+ */
+struct cmdprio_values {
+ unsigned int *prios; /* unique priority values; slot 0 holds ts->ioprio */
+ int nr_prios; /* number of slots of prios currently in use */
+};
+
+/*
+ * Return the index of the first element of @all_prios (@nr_prios elements)
+ * that equals @prio, or -1 when there is none.
+ */
+static int find_clat_prio_index(unsigned int *all_prios, int nr_prios,
+				int32_t prio)
+{
+	int pos = 0;
+
+	while (pos < nr_prios && all_prios[pos] != prio)
+		pos++;
+
+	return pos < nr_prios ? pos : -1;
+}
+
+/**
+ * assign_clat_prio_index - Store in @prio the index of its priority value
+ * within @values, appending the value to @values first when it is not there
+ * yet. Saving the index in each cmdprio_prio spares stat.c the need to loop
+ * through all possible priorities each time add_clat_sample() /
+ * add_lat_sample() is called. The index will later be propagated to the
+ * io_u, if the specific io_u was determined to use a cmdprio priority value.
+ */
+static void assign_clat_prio_index(struct cmdprio_prio *prio,
+				   struct cmdprio_values *values)
+{
+	int slot = find_clat_prio_index(values->prios, values->nr_prios,
+					prio->prio);
+
+	if (slot == -1) {
+		/* first time this value is seen: it takes the next free slot */
+		slot = values->nr_prios++;
+		values->prios[slot] = prio->prio;
+	}
+
+	prio->clat_prio_index = slot;
+}
+
+/**
+ * init_cmdprio_values - Allocate the temporary array that collects all
+ * unique priorities (per ddir), so that assign_clat_prio_index() can be run
+ * for each cmdprio_prio during setup. One slot more than @max_unique_prios
+ * is allocated because slot 0 is taken by the job's own priority. The
+ * caller frees the array after setup.
+ *
+ * Returns 0 on success and 1 when the allocation fails.
+ */
+static int init_cmdprio_values(struct cmdprio_values *values,
+			       int max_unique_prios, struct thread_stat *ts)
+{
+	unsigned int *slots = calloc(max_unique_prios + 1, sizeof(*slots));
+
+	values->prios = slots;
+	if (!slots)
+		return 1;
+
+	/* td->ioprio/ts->ioprio is always stored at index 0. */
+	slots[0] = ts->ioprio;
+	values->nr_prios++;
+
+	return 0;
+}
+
+/**
+ * init_ts_clat_prio - Allocate the clat_prio_stat array of @ts for @ddir with
+ * one element per unique priority in @values, and copy each priority value
+ * into the ioprio field of its element.
+ *
+ * Returns 0 on success and 1 when the allocation fails.
+ */
+static int init_ts_clat_prio(struct thread_stat *ts, enum fio_ddir ddir,
+			     struct cmdprio_values *values)
+{
+	int slot = 0;
+
+	if (alloc_clat_prio_stat_ddir(ts, ddir, values->nr_prios))
+		return 1;
+
+	while (slot < values->nr_prios) {
+		ts->clat_prio[ddir][slot].ioprio = values->prios[slot];
+		slot++;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill @bsprio from the parsed entries @entries[@start..@end] (inclusive),
+ * which all share one block size.
+ *
+ * An entry whose prio is -1 gets @implicit_cmdprio. Every priority is also
+ * registered in @values through assign_clat_prio_index(). The percentages
+ * are added up in bsprio->tot_perc, which must not exceed 100.
+ *
+ * Returns 0 on success and 1 on failure. On failure no allocation is left
+ * behind in @bsprio: the prios pointer is reset so that the caller-visible
+ * struct never holds a freed pointer.
+ */
+static int fio_cmdprio_fill_bsprio(struct cmdprio_bsprio *bsprio,
+				   struct split_prio *entries,
+				   struct cmdprio_values *values,
+				   int implicit_cmdprio, int start, int end)
+{
+	struct cmdprio_prio *prio;
+	int i = end - start + 1;
+
+	bsprio->prios = calloc(i, sizeof(*bsprio->prios));
+	if (!bsprio->prios)
+		return 1;
+
+	bsprio->bs = entries[start].bs;
+	bsprio->nr_prios = 0;
+	for (i = start; i <= end; i++) {
+		prio = &bsprio->prios[bsprio->nr_prios];
+		prio->perc = entries[i].perc;
+		if (entries[i].prio == -1)
+			prio->prio = implicit_cmdprio;
+		else
+			prio->prio = entries[i].prio;
+		assign_clat_prio_index(prio, values);
+		bsprio->tot_perc += entries[i].perc;
+		if (bsprio->tot_perc > 100) {
+			log_err("fio: cmdprio_bssplit total percentage "
+				"for bs: %"PRIu64" exceeds 100\n",
+				bsprio->bs);
+			free(bsprio->prios);
+			/* do not leave a dangling pointer in the descriptor */
+			bsprio->prios = NULL;
+			bsprio->nr_prios = 0;
+			return 1;
+		}
+		bsprio->nr_prios++;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert the parsed cmdprio_bssplit entries in @parse_res into
+ * @bsprio_desc: one cmdprio_bsprio per distinct block size, each holding all
+ * entries given for that size.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int
+fio_cmdprio_generate_bsprio_desc(struct cmdprio_bsprio_desc *bsprio_desc,
+				 struct cmdprio_parse_result *parse_res,
+				 struct cmdprio_values *values,
+				 int implicit_cmdprio)
+{
+	struct split_prio *entries = parse_res->entries;
+	int nr_entries = parse_res->nr_entries;
+	int nr_sizes = 0, first = 0, last;
+
+	/*
+	 * The parsed result is sorted by blocksize, so equal sizes form runs.
+	 * Counting the last entry of every run gives the number of
+	 * cmdprio_bsprio we need.
+	 */
+	for (last = 0; last < nr_entries; last++) {
+		if (last + 1 == nr_entries ||
+		    entries[last].bs != entries[last + 1].bs)
+			nr_sizes++;
+	}
+
+	/*
+	 * This allocation is not freed on error. Instead, the calling function
+	 * is responsible for calling fio_cmdprio_cleanup() on error.
+	 */
+	bsprio_desc->bsprios = calloc(nr_sizes, sizeof(*bsprio_desc->bsprios));
+	if (!bsprio_desc->bsprios)
+		return 1;
+
+	bsprio_desc->nr_bsprios = 0;
+	for (last = 0; last < nr_entries; last++) {
+		struct cmdprio_bsprio *bsprio;
+
+		/* keep going until @last is the final entry of its run */
+		if (last + 1 < nr_entries &&
+		    entries[last].bs == entries[last + 1].bs)
+			continue;
+
+		/*
+		 * All parsed entries with the same blocksize get saved in the
+		 * same cmdprio_bsprio, to expedite the search in the hot path.
+		 */
+		bsprio = &bsprio_desc->bsprios[bsprio_desc->nr_bsprios];
+		if (fio_cmdprio_fill_bsprio(bsprio, entries, values,
+					    implicit_cmdprio, first, last))
+			return 1;
+
+		first = last + 1;
+		bsprio_desc->nr_bsprios++;
+	}
+
+	return 0;
+}
+
+/*
+ * Per-direction callback handed to str_split_parse(): parses @str into the
+ * cmdprio_parse_result slot of @ddir inside the array passed as @cb_arg.
+ * Trims are skipped. Returns 0 on success and 1 on a parse error.
+ */
+static int fio_cmdprio_bssplit_ddir(struct thread_options *to, void *cb_arg,
+				    enum fio_ddir ddir, char *str, bool data)
+{
+	struct cmdprio_parse_result *slot;
+
+	if (ddir == DDIR_TRIM)
+		return 0;
+
+	slot = (struct cmdprio_parse_result *)cb_arg + ddir;
+
+	return split_parse_prio_ddir(to, &slot->entries,
+				     &slot->nr_entries, str) ? 1 : 0;
+}
+
+/*
+ * Parse the cmdprio_bssplit option string @input into @parse_res (an array
+ * with one slot per data direction).
+ *
+ * The string is parsed from a private copy with leading and trailing blanks
+ * stripped; @input itself is left untouched.
+ *
+ * Returns the result of str_split_parse(), or 1 when the copy cannot be
+ * allocated.
+ */
+static int fio_cmdprio_bssplit_parse(struct thread_data *td, const char *input,
+				     struct cmdprio_parse_result *parse_res)
+{
+	char *str, *p;
+	int ret = 0;
+
+	/* strip_blank_front() moves str, so keep the original pointer for free() */
+	p = str = strdup(input);
+	if (!str)
+		return 1;
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	ret = str_split_parse(td, str, fio_cmdprio_bssplit_ddir, parse_res,
+			      false);
+
+	free(p);
+	return ret;
+}
+
+/**
+ * fio_cmdprio_percentage - Returns the percentage of I/Os that should
+ * use a cmdprio priority value (rather than the default context priority).
+ *
+ * For CMDPRIO_MODE_BSSPLIT, if the percentage is non-zero, the matching
+ * bsprio is handed back through @bsprio as well, to avoid the same linear
+ * search elsewhere. In every other case @bsprio is set to NULL.
+ */
+static int fio_cmdprio_percentage(struct cmdprio *cmdprio, struct io_u *io_u,
+				  struct cmdprio_bsprio **bsprio)
+{
+	enum fio_ddir ddir = io_u->ddir;
+	struct cmdprio_bsprio_desc *desc;
+	int i;
+
+	/* Overwritten below only when a matching bssplit entry exists. */
+	*bsprio = NULL;
+
+	if (cmdprio->mode == CMDPRIO_MODE_PERC)
+		return cmdprio->perc_entry[ddir].perc;
+
+	if (cmdprio->mode != CMDPRIO_MODE_BSSPLIT) {
+		/*
+		 * An I/O engine should never call this function if cmdprio
+		 * is not in use.
+		 */
+		assert(0);
+		return 0;
+	}
+
+	desc = &cmdprio->bsprio_desc[ddir];
+	for (i = 0; i < desc->nr_bsprios; i++) {
+		struct cmdprio_bsprio *candidate = &desc->bsprios[i];
+
+		if (candidate->bs == io_u->buflen) {
+			*bsprio = candidate;
+			return candidate->tot_perc;
+		}
+	}
+
+	/*
+	 * This is totally fine, the given blocksize simply does not
+	 * have any (non-zero) cmdprio_bssplit entries defined.
+	 */
+	return 0;
+}
+
+/**
+ * fio_cmdprio_set_ioprio - Set an io_u ioprio according to cmdprio options
+ *
+ * Draws a random percentage value to decide whether this io_u should use a
+ * cmdprio priority value (rather than the default priority). If the drawn
+ * value falls within the user specified percentage, the io_u is updated
+ * with an ioprio value as defined by the cmdprio/cmdprio_hint/cmdprio_class
+ * or cmdprio_bssplit options, together with the matching clat_prio_index.
+ *
+ * Return true if the io_u ioprio was changed and false otherwise.
+ */
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+			    struct io_u *io_u)
+{
+	struct cmdprio_bsprio *bsprio;
+	unsigned int threshold, roll;
+	uint32_t cumulative = 0;
+	int i;
+
+	threshold = fio_cmdprio_percentage(cmdprio, io_u, &bsprio);
+	if (!threshold)
+		return false;
+
+	roll = rand_between(&td->prio_state, 0, 99);
+	if (roll >= threshold)
+		return false;
+
+	if (cmdprio->mode == CMDPRIO_MODE_PERC) {
+		struct cmdprio_prio *entry = &cmdprio->perc_entry[io_u->ddir];
+
+		io_u->ioprio = entry->prio;
+		io_u->clat_prio_index = entry->clat_prio_index;
+		return true;
+	}
+
+	if (cmdprio->mode == CMDPRIO_MODE_BSSPLIT) {
+		assert(bsprio);
+		/* the entry whose cumulative percentage first exceeds the roll wins */
+		for (i = 0; i < bsprio->nr_prios; i++) {
+			struct cmdprio_prio *entry = &bsprio->prios[i];
+
+			cumulative += entry->perc;
+			if (roll < cumulative) {
+				io_u->ioprio = entry->prio;
+				io_u->clat_prio_index = entry->clat_prio_index;
+				return true;
+			}
+		}
+	} else {
+		assert(0);
+	}
+
+	/* When roll < threshold (total perc), we should always find a cmdprio_prio. */
+	assert(0);
+	return false;
+}
+
+/*
+ * Set up cmdprio for CMDPRIO_MODE_PERC: for every enabled data direction
+ * with a non-zero percentage, fill cmdprio->perc_entry[ddir] from the
+ * options and allocate the matching clat_prio array in td->ts.
+ *
+ * Returns 0 on success. On failure the temporary value arrays are freed,
+ * free_clat_prio_stats() is called on td->ts, and the non-zero error is
+ * returned.
+ */
+static int fio_cmdprio_gen_perc(struct thread_data *td, struct cmdprio *cmdprio)
+{
+ struct cmdprio_options *options = cmdprio->options;
+ struct cmdprio_prio *prio;
+ struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {};
+ struct thread_stat *ts = &td->ts;
+ enum fio_ddir ddir;
+ int ret;
+
+ for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+ /*
+ * Do not allocate a clat_prio array nor set the cmdprio struct
+ * if zero percent of the I/Os (for the ddir) should use a
+ * cmdprio priority value, or when the ddir is not enabled.
+ */
+ if (!options->percentage[ddir] ||
+ (ddir == DDIR_READ && !td_read(td)) ||
+ (ddir == DDIR_WRITE && !td_write(td)))
+ continue;
+
+ /* a single cmdprio value per direction in this mode */
+ ret = init_cmdprio_values(&values[ddir], 1, ts);
+ if (ret)
+ goto err;
+
+ prio = &cmdprio->perc_entry[ddir];
+ prio->perc = options->percentage[ddir];
+ prio->prio = ioprio_value(options->class[ddir],
+ options->level[ddir],
+ options->hint[ddir]);
+ assign_clat_prio_index(prio, &values[ddir]);
+
+ ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+ if (ret)
+ goto err;
+
+ /* the values array was only needed to build the clat_prio array */
+ free(values[ddir].prios);
+ values[ddir].prios = NULL;
+ values[ddir].nr_prios = 0;
+ }
+
+ return 0;
+
+err:
+ /* entries already released above are NULL, so freeing them again is harmless */
+ for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++)
+ free(values[ddir].prios);
+ free_clat_prio_stats(ts);
+
+ return ret;
+}
+
+/*
+ * Set up cmdprio for CMDPRIO_MODE_BSSPLIT: parse the cmdprio_bssplit string
+ * and, for every enabled data direction that has entries, build
+ * cmdprio->bsprio_desc[ddir] and allocate the matching clat_prio array in
+ * td->ts.
+ *
+ * Returns 0 on success. On failure the temporary parse results and value
+ * arrays are freed, free_clat_prio_stats() and fio_cmdprio_cleanup() are
+ * called, and the non-zero error is returned.
+ */
+static int fio_cmdprio_parse_and_gen_bssplit(struct thread_data *td,
+ struct cmdprio *cmdprio)
+{
+ struct cmdprio_options *options = cmdprio->options;
+ struct cmdprio_parse_result parse_res[CMDPRIO_RWDIR_CNT] = {};
+ struct cmdprio_values values[CMDPRIO_RWDIR_CNT] = {};
+ struct thread_stat *ts = &td->ts;
+ int ret, implicit_cmdprio;
+ enum fio_ddir ddir;
+
+ ret = fio_cmdprio_bssplit_parse(td, options->bssplit_str,
+ &parse_res[0]);
+ if (ret)
+ goto err;
+
+ for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+ /*
+ * Do not allocate a clat_prio array nor set the cmdprio structs
+ * if there are no non-zero entries (for the ddir), or when the
+ * ddir is not enabled.
+ */
+ if (!parse_res[ddir].nr_entries ||
+ (ddir == DDIR_READ && !td_read(td)) ||
+ (ddir == DDIR_WRITE && !td_write(td))) {
+ free(parse_res[ddir].entries);
+ parse_res[ddir].entries = NULL;
+ parse_res[ddir].nr_entries = 0;
+ continue;
+ }
+
+ /* at most one unique priority per parsed entry */
+ ret = init_cmdprio_values(&values[ddir],
+ parse_res[ddir].nr_entries, ts);
+ if (ret)
+ goto err;
+
+ /* priority given to the entries whose parsed prio is -1 */
+ implicit_cmdprio = ioprio_value(options->class[ddir],
+ options->level[ddir],
+ options->hint[ddir]);
+
+ ret = fio_cmdprio_generate_bsprio_desc(&cmdprio->bsprio_desc[ddir],
+ &parse_res[ddir],
+ &values[ddir],
+ implicit_cmdprio);
+ if (ret)
+ goto err;
+
+ /* the parsed entries now live on in bsprio_desc */
+ free(parse_res[ddir].entries);
+ parse_res[ddir].entries = NULL;
+ parse_res[ddir].nr_entries = 0;
+
+ ret = init_ts_clat_prio(ts, ddir, &values[ddir]);
+ if (ret)
+ goto err;
+
+ free(values[ddir].prios);
+ values[ddir].prios = NULL;
+ values[ddir].nr_prios = 0;
+ }
+
+ return 0;
+
+err:
+ /* pointers already released above are NULL, so freeing them again is harmless */
+ for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+ free(parse_res[ddir].entries);
+ free(values[ddir].prios);
+ }
+ free_clat_prio_stats(ts);
+ fio_cmdprio_cleanup(cmdprio);
+
+ return ret;
+}
+
+/*
+ * Apply the default priority class and run the setup routine that belongs
+ * to cmdprio->mode. Returns 0 on success and non-zero on failure.
+ */
+static int fio_cmdprio_parse_and_gen(struct thread_data *td,
+				     struct cmdprio *cmdprio)
+{
+	struct cmdprio_options *options = cmdprio->options;
+	int ddir;
+
+	/*
+	 * If cmdprio_percentage/cmdprio_bssplit is set and cmdprio_class
+	 * is not set, default to RT priority class.
+	 *
+	 * A cmdprio value is only used when fio_cmdprio_percentage()
+	 * returns non-zero, so it is safe to set a class even for a
+	 * DDIR that will never use it.
+	 */
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		if (options->class[ddir])
+			continue;
+		options->class[ddir] = IOPRIO_CLASS_RT;
+	}
+
+	if (cmdprio->mode == CMDPRIO_MODE_BSSPLIT)
+		return fio_cmdprio_parse_and_gen_bssplit(td, cmdprio);
+
+	if (cmdprio->mode == CMDPRIO_MODE_PERC)
+		return fio_cmdprio_gen_perc(td, cmdprio);
+
+	assert(0);
+	return 1;
+}
+
+/*
+ * Free the per-direction bsprio descriptors of @cmdprio and detach it from
+ * its options.
+ */
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio)
+{
+	enum fio_ddir ddir;
+
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		struct cmdprio_bsprio_desc *desc = &cmdprio->bsprio_desc[ddir];
+		int i = 0;
+
+		while (i < desc->nr_bsprios) {
+			free(desc->bsprios[i].prios);
+			i++;
+		}
+
+		free(desc->bsprios);
+		desc->bsprios = NULL;
+		desc->nr_bsprios = 0;
+	}
+
+	/*
+	 * options points to a cmdprio_options struct that is part of td->eo.
+	 * td->eo itself will be freed by free_ioengine().
+	 */
+	cmdprio->options = NULL;
+}
+
+/*
+ * Attach @options to @cmdprio, work out which cmdprio mode the options ask
+ * for, and run the matching setup.
+ *
+ * Returns 0 on success (including when cmdprio is not used at all) and
+ * non-zero on failure, e.g. when both cmdprio_percentage and
+ * cmdprio_bssplit are given.
+ */
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+		     struct cmdprio_options *options)
+{
+	bool use_bssplit, use_perc = false;
+	int ddir;
+
+	cmdprio->options = options;
+
+	use_bssplit = options->bssplit_str && options->bssplit_str[0] != '\0';
+
+	for (ddir = 0; ddir < CMDPRIO_RWDIR_CNT; ddir++) {
+		if (options->percentage[ddir]) {
+			use_perc = true;
+			break;
+		}
+	}
+
+	/*
+	 * Check for option conflicts
+	 */
+	if (use_perc && use_bssplit) {
+		log_err("%s: cmdprio_percentage and cmdprio_bssplit options "
+			"are mutually exclusive\n",
+			td->o.name);
+		return 1;
+	}
+
+	if (use_bssplit) {
+		cmdprio->mode = CMDPRIO_MODE_BSSPLIT;
+	} else if (use_perc) {
+		cmdprio->mode = CMDPRIO_MODE_PERC;
+	} else {
+		/* Nothing left to do if cmdprio is not used */
+		cmdprio->mode = CMDPRIO_MODE_NONE;
+		return 0;
+	}
+
+	return fio_cmdprio_parse_and_gen(td, cmdprio);
+}
--- /dev/null
+/*
+ * IO priority handling declarations and helper functions common to the
+ * libaio and io_uring engines.
+ */
+
+#ifndef FIO_CMDPRIO_H
+#define FIO_CMDPRIO_H
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+/* read and writes only, no trim */
+#define CMDPRIO_RWDIR_CNT 2
+
+/* How cmdprio priorities are selected; stored in struct cmdprio's mode. */
+enum {
+ CMDPRIO_MODE_NONE, /* neither cmdprio_percentage nor cmdprio_bssplit is set */
+ CMDPRIO_MODE_PERC, /* driven by cmdprio_percentage */
+ CMDPRIO_MODE_BSSPLIT, /* driven by cmdprio_bssplit */
+};
+
+/* One cmdprio priority value together with its share of the I/Os. */
+struct cmdprio_prio {
+ int32_t prio; /* priority value copied into io_u->ioprio */
+ uint32_t perc; /* percentage of I/Os that should use this priority */
+ uint16_t clat_prio_index; /* index of this priority in the per-ddir clat_prio array */
+};
+
+/* All cmdprio_bssplit priorities that were given for one block size. */
+struct cmdprio_bsprio {
+ uint64_t bs; /* block size these priorities apply to */
+ uint32_t tot_perc; /* sum of the perc values of all entries in prios */
+ unsigned int nr_prios; /* number of elements in prios */
+ struct cmdprio_prio *prios; /* heap allocated array of priorities */
+};
+
+/* The cmdprio_bssplit setup of one data direction. */
+struct cmdprio_bsprio_desc {
+ struct cmdprio_bsprio *bsprios; /* heap allocated array, one element per distinct block size */
+ unsigned int nr_bsprios; /* number of elements in bsprios */
+};
+
+/*
+ * Raw values of the cmdprio options. The arrays are indexed by data
+ * direction (DDIR_READ, DDIR_WRITE).
+ */
+struct cmdprio_options {
+ unsigned int percentage[CMDPRIO_RWDIR_CNT]; /* cmdprio_percentage */
+ unsigned int class[CMDPRIO_RWDIR_CNT]; /* cmdprio_class */
+ unsigned int level[CMDPRIO_RWDIR_CNT]; /* cmdprio */
+ unsigned int hint[CMDPRIO_RWDIR_CNT]; /* cmdprio_hint */
+ char *bssplit_str; /* cmdprio_bssplit, as given by the user */
+};
+
+/*
+ * CMDPRIO_OPTIONS - expands to the initializers of the five cmdprio options
+ * (cmdprio_percentage, cmdprio_class, cmdprio_hint, cmdprio and
+ * cmdprio_bssplit).
+ *
+ * @opt_struct: options struct type; it must have a struct cmdprio_options
+ * member named cmdprio_options, since the option offsets are taken from it.
+ * For the per-direction options off1 refers to the read value and off2 to
+ * the write value.
+ * @opt_group: value stored in the .group field of every entry.
+ *
+ * Without FIO_HAVE_IOPRIO_CLASS the same five names are emitted as
+ * FIO_OPT_UNSUPPORTED entries.
+ */
+#ifdef FIO_HAVE_IOPRIO_CLASS
+#define CMDPRIO_OPTIONS(opt_struct, opt_group) \
+ { \
+ .name = "cmdprio_percentage", \
+ .lname = "high priority percentage", \
+ .type = FIO_OPT_INT, \
+ .off1 = offsetof(opt_struct, \
+ cmdprio_options.percentage[DDIR_READ]), \
+ .off2 = offsetof(opt_struct, \
+ cmdprio_options.percentage[DDIR_WRITE]), \
+ .minval = 0, \
+ .maxval = 100, \
+ .help = "Send high priority I/O this percentage of the time", \
+ .category = FIO_OPT_C_ENGINE, \
+ .group = opt_group, \
+ }, \
+ { \
+ .name = "cmdprio_class", \
+ .lname = "Asynchronous I/O priority class", \
+ .type = FIO_OPT_INT, \
+ .off1 = offsetof(opt_struct, \
+ cmdprio_options.class[DDIR_READ]), \
+ .off2 = offsetof(opt_struct, \
+ cmdprio_options.class[DDIR_WRITE]), \
+ .help = "Set asynchronous IO priority class", \
+ .minval = IOPRIO_MIN_PRIO_CLASS + 1, \
+ .maxval = IOPRIO_MAX_PRIO_CLASS, \
+ .interval = 1, \
+ .category = FIO_OPT_C_ENGINE, \
+ .group = opt_group, \
+ }, \
+ { \
+ .name = "cmdprio_hint", \
+ .lname = "Asynchronous I/O priority hint", \
+ .type = FIO_OPT_INT, \
+ .off1 = offsetof(opt_struct, \
+ cmdprio_options.hint[DDIR_READ]), \
+ .off2 = offsetof(opt_struct, \
+ cmdprio_options.hint[DDIR_WRITE]), \
+ .help = "Set asynchronous IO priority hint", \
+ .minval = IOPRIO_MIN_PRIO_HINT, \
+ .maxval = IOPRIO_MAX_PRIO_HINT, \
+ .interval = 1, \
+ .category = FIO_OPT_C_ENGINE, \
+ .group = opt_group, \
+ }, \
+ { \
+ .name = "cmdprio", \
+ .lname = "Asynchronous I/O priority level", \
+ .type = FIO_OPT_INT, \
+ .off1 = offsetof(opt_struct, \
+ cmdprio_options.level[DDIR_READ]), \
+ .off2 = offsetof(opt_struct, \
+ cmdprio_options.level[DDIR_WRITE]), \
+ .help = "Set asynchronous IO priority level", \
+ .minval = IOPRIO_MIN_PRIO, \
+ .maxval = IOPRIO_MAX_PRIO, \
+ .interval = 1, \
+ .category = FIO_OPT_C_ENGINE, \
+ .group = opt_group, \
+ }, \
+ { \
+ .name = "cmdprio_bssplit", \
+ .lname = "Priority percentage block size split", \
+ .type = FIO_OPT_STR_STORE, \
+ .off1 = offsetof(opt_struct, cmdprio_options.bssplit_str), \
+ .help = "Set priority percentages for different block sizes", \
+ .category = FIO_OPT_C_ENGINE, \
+ .group = opt_group, \
+ }
+#else
+#define CMDPRIO_OPTIONS(opt_struct, opt_group) \
+ { \
+ .name = "cmdprio_percentage", \
+ .lname = "high priority percentage", \
+ .type = FIO_OPT_UNSUPPORTED, \
+ .help = "Platform does not support I/O priority classes", \
+ }, \
+ { \
+ .name = "cmdprio_class", \
+ .lname = "Asynchronous I/O priority class", \
+ .type = FIO_OPT_UNSUPPORTED, \
+ .help = "Platform does not support I/O priority classes", \
+ }, \
+ { \
+ .name = "cmdprio_hint", \
+ .lname = "Asynchronous I/O priority hint", \
+ .type = FIO_OPT_UNSUPPORTED, \
+ .help = "Platform does not support I/O priority classes", \
+ }, \
+ { \
+ .name = "cmdprio", \
+ .lname = "Asynchronous I/O priority level", \
+ .type = FIO_OPT_UNSUPPORTED, \
+ .help = "Platform does not support I/O priority classes", \
+ }, \
+ { \
+ .name = "cmdprio_bssplit", \
+ .lname = "Priority percentage block size split", \
+ .type = FIO_OPT_UNSUPPORTED, \
+ .help = "Platform does not support I/O priority classes", \
+ }
+#endif
+
+/* Run-time cmdprio state, set up by fio_cmdprio_init(). */
+struct cmdprio {
+ struct cmdprio_options *options; /* option values this state was built from; not owned */
+ struct cmdprio_prio perc_entry[CMDPRIO_RWDIR_CNT]; /* per-ddir priority for CMDPRIO_MODE_PERC */
+ struct cmdprio_bsprio_desc bsprio_desc[CMDPRIO_RWDIR_CNT]; /* per-ddir setup for CMDPRIO_MODE_BSSPLIT */
+ unsigned int mode; /* one of the CMDPRIO_MODE_* values */
+};
+
+bool fio_cmdprio_set_ioprio(struct thread_data *td, struct cmdprio *cmdprio,
+ struct io_u *io_u);
+
+void fio_cmdprio_cleanup(struct cmdprio *cmdprio);
+
+int fio_cmdprio_init(struct thread_data *td, struct cmdprio *cmdprio,
+ struct cmdprio_options *options);
+
+#endif
#include "../fio.h"
#include "../optgroup.h"
+// number of 32 bit integers to sort
+size_t qsort_size = (256 * (1ULL << 10)); // 256Ki elements, i.e. 1 MiB of int32_t data
+
+/* State of the multiply-with-carry generator advanced by mwc32() */
+struct mwc {
+ uint32_t w;
+ uint32_t z;
+};
+
+/* Values stored by the "cpumode" option ("noop" / "qsort") */
+enum stress_mode {
+ FIO_CPU_NOOP = 0,
+ FIO_CPU_QSORT = 1,
+};
+
struct cpu_options {
void *pad;
unsigned int cpuload;
unsigned int cpucycle;
+ enum stress_mode cpumode;
unsigned int exit_io_done;
+ int32_t *qsort_data;
};
static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "cpumode",
+ .lname = "cpumode",
+ .type = FIO_OPT_STR,
+ .help = "Stress mode",
+ .off1 = offsetof(struct cpu_options, cpumode),
+ .def = "noop",
+ .posval = {
+ { .ival = "noop",
+ .oval = FIO_CPU_NOOP,
+ .help = "NOOP instructions",
+ },
+ { .ival = "qsort",
+ .oval = FIO_CPU_QSORT,
+ .help = "QSORT computation",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "cpuchunks",
.lname = "CPU chunk",
},
};
+/*
+ * mwc32()
+ *	Multiply-with-carry pseudo random number generator, see
+ *	http://www.cse.yorku.ca/~oz/marsaglia-rng.html
+ *
+ *	Updates mwc->z and mwc->w in place and returns (z << 16) + w
+ *	computed from the updated state.
+ */
+uint32_t mwc32(struct mwc *mwc)
+{
+	const uint32_t z_low = mwc->z & 65535;
+	const uint32_t z_high = mwc->z >> 16;
+	const uint32_t w_low = mwc->w & 65535;
+	const uint32_t w_high = mwc->w >> 16;
+
+	mwc->z = 36969 * z_low + z_high;
+	mwc->w = 18000 * w_low + w_high;
+
+	return (mwc->z << 16) + mwc->w;
+}
+
+/*
+ * stress_qsort_cmp_1()
+ *	qsort comparator, ascending order on int32 values.
+ *	Returns 1, -1 or 0 when *p1 is greater than, less than or equal
+ *	to *p2.
+ */
+static int stress_qsort_cmp_1(const void *p1, const void *p2)
+{
+	const int32_t lhs = *(const int32_t *)p1;
+	const int32_t rhs = *(const int32_t *)p2;
+
+	/* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * stress_qsort_cmp_2()
+ *	qsort comparator, descending order on int32 values: the same
+ *	three-way comparison as the ascending one with the operands
+ *	swapped.
+ */
+static int stress_qsort_cmp_2(const void *p1, const void *p2)
+{
+	const int32_t *first = (const int32_t *)p2;
+	const int32_t *second = (const int32_t *)p1;
+
+	if (*first > *second)
+		return 1;
+	if (*first < *second)
+		return -1;
+	return 0;
+}
+
+/*
+ * stress_qsort_cmp_3()
+ *	qsort comparator on int8 values: returns the signed difference
+ *	*p1 - *p2, so the result is negative, zero or positive.
+ */
+static int stress_qsort_cmp_3(const void *p1, const void *p2)
+{
+	/* widen first so the subtraction is done on plain ints */
+	const int lhs = *(const int8_t *)p1;
+	const int rhs = *(const int8_t *)p2;
+
+	/* Force re-ordering on 8 bit value */
+	return lhs - rhs;
+}
+
+/*
+ * do_qsort()
+ *	Run one timed round of qsort() passes over co->qsort_data, then
+ *	store the elapsed time in co->cpucycle and derive the next
+ *	thinktime from it. Always returns 0.
+ */
+static int do_qsort(struct thread_data *td)
+{
+	struct cpu_options *co = td->eo;
+	struct thread_options *o = &td->o;
+	int32_t *data = co->qsort_data;
+	const size_t elem_size = sizeof(*data);
+	struct timespec begin, end;
+
+	fio_get_mono_time(&begin);
+
+	/* ascending, then descending, on the 32 bit values */
+	qsort(data, qsort_size, elem_size, stress_qsort_cmp_1);
+	qsort(data, qsort_size, elem_size, stress_qsort_cmp_2);
+
+	/* same buffer viewed as single bytes */
+	qsort((uint8_t *)data, qsort_size * elem_size, sizeof(uint8_t),
+	      stress_qsort_cmp_3);
+
+	/* and descending on the 32 bit values once more */
+	qsort(data, qsort_size, elem_size, stress_qsort_cmp_2);
+
+	fio_get_mono_time(&end);
+
+	/*
+	 * The duration of a round varies with job concurrency and with
+	 * clock changes made by power management, so cpucycle is refreshed
+	 * after every round and the next thinktime is recomputed from it to
+	 * stay as close as possible to the requested cpuload.
+	 */
+	co->cpucycle = utime_since(&begin, &end);
+	o->thinktime = ((unsigned long long) co->cpucycle *
+			(100 - co->cpuload)) / co->cpuload;
+
+	return 0;
+}
static enum fio_q_status fio_cpuio_queue(struct thread_data *td,
struct io_u fio_unused *io_u)
return FIO_Q_BUSY;
}
- usec_spin(co->cpucycle);
+ switch (co->cpumode) {
+ case FIO_CPU_NOOP:
+ usec_spin(co->cpucycle);
+ break;
+ case FIO_CPU_QSORT:
+ do_qsort(td);
+ break;
+ }
+
return FIO_Q_COMPLETED;
}
+/*
+ * noop_init()
+ *	Nothing to prepare for the noop mode: log the job settings and
+ *	return 0.
+ */
+static int noop_init(struct thread_data *td)
+{
+	struct cpu_options *opts = td->eo;
+	unsigned int load = opts->cpuload;
+	unsigned int cycle = opts->cpucycle;
+
+	log_info("%s (noop): ioengine=%s, cpuload=%u, cpucycle=%u\n",
+		 td->o.name, td->io_ops->name, load, cycle);
+
+	return 0;
+}
+
+/*
+ * qsort_cleanup()
+ *	Release the buffer allocated by qsort_init() and clear the pointer.
+ *	Always returns 0.
+ */
+static int qsort_cleanup(struct thread_data *td)
+{
+	struct cpu_options *opts = td->eo;
+
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(opts->qsort_data);
+	opts->qsort_data = NULL;
+
+	return 0;
+}
+
+/*
+ * qsort_init()
+ *	Allocate qsort_size int32 values and fill them from mwc32() seeded
+ *	with fixed constants. Returns 0 on success, 1 (after reporting
+ *	ENOMEM) if the allocation fails.
+ */
+static int qsort_init(struct thread_data *td)
+{
+	struct cpu_options *co = td->eo;
+	/* Setting up a default entropy */
+	struct mwc rng = { 521288629UL, 362436069UL };
+	int32_t *cur, *end;
+
+	co->qsort_data = calloc(qsort_size, sizeof(*co->qsort_data));
+	if (!co->qsort_data) {
+		td_verror(td, ENOMEM, "qsort_init");
+		return 1;
+	}
+
+	/* This is expensive, init the memory once */
+	end = co->qsort_data + qsort_size;
+	for (cur = co->qsort_data; cur < end; cur++)
+		*cur = mwc32(&rng);
+
+	log_info("%s (qsort): ioengine=%s, cpuload=%u, cpucycle=%u\n",
+		 td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+
+	return 0;
+}
+
static int fio_cpuio_init(struct thread_data *td)
{
struct thread_options *o = &td->o;
struct cpu_options *co = td->eo;
+ int td_previous_state;
+ char *msg;
if (!co->cpuload) {
td_vmsg(td, EINVAL, "cpu thread needs rate (cpuload=)","cpuio");
if (co->cpuload > 100)
co->cpuload = 100;
+ /* Saving the current thread state */
+ td_previous_state = td->runstate;
+
+ /* Reporting that we are preparing the engine
+ * This is useful as the qsort() calibration takes time
+ * This prevents the job from starting before init is completed
+ */
+ td_set_runstate(td, TD_SETTING_UP);
+
/*
* set thinktime_sleep and thinktime_spin appropriately
*/
o->thinktime_blocks = 1;
+ o->thinktime_blocks_type = THINKTIME_BLOCKS_TYPE_COMPLETE;
o->thinktime_spin = 0;
- o->thinktime = ((unsigned long long) co->cpucycle * (100 - co->cpuload)) / co->cpuload;
+ o->thinktime = ((unsigned long long) co->cpucycle *
+ (100 - co->cpuload)) / co->cpuload;
o->nr_files = o->open_files = 1;
- log_info("%s: ioengine=%s, cpuload=%u, cpucycle=%u\n",
- td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
+ switch (co->cpumode) {
+ case FIO_CPU_NOOP:
+ noop_init(td);
+ break;
+ case FIO_CPU_QSORT:
+ qsort_init(td);
+ break;
+ default:
+ if (asprintf(&msg, "bad cpu engine mode: %d", co->cpumode) < 0)
+ msg = NULL;
+ td_vmsg(td, EINVAL, msg ? : "(?)", __func__);
+ free(msg);
+ return 1;
+ }
+ /* Let's restore the previous state. */
+ td_set_runstate(td, td_previous_state);
return 0;
}
+/*
+ * Engine cleanup hook: only the qsort mode holds resources, the noop mode
+ * has nothing to release.
+ */
+static void fio_cpuio_cleanup(struct thread_data *td)
+{
+	struct cpu_options *opts = td->eo;
+
+	if (opts->cpumode == FIO_CPU_QSORT)
+		qsort_cleanup(td);
+}
+
static int fio_cpuio_open(struct thread_data fio_unused *td,
struct fio_file fio_unused *f)
{
}
static struct ioengine_ops ioengine = {
- .name = "cpuio",
- .version = FIO_IOOPS_VERSION,
- .queue = fio_cpuio_queue,
- .init = fio_cpuio_init,
- .open_file = fio_cpuio_open,
- .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+ .name = "cpuio",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_cpuio_queue,
+ .init = fio_cpuio_init,
+ .cleanup = fio_cpuio_cleanup,
+ .open_file = fio_cpuio_open,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
.options = options,
.option_struct_size = sizeof(struct cpu_options),
};
--- /dev/null
+/**
+ * FIO engine for DAOS File System (dfs).
+ *
+ * (C) Copyright 2020-2021 Intel Corporation.
+ */
+
+#include <fio.h>
+#include <optgroup.h>
+
+#include <daos.h>
+#include <daos_fs.h>
+
+/*
+ * State shared by every job using this engine.
+ * daos_initialized and num_threads are updated under daos_mutex in
+ * daos_fio_init() and daos_fio_cleanup(); the handles below are filled in
+ * by daos_fio_global_init() and released by daos_fio_global_cleanup().
+ */
+static bool daos_initialized;
+static int num_threads;
+static pthread_mutex_t daos_mutex = PTHREAD_MUTEX_INITIALIZER;
+daos_handle_t poh; /* pool handle */
+daos_handle_t coh; /* container handle */
+daos_oclass_id_t cid = OC_UNKNOWN; /* object class */
+dfs_t *dfs; /* dfs mount reference */
+
+/*
+ * Per-io_u engine data, allocated in daos_fio_io_u_init() and stored in
+ * io_u->engine_data. The scatter/gather fields are filled for every request
+ * in daos_fio_queue(); getevents maps a completed event back to this
+ * structure with container_of() on the ev member.
+ */
+struct daos_iou {
+ struct io_u *io_u;
+ daos_event_t ev;
+ d_sg_list_t sgl;
+ d_iov_t iov;
+ daos_size_t size;
+ bool complete;
+};
+
+/*
+ * Per-job engine data stored in td->io_ops_data.
+ * io_us holds num_ios (= iodepth) slots that getevents fills with completed
+ * io_us; queued counts the requests submitted and not yet reaped.
+ */
+struct daos_data {
+ daos_handle_t eqh;
+ dfs_obj_t *obj;
+ struct io_u **io_us;
+ int queued;
+ int num_ios;
+};
+
+/*
+ * Engine options, filled through the offsets listed in options[].
+ * svcl only exists when DAOS_API_VERSION_MAJOR is undefined or below 1.
+ */
+struct daos_fio_options {
+ void *pad;
+ char *pool; /* Pool UUID */
+ char *cont; /* Container UUID */
+ daos_size_t chsz; /* Chunk size */
+ char *oclass; /* object class */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ char *svcl; /* service replica list, deprecated */
+#endif
+};
+
+/*
+ * Engine specific options: pool, cont, chunk_size, object_class and, only
+ * when DAOS_API_VERSION_MAJOR is undefined or below 1, svcl.
+ * The table is terminated by an entry whose name is NULL.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "pool",
+ .lname = "pool uuid or label",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, pool),
+ .help = "DAOS pool uuid or label",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "cont",
+ .lname = "container uuid or label",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, cont),
+ .help = "DAOS container uuid or label",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "chunk_size",
+ .lname = "DFS chunk size",
+ .type = FIO_OPT_ULL,
+ .off1 = offsetof(struct daos_fio_options, chsz),
+ .help = "DFS chunk size in bytes",
+ .def = "0", /* use container default */
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "object_class",
+ .lname = "object class",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, oclass),
+ .help = "DAOS object class",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+ {
+ .name = "svcl",
+ .lname = "List of service ranks",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct daos_fio_options, svcl),
+ .help = "List of pool replicated service ranks",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_DFS,
+ },
+#endif
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Bring up the DAOS state shared by all jobs: initialise the library,
+ * connect to the pool, open the container, mount DFS on it and resolve the
+ * optional object class name.
+ *
+ * Which arguments the pool/container calls take depends on the DAOS API
+ * version the engine is built against (uuid + service rank list, uuid only,
+ * or the option string as given).
+ *
+ * Returns 0 on success, EINVAL for missing/unparsable options, otherwise
+ * the code returned by the failing DAOS/DFS call. Handles acquired before a
+ * failure are released in the reverse order of acquisition.
+ */
+static int daos_fio_global_init(struct thread_data *td)
+{
+	struct daos_fio_options *eo = td->eo;
+	daos_pool_info_t pool_info;
+	daos_cont_info_t co_info;
+	int rc = 0;
+
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+	if (!eo->pool || !eo->cont || !eo->svcl) {
+#else
+	if (!eo->pool || !eo->cont) {
+#endif
+		log_err("Missing required DAOS options\n");
+		return EINVAL;
+	}
+
+	rc = daos_init();
+	if (rc != -DER_ALREADY && rc) {
+		log_err("Failed to initialize daos %d\n", rc);
+		td_verror(td, rc, "daos_init");
+		return rc;
+	}
+
+#if !defined(DAOS_API_VERSION_MAJOR) || \
+	(DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
+	uuid_t pool_uuid, co_uuid;
+
+	rc = uuid_parse(eo->pool, pool_uuid);
+	if (rc) {
+		log_err("Failed to parse 'Pool uuid': %s\n", eo->pool);
+		td_verror(td, EINVAL, "uuid_parse(eo->pool)");
+		return EINVAL;
+	}
+
+	rc = uuid_parse(eo->cont, co_uuid);
+	if (rc) {
+		log_err("Failed to parse 'Cont uuid': %s\n", eo->cont);
+		td_verror(td, EINVAL, "uuid_parse(eo->cont)");
+		return EINVAL;
+	}
+#endif
+
+	/* Connect to the DAOS pool */
+#if !defined(DAOS_API_VERSION_MAJOR) || DAOS_API_VERSION_MAJOR < 1
+	d_rank_list_t *svcl = NULL;
+
+	svcl = daos_rank_list_parse(eo->svcl, ":");
+	if (svcl == NULL) {
+		log_err("Failed to parse svcl\n");
+		td_verror(td, EINVAL, "daos_rank_list_parse");
+		return EINVAL;
+	}
+
+	rc = daos_pool_connect(pool_uuid, NULL, svcl, DAOS_PC_RW,
+			       &poh, &pool_info, NULL);
+	d_rank_list_free(svcl);
+#elif (DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
+	rc = daos_pool_connect(pool_uuid, NULL, DAOS_PC_RW, &poh, &pool_info,
+			       NULL);
+#else
+	rc = daos_pool_connect(eo->pool, NULL, DAOS_PC_RW, &poh, &pool_info,
+			       NULL);
+#endif
+	if (rc) {
+		log_err("Failed to connect to pool %d\n", rc);
+		td_verror(td, rc, "daos_pool_connect");
+		return rc;
+	}
+
+	/* Open the DAOS container */
+#if !defined(DAOS_API_VERSION_MAJOR) || \
+	(DAOS_API_VERSION_MAJOR == 1 && DAOS_API_VERSION_MINOR < 3)
+	rc = daos_cont_open(poh, co_uuid, DAOS_COO_RW, &coh, &co_info, NULL);
+#else
+	rc = daos_cont_open(poh, eo->cont, DAOS_COO_RW, &coh, &co_info, NULL);
+#endif
+	if (rc) {
+		log_err("Failed to open container: %d\n", rc);
+		td_verror(td, rc, "daos_cont_open");
+		(void)daos_pool_disconnect(poh, NULL);
+		return rc;
+	}
+
+	/* Mount encapsulated filesystem */
+	rc = dfs_mount(poh, coh, O_RDWR, &dfs);
+	if (rc) {
+		log_err("Failed to mount DFS namespace: %d\n", rc);
+		td_verror(td, rc, "dfs_mount");
+		/*
+		 * Unwind in the same order as daos_fio_global_cleanup():
+		 * the container handle is closed while the pool handle it
+		 * was opened on is still connected.
+		 */
+		(void)daos_cont_close(coh, NULL);
+		(void)daos_pool_disconnect(poh, NULL);
+		return rc;
+	}
+
+	/* Retrieve object class to use, if specified */
+	if (eo->oclass)
+		cid = daos_oclass_name2id(eo->oclass);
+
+	return 0;
+}
+
+/*
+ * Tear down the shared DAOS state in the reverse order of
+ * daos_fio_global_init(): unmount DFS, close the container, disconnect
+ * from the pool and finalise the library.
+ *
+ * Every step is attempted even if an earlier one failed; each failure is
+ * logged and the first non-zero code is returned (0 if all succeeded).
+ */
+static int daos_fio_global_cleanup(void)
+{
+	int rc;
+	int ret = 0;
+
+	rc = dfs_umount(dfs);
+	if (rc) {
+		log_err("failed to umount dfs: %d\n", rc);
+		ret = rc;
+	}
+	rc = daos_cont_close(coh, NULL);
+	if (rc) {
+		log_err("failed to close container: %d\n", rc);
+		if (ret == 0)
+			ret = rc;
+	}
+	rc = daos_pool_disconnect(poh, NULL);
+	if (rc) {
+		log_err("failed to disconnect pool: %d\n", rc);
+		if (ret == 0)
+			ret = rc;
+	}
+	rc = daos_fini();
+	if (rc) {
+		log_err("failed to finalize daos: %d\n", rc);
+		if (ret == 0)
+			ret = rc;
+	}
+
+	return ret;
+}
+
+/*
+ * Engine setup hook. This engine has nothing to prepare at this stage, so
+ * the hook only reports success.
+ */
+static int daos_fio_setup(struct thread_data *td)
+{
+	const int status = 0;
+
+	return status;
+}
+
+/*
+ * Per-job engine init: allocate the job's daos_data and its completion
+ * array (iodepth slots), bring up the shared DAOS state if this is the first
+ * job, create the job's event queue and publish the result in
+ * td->io_ops_data.
+ *
+ * The whole function runs with daos_mutex held. Returns 0 on success,
+ * ENOMEM on allocation failure, otherwise the code returned by
+ * daos_fio_global_init() or daos_eq_create().
+ */
+static int daos_fio_init(struct thread_data *td)
+{
+ struct daos_data *dd;
+ int rc = 0;
+
+ pthread_mutex_lock(&daos_mutex);
+
+ dd = malloc(sizeof(*dd));
+ if (dd == NULL) {
+ log_err("Failed to allocate DAOS-private data\n");
+ rc = ENOMEM;
+ goto out;
+ }
+
+ dd->queued = 0;
+ dd->num_ios = td->o.iodepth;
+ dd->io_us = calloc(dd->num_ios, sizeof(struct io_u *));
+ if (dd->io_us == NULL) {
+ log_err("Failed to allocate IO queue\n");
+ rc = ENOMEM;
+ goto out;
+ }
+
+ /* initialize DAOS stack if not already up */
+ if (!daos_initialized) {
+ rc = daos_fio_global_init(td);
+ if (rc)
+ goto out;
+ daos_initialized = true;
+ }
+
+ rc = daos_eq_create(&dd->eqh);
+ if (rc) {
+ log_err("Failed to create event queue: %d\n", rc);
+ td_verror(td, rc, "daos_eq_create");
+ goto out;
+ }
+
+ /* success: this job now counts as a user of the shared DAOS state */
+ td->io_ops_data = dd;
+ num_threads++;
+out:
+ if (rc) {
+ /* dd is NULL only when its own allocation failed */
+ if (dd) {
+ free(dd->io_us);
+ free(dd);
+ }
+ /* no other job is using the shared state: tear it down again */
+ if (num_threads == 0 && daos_initialized) {
+ /* don't clobber error return value */
+ (void)daos_fio_global_cleanup();
+ daos_initialized = false;
+ }
+ }
+ pthread_mutex_unlock(&daos_mutex);
+ return rc;
+}
+
+/*
+ * Per-job engine cleanup: destroy the job's event queue, free its
+ * daos_data and, when the last job goes away, tear down the shared DAOS
+ * state. Does nothing if the job has no engine data.
+ *
+ * td->io_ops_data is not reset here and keeps pointing at the freed
+ * structure.
+ */
+static void daos_fio_cleanup(struct thread_data *td)
+{
+ struct daos_data *dd = td->io_ops_data;
+ int rc;
+
+ if (dd == NULL)
+ return;
+
+ rc = daos_eq_destroy(dd->eqh, DAOS_EQ_DESTROY_FORCE);
+ if (rc < 0) {
+ log_err("failed to destroy event queue: %d\n", rc);
+ td_verror(td, rc, "daos_eq_destroy");
+ }
+
+ free(dd->io_us);
+ free(dd);
+
+ /* the job count and the shared state are protected by daos_mutex */
+ pthread_mutex_lock(&daos_mutex);
+ num_threads--;
+ if (daos_initialized && num_threads == 0) {
+ int ret;
+
+ ret = daos_fio_global_cleanup();
+ /*
+ * Only report this error if the event queue teardown succeeded.
+ * NOTE(review): daos_fio_global_cleanup() forwards the first
+ * non-zero code of its callees unchanged; if any of them reports
+ * errors as positive values, the "ret < 0" test skips them -
+ * confirm the sign convention.
+ */
+ if (ret < 0 && rc == 0) {
+ log_err("failed to clean up: %d\n", ret);
+ td_verror(td, ret, "daos_fio_global_cleanup");
+ }
+ daos_initialized = false;
+ }
+ pthread_mutex_unlock(&daos_mutex);
+}
+
+/*
+ * Look up the size of @f with dfs_stat() and store it in
+ * f->real_file_size.
+ *
+ * Returns 0 on success, and also (without touching @f) when the shared DAOS
+ * state is not initialised yet; otherwise the dfs_stat() error code.
+ */
+static int daos_fio_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct stat st = {0};
+	int err;
+
+	dprint(FD_FILE, "dfs stat %s\n", f->file_name);
+
+	if (!daos_initialized)
+		return 0;
+
+	err = dfs_stat(dfs, NULL, f->file_name, &st);
+	if (!err) {
+		f->real_file_size = st.st_size;
+		return 0;
+	}
+
+	log_err("Failed to stat %s: %d\n", f->file_name, err);
+	td_verror(td, err, "dfs_stat");
+	return err;
+}
+
+/*
+ * Release the DFS object held by this job with dfs_release().
+ * Returns 0 on success, otherwise the dfs_release() error code.
+ */
+static int daos_fio_close(struct thread_data *td, struct fio_file *f)
+{
+	struct daos_data *data = td->io_ops_data;
+	int err;
+
+	dprint(FD_FILE, "dfs release %s\n", f->file_name);
+
+	err = dfs_release(data->obj);
+	if (!err)
+		return 0;
+
+	log_err("Failed to release %s: %d\n", f->file_name, err);
+	td_verror(td, err, "dfs_release");
+	return err;
+}
+
+/*
+ * Open (and possibly create) @f in the mounted DFS namespace and keep the
+ * resulting object in the job's daos_data.
+ *
+ * Open flags: O_CREAT when create_on_open and allow_create are both set;
+ * for write jobs O_RDWR unless read_only is set, plus O_CREAT when
+ * allow_create is set; O_RDONLY for read jobs.
+ *
+ * Returns 0 on success, otherwise the dfs_open() error code.
+ */
+static int daos_fio_open(struct thread_data *td, struct fio_file *f)
+{
+	struct daos_data *dd = td->io_ops_data;
+	struct daos_fio_options *eo = td->eo;
+	int flags = 0;
+	int rc;
+
+	/*
+	 * Logical AND: both operands are truth values. A bitwise '&' would
+	 * mask td_write()'s result against the 0/1 produced by '!' and could
+	 * report "r" for a writable open.
+	 */
+	dprint(FD_FILE, "dfs open %s (%s/%d/%d)\n",
+	       f->file_name, td_write(td) && !read_only ? "rw" : "r",
+	       td->o.create_on_open, td->o.allow_create);
+
+	if (td->o.create_on_open && td->o.allow_create)
+		flags |= O_CREAT;
+
+	if (td_write(td)) {
+		if (!read_only)
+			flags |= O_RDWR;
+		if (td->o.allow_create)
+			flags |= O_CREAT;
+	} else if (td_read(td)) {
+		flags |= O_RDONLY;
+	}
+
+	rc = dfs_open(dfs, NULL, f->file_name,
+		      S_IFREG | S_IRUSR | S_IWUSR,
+		      flags, cid, eo->chsz, NULL, &dd->obj);
+	if (rc) {
+		log_err("Failed to open %s: %d\n", f->file_name, rc);
+		td_verror(td, rc, "dfs_open");
+		return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * Remove @f from the mounted DFS namespace with dfs_remove().
+ * Returns 0 on success, otherwise the dfs_remove() error code.
+ */
+static int daos_fio_unlink(struct thread_data *td, struct fio_file *f)
+{
+	int err;
+
+	dprint(FD_FILE, "dfs remove %s\n", f->file_name);
+
+	err = dfs_remove(dfs, NULL, f->file_name, false, NULL);
+	if (!err)
+		return 0;
+
+	log_err("Failed to remove %s: %d\n", f->file_name, err);
+	td_verror(td, err, "dfs_remove");
+	return err;
+}
+
+/*
+ * Cache invalidation hook: only emits a debug trace and reports success.
+ */
+static int daos_fio_invalidate(struct thread_data *td, struct fio_file *f)
+{
+	const int status = 0;
+
+	dprint(FD_FILE, "dfs invalidate %s\n", f->file_name);
+
+	return status;
+}
+
+/*
+ * Free the engine data attached to @io_u by daos_fio_io_u_init(), if any,
+ * and detach it from the io_u.
+ */
+static void daos_fio_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	struct daos_iou *priv = io_u->engine_data;
+
+	if (!priv)
+		return;
+
+	io_u->engine_data = NULL;
+	free(priv);
+}
+
+/*
+ * Allocate the engine data for @io_u and link the two to each other.
+ * Only the io_u back pointer is set here; the remaining fields are filled
+ * in when the request is queued.
+ *
+ * Returns 0 on success, ENOMEM (after reporting it) on allocation failure.
+ */
+static int daos_fio_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	struct daos_iou *priv = malloc(sizeof(*priv));
+
+	if (priv == NULL) {
+		td_verror(td, ENOMEM, "malloc");
+		return ENOMEM;
+	}
+
+	priv->io_u = io_u;
+	io_u->engine_data = priv;
+
+	return 0;
+}
+
+/*
+ * Return the io_u stored in slot @event of the job's completion array,
+ * which is filled by daos_fio_getevents().
+ */
+static struct io_u * daos_fio_event(struct thread_data *td, int event)
+{
+	struct daos_data *data = td->io_ops_data;
+	struct io_u **completed = data->io_us;
+
+	return completed[event];
+}
+
+/*
+ * Reap completed requests from the job's event queue.
+ *
+ * Polls (without blocking inside daos_eq_poll()) until at least @min
+ * completions were collected, storing the completed io_us in dd->io_us[] for
+ * daos_fio_event(). Each poll only asks for the slots that are still free,
+ * so no more than @max completions are ever collected and dd->io_us[] is
+ * never written past index @max - 1.
+ *
+ * @t is not used. Returns the number of completions collected; if polling
+ * fails the error is reported through td_verror() and the completions
+ * gathered so far are returned.
+ */
+static int daos_fio_getevents(struct thread_data *td, unsigned int min,
+			      unsigned int max, const struct timespec *t)
+{
+	struct daos_data *dd = td->io_ops_data;
+	daos_event_t *evp[max];
+	unsigned int events = 0;
+	int i;
+	int rc;
+
+	while (events < min && events < max) {
+		/* only request what still fits into the caller's budget */
+		rc = daos_eq_poll(dd->eqh, 0, DAOS_EQ_NOWAIT, max - events,
+				  evp);
+		if (rc < 0) {
+			log_err("Event poll failed: %d\n", rc);
+			td_verror(td, rc, "daos_eq_poll");
+			return events;
+		}
+
+		for (i = 0; i < rc; i++) {
+			struct daos_iou *io;
+			struct io_u *io_u;
+
+			/* the event is embedded in the per-io_u engine data */
+			io = container_of(evp[i], struct daos_iou, ev);
+			if (io->complete)
+				log_err("Completion on already completed I/O\n");
+
+			io_u = io->io_u;
+			if (io->ev.ev_error)
+				io_u->error = io->ev.ev_error;
+			else
+				io_u->resid = 0;
+
+			dd->io_us[events] = io_u;
+			dd->queued--;
+			daos_event_fini(&io->ev);
+			io->complete = true;
+			events++;
+		}
+	}
+
+	dprint(FD_IO, "dfs eq_pool returning %u (%u/%u)\n", events, min, max);
+
+	return events;
+}
+
+/*
+ * Submit @io_u to DFS.
+ *
+ * Reads and writes are submitted asynchronously on a DAOS event attached to
+ * the job's event queue and complete later through daos_fio_getevents().
+ * Sync requests complete immediately with no error, any other direction
+ * completes immediately with -DER_INVAL; neither creates an event.
+ *
+ * Returns FIO_Q_BUSY when iodepth requests are already in flight,
+ * FIO_Q_QUEUED when the request was submitted, and FIO_Q_COMPLETED (with
+ * io_u->error set on failure) otherwise. If the submission itself fails, the
+ * event initialised for it is finalised again so that it does not stay
+ * attached to the event queue.
+ */
+static enum fio_q_status daos_fio_queue(struct thread_data *td,
+					struct io_u *io_u)
+{
+	struct daos_data *dd = td->io_ops_data;
+	struct daos_iou *io = io_u->engine_data;
+	daos_off_t offset = io_u->offset;
+	int rc;
+
+	if (dd->queued == td->o.iodepth)
+		return FIO_Q_BUSY;
+
+	/* requests that move no data are completed before any event exists */
+	switch (io_u->ddir) {
+	case DDIR_WRITE:
+	case DDIR_READ:
+		break;
+	case DDIR_SYNC:
+		io_u->error = 0;
+		return FIO_Q_COMPLETED;
+	default:
+		dprint(FD_IO, "Invalid IO type: %d\n", io_u->ddir);
+		io_u->error = -DER_INVAL;
+		return FIO_Q_COMPLETED;
+	}
+
+	/* single-segment scatter/gather list over the io_u buffer */
+	io->sgl.sg_nr = 1;
+	io->sgl.sg_nr_out = 0;
+	d_iov_set(&io->iov, io_u->xfer_buf, io_u->xfer_buflen);
+	io->sgl.sg_iovs = &io->iov;
+	io->size = io_u->xfer_buflen;
+
+	io->complete = false;
+	rc = daos_event_init(&io->ev, dd->eqh, NULL);
+	if (rc) {
+		log_err("Event init failed: %d\n", rc);
+		io_u->error = rc;
+		return FIO_Q_COMPLETED;
+	}
+
+	if (io_u->ddir == DDIR_WRITE) {
+		rc = dfs_write(dfs, dd->obj, &io->sgl, offset, &io->ev);
+		if (rc)
+			log_err("dfs_write failed: %d\n", rc);
+	} else {
+		rc = dfs_read(dfs, dd->obj, &io->sgl, offset, &io->size,
+			      &io->ev);
+		if (rc)
+			log_err("dfs_read failed: %d\n", rc);
+	}
+
+	if (rc) {
+		/* nothing was submitted: release the event again */
+		(void)daos_event_fini(&io->ev);
+		io_u->error = rc;
+		return FIO_Q_COMPLETED;
+	}
+
+	dd->queued++;
+	return FIO_Q_QUEUED;
+}
+
+/*
+ * Request preparation hook: nothing to do for this engine, the request is
+ * fully set up when it is queued.
+ */
+static int daos_fio_prep(struct thread_data fio_unused *td, struct io_u *io_u)
+{
+	const int status = 0;
+
+	return status;
+}
+
+/* ioengine_ops for get_ioengine() */
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "dfs",
+ .version = FIO_IOOPS_VERSION,
+ .flags = FIO_DISKLESSIO | FIO_NODISKUTIL,
+
+ .setup = daos_fio_setup,
+ .init = daos_fio_init,
+ .prep = daos_fio_prep,
+ .cleanup = daos_fio_cleanup,
+
+ .open_file = daos_fio_open,
+ .invalidate = daos_fio_invalidate,
+ .get_file_size = daos_fio_get_file_size,
+ .close_file = daos_fio_close,
+ .unlink_file = daos_fio_unlink,
+
+ .queue = daos_fio_queue,
+ .getevents = daos_fio_getevents,
+ .event = daos_fio_event,
+ .io_u_init = daos_fio_io_u_init,
+ .io_u_free = daos_fio_io_u_free,
+
+ .option_struct_size = sizeof(struct daos_fio_options),
+ .options = options,
+};
+
+/* Add the dfs engine to the list of known ioengines (fio_init hook). */
+static void fio_init fio_dfs_register(void)
+{
+	struct ioengine_ops *ops = &ioengine;
+
+	register_ioengine(ops);
+}
+
+/* Remove the dfs engine from the list again (fio_exit hook). */
+static void fio_exit fio_dfs_unregister(void)
+{
+	struct ioengine_ops *ops = &ioengine;
+
+	unregister_ioengine(ops);
+}
return 1;
}
- ed = malloc(sizeof(*ed));
+ ed = calloc(1, sizeof(*ed));
if (!ed) {
td_verror(td, ENOMEM, "io_queue_init");
return 1;
}
- memset(ed, 0 ,sizeof(*ed));
if (td->o.directory)
len = sprintf(donor_name, "%s/", td->o.directory);
--- /dev/null
+/*
+ * Exec engine
+ *
+ * Doesn't transfer any data, merely runs third-party tools
+ *
+ */
+#include "../fio.h"
+#include "../optgroup.h"
+#include <signal.h>
+
+/*
+ * Engine options plus the runtime state of the launched program.
+ * program, arguments, grace_time and std_redirect are filled through the
+ * offsets listed in options[]; pid is managed by the engine itself: -1 until
+ * the program has been forked, then the child's pid.
+ */
+struct exec_options {
+ void *pad;
+ char *program;
+ char *arguments;
+ int grace_time;
+ unsigned int std_redirect;
+ pid_t pid;
+};
+
+/*
+ * Engine specific options: the program to run, its arguments, the grace
+ * time (default "1") and whether stdout/stderr are redirected to files
+ * (default "1"). The table is terminated by an entry whose name is NULL.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "program",
+ .lname = "Program",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct exec_options, program),
+ .help = "Program to execute",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "arguments",
+ .lname = "Arguments",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct exec_options, arguments),
+ .help = "Arguments to pass",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "grace_time",
+ .lname = "Grace time",
+ .type = FIO_OPT_INT,
+ .minval = 0,
+ .def = "1",
+ .off1 = offsetof(struct exec_options, grace_time),
+ .help = "Grace time before sending a SIGKILL",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "std_redirect",
+ .lname = "Std redirect",
+ .type = FIO_OPT_BOOL,
+ .def = "1",
+ .off1 = offsetof(struct exec_options, std_redirect),
+ .help = "Redirect stdout & stderr to files",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * str_replace() - replace every occurrence of a substring.
+ *
+ * @orig: string to scan (not modified)
+ * @rep:  substring to look for; NULL or "" matches nothing
+ * @with: replacement text; NULL is treated as ""
+ *
+ * Returns a newly allocated string that the caller must free(): a copy of
+ * @orig with all non-overlapping occurrences of @rep replaced by @with (a
+ * plain copy if there is nothing to replace). Returns NULL if @orig is NULL
+ * or if the allocation fails.
+ *
+ * The result never aliases @orig, so callers can release it
+ * unconditionally.
+ */
+char *str_replace(char *orig, const char *rep, const char *with)
+{
+	size_t len_rep, len_with, len_front, count = 0;
+	const char *scan, *hit;
+	char *result, *out;
+
+	if (!orig)
+		return NULL;
+
+	len_rep = rep ? strlen(rep) : 0;
+	if (!with)
+		with = "";
+	len_with = strlen(with);
+
+	/* first pass: count the occurrences to size the result */
+	if (len_rep) {
+		for (scan = orig; (hit = strstr(scan, rep)); scan = hit + len_rep)
+			count++;
+	}
+
+	/* occurrences do not overlap, so count * len_rep <= strlen(orig) */
+	result = malloc(strlen(orig) - count * len_rep + count * len_with + 1);
+	if (!result)
+		return NULL;
+
+	/* second pass: copy the text in front of each hit, then @with */
+	out = result;
+	scan = orig;
+	while (count--) {
+		hit = strstr(scan, rep);
+		len_front = hit - scan;
+		memcpy(out, scan, len_front);
+		out += len_front;
+		memcpy(out, with, len_with);
+		out += len_with;
+		scan = hit + len_rep;
+	}
+
+	/* remainder after the last hit, including the terminating NUL */
+	strcpy(out, scan);
+	return result;
+}
+
+/*
+ * expand_variables() - expand the placeholders of the "arguments" option.
+ *
+ * %r is replaced by o->timeout / 1000000 (the runtime in seconds) and %n by
+ * the job name.
+ *
+ * Returns a string the caller must free(), or NULL if @arguments is NULL or
+ * the expansion could not be allocated. @arguments itself is never freed and
+ * never returned, whatever str_replace() hands back.
+ */
+char *expand_variables(struct thread_options *o, char *arguments)
+{
+	char str[16];
+	char *expanded_runtime, *expanded_name;
+
+	snprintf(str, sizeof(str), "%lld", o->timeout / 1000000);
+
+	/* %r is replaced by the runtime in seconds */
+	expanded_runtime = str_replace(arguments, "%r", str);
+
+	/* %n is replaced by the name of the running job */
+	expanded_name = str_replace(expanded_runtime, "%n", o->name);
+
+	/*
+	 * The intermediate string is only ours to free if it is a distinct
+	 * allocation: it may be the caller's buffer or the final result when
+	 * str_replace() returned its input.
+	 */
+	if (expanded_runtime != arguments && expanded_runtime != expanded_name)
+		free(expanded_runtime);
+
+	/* callers free() the result, so never hand them their own buffer */
+	if (expanded_name == arguments)
+		return NULL;
+
+	return expanded_name;
+}
+
+/*
+ * exec_background() - launch eo->program in a forked child.
+ *
+ * The command line is eo->program followed by eo->arguments after variable
+ * expansion, split on spaces. When eo->std_redirect is set, the child's
+ * stdout and stderr go to "<jobname>.stdout" and "<jobname>.stderr".
+ *
+ * Returns 0 in the parent once the child is forked, with eo->pid set to its
+ * pid, and -1 if the file names, the output files or the fork failed.
+ * The child never returns from this function: it either becomes the target
+ * program or terminates with _exit(), so a failed exec cannot fall back
+ * into the fio job loop.
+ */
+static int exec_background(struct thread_options *o, struct exec_options *eo)
+{
+	char *outfilename = NULL, *errfilename = NULL;
+	int outfd = -1, errfd = -1;
+	int ret = -1;
+	pid_t pid;
+	char *expanded_arguments = NULL;
+	/* For the arguments splitting */
+	char **arguments_array = NULL;
+	char *p;
+	char *exec_cmd = NULL;
+	size_t arguments_nb_items = 0, q;
+
+	if (asprintf(&outfilename, "%s.stdout", o->name) < 0)
+		return -1;
+
+	if (asprintf(&errfilename, "%s.stderr", o->name) < 0) {
+		free(outfilename);
+		return -1;
+	}
+
+	/* If we have variables in the arguments, let's expand them */
+	expanded_arguments = expand_variables(o, eo->arguments);
+
+	if (eo->std_redirect) {
+		log_info("%s : Saving output of %s %s : stdout=%s stderr=%s\n",
+			 o->name, eo->program, expanded_arguments, outfilename,
+			 errfilename);
+
+		/* Creating the stderr & stdout output files */
+		outfd = open(outfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+		if (outfd < 0) {
+			log_err("fio: cannot open output file %s : %s\n",
+				outfilename, strerror(errno));
+			goto out;
+		}
+
+		errfd = open(errfilename, O_CREAT | O_WRONLY | O_TRUNC, 0644);
+		if (errfd < 0) {
+			log_err("fio: cannot open output file %s : %s\n",
+				errfilename, strerror(errno));
+			goto out;
+		}
+	} else {
+		log_info("%s : Running %s %s\n",
+			 o->name, eo->program, expanded_arguments);
+	}
+
+	pid = fork();
+
+	/* If the fork failed */
+	if (pid < 0) {
+		log_err("fio: forking failed %s \n", strerror(errno));
+		goto out;
+	}
+
+	/* We are on the control thread (parent side of the fork) */
+	if (pid > 0) {
+		eo->pid = pid;
+		ret = 0;
+		/* the output files are only needed by the child */
+		goto out;
+	}
+
+	/*
+	 * We are in the worker (child side of the fork). From here on the
+	 * only ways out are execvp() and _exit().
+	 */
+	if (eo->std_redirect) {
+		/* replace stdout by the output file we create */
+		dup2(outfd, 1);
+		/* replace stderr by the output file we create */
+		dup2(errfd, 2);
+		close(outfd);
+		close(errfd);
+	}
+
+	/*
+	 * Let's split the command line into a null terminated array to
+	 * be passed to the exec'd program.
+	 * But don't asprintf expanded_arguments if NULL as it would be
+	 * converted to a '(null)' argument, while we want no arguments
+	 * at all.
+	 */
+	if (expanded_arguments != NULL) {
+		if (asprintf(&exec_cmd, "%s %s", eo->program,
+			     expanded_arguments) < 0)
+			_exit(EXIT_FAILURE);
+	} else {
+		if (asprintf(&exec_cmd, "%s", eo->program) < 0)
+			_exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Let's build an argv array based on the program name and
+	 * arguments
+	 */
+	p = exec_cmd;
+	for (;;) {
+		char **grown;
+
+		p += strspn(p, " ");
+
+		q = strcspn(p, " ");
+		if (!q)
+			break;
+
+		/* one slot for this item, one spare for the final NULL */
+		grown = realloc(arguments_array,
+				(arguments_nb_items + 2) * sizeof(char *));
+		if (!grown)
+			_exit(EXIT_FAILURE);
+		arguments_array = grown;
+
+		arguments_array[arguments_nb_items] = malloc(q + 1);
+		if (!arguments_array[arguments_nb_items])
+			_exit(EXIT_FAILURE);
+
+		memcpy(arguments_array[arguments_nb_items], p, q);
+		arguments_array[arguments_nb_items][q] = 0;
+		arguments_nb_items++;
+		p += q;
+	}
+
+	/* a program made of blanks only leaves nothing to execute */
+	if (!arguments_nb_items) {
+		log_err("fio: no program to execute\n");
+		_exit(EXIT_FAILURE);
+	}
+
+	/* Adding a null-terminated item to close the list */
+	arguments_array[arguments_nb_items] = NULL;
+
+	/*
+	 * Replace the fio program from the child fork by the target
+	 * program
+	 */
+	execvp(arguments_array[0], arguments_array);
+
+	/* execvp() only returns on failure */
+	log_err("fio: cannot execute %s : %s\n", arguments_array[0],
+		strerror(errno));
+	_exit(EXIT_FAILURE);
+
+out:
+	if (outfd >= 0)
+		close(outfd);
+	if (errfd >= 0)
+		close(errfd);
+	free(outfilename);
+	free(errfilename);
+	free(expanded_arguments);
+	return ret;
+}
+
+/*
+ * Queue hook. The first call launches the program; every later call sleeps
+ * for the thinktime and, once the job ran longer than its timeout, sends
+ * SIGTERM to the program and waits grace_time seconds.
+ * Always returns FIO_Q_COMPLETED.
+ */
+static enum fio_q_status
+fio_exec_queue(struct thread_data *td, struct io_u fio_unused * io_u)
+{
+	struct exec_options *eo = td->eo;
+	struct thread_options *o = &td->o;
+
+	/* Let's execute the program the first time we get queued */
+	if (eo->pid == -1) {
+		/*
+		 * NOTE(review): the result is not checked; after a failed
+		 * launch pid is still -1 and the next call tries again.
+		 */
+		exec_background(o, eo);
+		return FIO_Q_COMPLETED;
+	}
+
+	/*
+	 * The program is running in background, let's check on a regular
+	 * basis if the time is over and if we need to stop the tool
+	 */
+	usleep(o->thinktime);
+	if (utime_since_now(&td->start) <= o->timeout)
+		return FIO_Q_COMPLETED;
+
+	/* Let's stop the child */
+	kill(eo->pid, SIGTERM);
+
+	/*
+	 * Let's give grace_time (1 sec by default) to the 3rd party tool to
+	 * stop
+	 */
+	sleep(eo->grace_time);
+
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * Engine init hook: mark the program as not started, check that a program
+ * was given, and set the job up so that the queue hook is polled every
+ * 50ms. Returns 0 on success, 1 if no program is defined.
+ */
+static int fio_exec_init(struct thread_data *td)
+{
+	struct exec_options *eo = td->eo;
+	struct thread_options *o = &td->o;
+	int saved_state;
+
+	eo->pid = -1;
+
+	if (eo->program == NULL) {
+		td_vmsg(td, EINVAL,
+			"no program is defined, it is mandatory to define one",
+			"exec");
+		return 1;
+	}
+
+	log_info("%s : program=%s, arguments=%s\n",
+		 td->o.name, eo->program, eo->arguments);
+
+	/*
+	 * Report that the engine is being prepared while the options below
+	 * are adjusted, then put the previous state back.
+	 */
+	saved_state = td->runstate;
+	td_set_runstate(td, TD_SETTING_UP);
+
+	/*
+	 * set thinktime_sleep and thinktime_spin appropriately
+	 */
+	o->thinktime_blocks = 1;
+	o->thinktime_blocks_type = THINKTIME_BLOCKS_TYPE_COMPLETE;
+	o->thinktime_spin = 0;
+	/* 50ms pause when waiting for the program to complete */
+	o->thinktime = 50000;
+
+	o->nr_files = o->open_files = 1;
+
+	td_set_runstate(td, saved_state);
+	return 0;
+}
+
+/*
+ * Engine cleanup hook: if a program was started, send it SIGKILL to ensure
+ * the job is well terminated.
+ */
+static void fio_exec_cleanup(struct thread_data *td)
+{
+	struct exec_options *eo = td->eo;
+
+	if (eo->pid <= 0)
+		return;
+
+	kill(eo->pid, SIGKILL);
+}
+
+/*
+ * File open hook: the engine does not touch any file, so there is nothing
+ * to open.
+ */
+static int
+fio_exec_open(struct thread_data fio_unused * td,
+	      struct fio_file fio_unused * f)
+{
+	const int status = 0;
+
+	return status;
+}
+
+/*
+ * Hook table of the "exec" engine: the program is launched and supervised
+ * from the queue hook and killed in the cleanup hook.
+ */
+static struct ioengine_ops ioengine = {
+ .name = "exec",
+ .version = FIO_IOOPS_VERSION,
+ .queue = fio_exec_queue,
+ .init = fio_exec_init,
+ .cleanup = fio_exec_cleanup,
+ .open_file = fio_exec_open,
+ .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOIO,
+ .options = options,
+ .option_struct_size = sizeof(struct exec_options),
+};
+
+/* Add the exec engine to the list of known ioengines (fio_init hook). */
+static void fio_init fio_exec_register(void)
+{
+	struct ioengine_ops *ops = &ioengine;
+
+	register_ioengine(ops);
+}
+
+/* Remove the exec engine from the list again (fio_exit hook). */
+static void fio_exit fio_exec_unregister(void)
+{
+	struct ioengine_ops *ops = &ioengine;
+
+	unregister_ioengine(ops);
+}
dprint(FD_FILE, "fd open %s\n", f->file_name);
- if (f->filetype != FIO_TYPE_FILE) {
- log_err("fio: only files are supported fallocate \n");
+ if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK) {
+ log_err("fio: only files and blockdev are supported fallocate \n");
return 1;
}
if (!strcmp(f->file_name, "-")) {
+++ /dev/null
-/*
- * filecreate engine
- *
- * IO engine that doesn't do any IO, just creates files and tracks the latency
- * of the file creation.
- */
-#include <stdio.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include "../fio.h"
-
-struct fc_data {
- enum fio_ddir stat_ddir;
-};
-
-static int open_file(struct thread_data *td, struct fio_file *f)
-{
- struct timespec start;
- int do_lat = !td->o.disable_lat;
-
- dprint(FD_FILE, "fd open %s\n", f->file_name);
-
- if (f->filetype != FIO_TYPE_FILE) {
- log_err("fio: only files are supported fallocate \n");
- return 1;
- }
- if (!strcmp(f->file_name, "-")) {
- log_err("fio: can't read/write to stdin/out\n");
- return 1;
- }
-
- if (do_lat)
- fio_gettime(&start, NULL);
-
- f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600);
-
- if (f->fd == -1) {
- char buf[FIO_VERROR_SIZE];
- int e = errno;
-
- snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
- td_verror(td, e, buf);
- return 1;
- }
-
- if (do_lat) {
- struct fc_data *data = td->io_ops_data;
- uint64_t nsec;
-
- nsec = ntime_since_now(&start);
- add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
- }
-
- return 0;
-}
-
-static enum fio_q_status queue_io(struct thread_data *td,
- struct io_u fio_unused *io_u)
-{
- return FIO_Q_COMPLETED;
-}
-
-/*
- * Ensure that we at least have a block size worth of IO to do for each
- * file. If the job file has td->o.size < nr_files * block_size, then
- * fio won't do anything.
- */
-static int get_file_size(struct thread_data *td, struct fio_file *f)
-{
- f->real_file_size = td_min_bs(td);
- return 0;
-}
-
-static int init(struct thread_data *td)
-{
- struct fc_data *data;
-
- data = calloc(1, sizeof(*data));
-
- if (td_read(td))
- data->stat_ddir = DDIR_READ;
- else if (td_write(td))
- data->stat_ddir = DDIR_WRITE;
-
- td->io_ops_data = data;
- return 0;
-}
-
-static void cleanup(struct thread_data *td)
-{
- struct fc_data *data = td->io_ops_data;
-
- free(data);
-}
-
-static struct ioengine_ops ioengine = {
- .name = "filecreate",
- .version = FIO_IOOPS_VERSION,
- .init = init,
- .cleanup = cleanup,
- .queue = queue_io,
- .get_file_size = get_file_size,
- .open_file = open_file,
- .close_file = generic_close_file,
- .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
- FIO_NOSTATS | FIO_NOFILEHASH,
-};
-
-static void fio_init fio_filecreate_register(void)
-{
- register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_filecreate_unregister(void)
-{
- unregister_ioengine(&ioengine);
-}
--- /dev/null
+/*
+ * file/directory operations engine
+ *
+ * IO engine that doesn't do any IO, just operates on files/directories
+ * and tracks the latency of each operation.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../oslib/statx.h"
+
+/*
+ * Kind of filesystem object an engine instance works on. init() derives it
+ * from the ioengine name prefix ("file..." or "dir...").
+ */
+enum fio_engine {
+ UNKNOWN_OP_ENGINE = 0, /* name matched neither prefix */
+ FILE_OP_ENGINE = 1, /* regular files: open/unlink */
+ DIR_OP_ENGINE = 2, /* directories: mkdir/rmdir */
+};
+
+/* Per-thread engine state, allocated in init() and stored in td->io_ops_data. */
+struct fc_data {
+ enum fio_ddir stat_ddir; /* data direction the latency samples are booked under */
+ enum fio_engine op_engine; /* file or directory flavour of the running engine */
+};
+
+/* Engine-private options for the stat engines, reached through td->eo. */
+struct filestat_options {
+ void *pad; /* unused here; presumably a slot reserved by fio's option code -- TODO confirm */
+ unsigned int stat_type; /* one of the FIO_FILESTAT_* values */
+};
+
+/* Values of the stat_type option: which call stat_file() issues. */
+enum {
+ FIO_FILESTAT_STAT = 1, /* stat() */
+ FIO_FILESTAT_LSTAT = 2, /* lstat() */
+ FIO_FILESTAT_STATX = 3, /* statx(); fails on WIN32 builds */
+};
+
+/*
+ * Option table shared by the filestat and dirstat engines. It holds a single
+ * option, stat_type, stored in struct filestat_options and defaulting to
+ * "stat". The entry with a NULL name terminates the table.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "stat_type",
+ .lname = "stat_type",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct filestat_options, stat_type),
+ .help = "Specify stat system call type to measure lookup/getattr performance",
+ .def = "stat",
+ .posval = {
+ { .ival = "stat",
+ .oval = FIO_FILESTAT_STAT,
+ .help = "Use stat(2)",
+ },
+ { .ival = "lstat",
+ .oval = FIO_FILESTAT_LSTAT,
+ .help = "Use lstat(2)",
+ },
+ { .ival = "statx",
+ .oval = FIO_FILESTAT_STATX,
+ .help = "Use statx(2) if exists",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_FILESTAT,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Setup hook for the directory stat/delete engines: make sure a directory
+ * exists for every file of the job before the measured phase starts.
+ * A directory that is already present is not treated as an error.
+ *
+ * Returns 0 when all directories exist, otherwise the non-zero mkdir
+ * result of the first failure.
+ */
+static int setup_dirs(struct thread_data *td)
+{
+	struct fio_file *f;
+	int i;
+
+	for_each_file(td, f, i) {
+		int err;
+
+		dprint(FD_FILE, "setup directory %s\n", f->file_name);
+		err = fio_mkdir(f->file_name, 0700);
+		if (err && errno != EEXIST) {
+			log_err("create directory %s failed with %d\n",
+				f->file_name, errno);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * "Open" hook of the filecreate and dircreate engines: create the file
+ * (open with O_CREAT) or the directory (mkdir) named by @f and, unless
+ * latency accounting is disabled, book the duration of that call as a
+ * completion-latency sample.
+ *
+ * Returns 0 on success and 1 on failure: unsupported file type, "-" as
+ * file name, unknown engine kind, or a failing create call (the latter is
+ * also reported through td_verror()).
+ */
+static int open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fc_data *fcd = td->io_ops_data;
+	int do_lat = !td->o.disable_lat;
+	struct timespec start;
+
+	dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (do_lat)
+		fio_gettime(&start, NULL);
+
+	if (fcd->op_engine == FILE_OP_ENGINE) {
+		f->fd = open(f->file_name, O_CREAT|O_RDWR, 0600);
+	} else if (fcd->op_engine == DIR_OP_ENGINE) {
+		/*
+		 * The mode argument of mkdir carries permission bits. S_IFDIR
+		 * is a file-type bit and contains none, so request 0700 as
+		 * setup_dirs() does.
+		 * NOTE(review): on success the mkdir result (0) ends up in
+		 * f->fd -- confirm the close hook copes with that.
+		 */
+		f->fd = fio_mkdir(f->file_name, 0700);
+	} else {
+		log_err("fio: unknown file/directory operation engine\n");
+		return 1;
+	}
+
+	if (f->fd == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int e = errno;
+
+		snprintf(buf, sizeof(buf), "open(%s)", f->file_name);
+		td_verror(td, e, buf);
+		return 1;
+	}
+
+	if (do_lat) {
+		uint64_t nsec = ntime_since_now(&start);
+
+		add_clat_sample(td, fcd->stat_ddir, nsec, 0, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * "Open" hook of the filestat and dirstat engines: issue the stat call
+ * selected by the stat_type option on @f and, unless latency accounting is
+ * disabled, book its duration as a completion-latency sample. Nothing is
+ * actually opened.
+ *
+ * Returns 0 on success and 1 on failure; a failing stat call is reported
+ * through td_verror() with the errno that describes it.
+ */
+static int stat_file(struct thread_data *td, struct fio_file *f)
+{
+	struct filestat_options *o = td->eo;
+	struct timespec start;
+	int do_lat = !td->o.disable_lat;
+	struct stat statbuf;
+#ifndef WIN32
+	struct statx statxbuf;
+	char *abspath;
+	int saved_errno;
+#endif
+	int ret;
+
+	dprint(FD_FILE, "fd stat %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (do_lat)
+		fio_gettime(&start, NULL);
+
+	switch (o->stat_type) {
+	case FIO_FILESTAT_STAT:
+		ret = stat(f->file_name, &statbuf);
+		break;
+	case FIO_FILESTAT_LSTAT:
+		ret = lstat(f->file_name, &statbuf);
+		break;
+	case FIO_FILESTAT_STATX:
+#ifndef WIN32
+		abspath = realpath(f->file_name, NULL);
+		if (abspath) {
+			ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf);
+			/* keep statx's errno across free() for the report below */
+			saved_errno = errno;
+			free(abspath);
+			errno = saved_errno;
+		} else
+			ret = -1;
+#else
+		/* no statx in this build: say so instead of reporting a stale errno */
+		errno = ENOSYS;
+		ret = -1;
+#endif
+		break;
+	default:
+		/* unknown stat_type: no call was made, so set errno ourselves */
+		errno = EINVAL;
+		ret = -1;
+		break;
+	}
+
+	if (ret == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int e = errno;
+
+		snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name,
+			o->stat_type);
+		td_verror(td, e, buf);
+		return 1;
+	}
+
+	if (do_lat) {
+		struct fc_data *data = td->io_ops_data;
+		uint64_t nsec;
+
+		nsec = ntime_since_now(&start);
+		add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0, 0);
+	}
+
+	return 0;
+}
+
+/*
+ * "Open" hook of the filedelete and dirdelete engines: remove the file
+ * (unlink) or directory (rmdir) named by @f and, unless latency accounting
+ * is disabled, book the duration of the removal as a completion-latency
+ * sample.
+ *
+ * Returns 0 on success and 1 on failure; a failing removal is reported
+ * through td_verror().
+ */
+static int delete_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fc_data *fcd = td->io_ops_data;
+	const int want_lat = !td->o.disable_lat;
+	struct timespec t0;
+	int rc;
+
+	dprint(FD_FILE, "fd delete %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported\n");
+		return 1;
+	}
+	if (strcmp(f->file_name, "-") == 0) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+	if (want_lat)
+		fio_gettime(&t0, NULL);
+
+	switch (fcd->op_engine) {
+	case FILE_OP_ENGINE:
+		rc = unlink(f->file_name);
+		break;
+	case DIR_OP_ENGINE:
+		rc = rmdir(f->file_name);
+		break;
+	default:
+		log_err("fio: unknown file/directory operation engine\n");
+		return 1;
+	}
+
+	if (rc == -1) {
+		char msg[FIO_VERROR_SIZE];
+		int saved = errno;
+
+		snprintf(msg, sizeof(msg), "delete(%s)", f->file_name);
+		td_verror(td, saved, msg);
+		return 1;
+	}
+
+	if (want_lat)
+		add_clat_sample(td, fcd->stat_ddir, ntime_since_now(&t0),
+				0, 0, 0, 0);
+
+	return 0;
+}
+
+/*
+ * Invalidate hook for the engines that never open the file (stat/delete):
+ * it does nothing and always returns 0.
+ */
+static int invalidate_do_nothing(struct thread_data *td, struct fio_file *f)
+{
+ /* do nothing because file not opened */
+ return 0;
+}
+
+/*
+ * Queue hook: these engines perform no data IO, so every io_u is reported
+ * as completed right away. The measured work happens in the open hook.
+ */
+static enum fio_q_status queue_io(struct thread_data *td, struct io_u *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+/*
+ * Ensure that we at least have a block size worth of IO to do for each
+ * file. If the job file has td->o.size < nr_files * block_size, then
+ * fio won't do anything.
+ *
+ * Sets f->real_file_size to td_min_bs(td) and always returns 0.
+ */
+static int get_file_size(struct thread_data *td, struct fio_file *f)
+{
+ f->real_file_size = td_min_bs(td);
+ return 0;
+}
+
+/*
+ * Engine init hook: allocate the per-thread fc_data, record which data
+ * direction latency samples are booked under (read or write, per the job)
+ * and derive the file/directory flavour from the ioengine name prefix.
+ *
+ * Returns 0 on success, or 1 if the state could not be allocated.
+ */
+static int init(struct thread_data *td)
+{
+	struct fc_data *data;
+
+	data = calloc(1, sizeof(*data));
+	if (!data) {
+		/* the fields below would otherwise be written through NULL */
+		td_verror(td, ENOMEM, "calloc");
+		return 1;
+	}
+
+	if (td_read(td))
+		data->stat_ddir = DDIR_READ;
+	else if (td_write(td))
+		data->stat_ddir = DDIR_WRITE;
+
+	data->op_engine = UNKNOWN_OP_ENGINE;
+
+	/* "filecreate", "filestat", "filedelete" */
+	if (!strncmp(td->o.ioengine, "file", 4)) {
+		data->op_engine = FILE_OP_ENGINE;
+		dprint(FD_FILE, "Operate engine type: file\n");
+	}
+	/* "dircreate", "dirstat", "dirdelete" */
+	if (!strncmp(td->o.ioengine, "dir", 3)) {
+		data->op_engine = DIR_OP_ENGINE;
+		dprint(FD_FILE, "Operate engine type: directory\n");
+	}
+
+	td->io_ops_data = data;
+	return 0;
+}
+
+/* Engine cleanup hook: release the per-thread state allocated by init(). */
+static void cleanup(struct thread_data *td)
+{
+	free(td->io_ops_data);
+}
+
+/*
+ * unlink_file hook of the directory engines: remove the directory named by
+ * @f with rmdir and return rmdir's result unchanged.
+ */
+static int remove_dir(struct thread_data *td, struct fio_file *f)
+{
+ dprint(FD_FILE, "remove directory %s\n", f->file_name);
+ return rmdir(f->file_name);
+}
+
+/*
+ * "filecreate": the measured operation is the creating open() done in
+ * open_file(); the descriptor is closed again via generic_close_file.
+ */
+static struct ioengine_ops ioengine_filecreate = {
+ .name = "filecreate",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = get_file_size,
+ .open_file = open_file,
+ .close_file = generic_close_file,
+ .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+/*
+ * "filestat": the measured operation is the stat call made by stat_file(),
+ * selected through the stat_type option. No close hook is set and the
+ * invalidate hook is a no-op, since nothing is opened.
+ */
+static struct ioengine_ops ioengine_filestat = {
+ .name = "filestat",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .invalidate = invalidate_do_nothing,
+ .get_file_size = generic_get_file_size,
+ .open_file = stat_file,
+ .flags = FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+ .options = options,
+ .option_struct_size = sizeof(struct filestat_options),
+};
+
+/*
+ * "filedelete": the measured operation is the unlink done by delete_file().
+ * No close hook is set and the invalidate hook is a no-op, since nothing is
+ * opened.
+ */
+static struct ioengine_ops ioengine_filedelete = {
+ .name = "filedelete",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .invalidate = invalidate_do_nothing,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = generic_get_file_size,
+ .open_file = delete_file,
+ .flags = FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+/*
+ * "dircreate": the measured operation is the mkdir done by open_file() in
+ * its directory flavour; remove_dir is installed as the unlink hook.
+ */
+static struct ioengine_ops ioengine_dircreate = {
+ .name = "dircreate",
+ .version = FIO_IOOPS_VERSION,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = get_file_size,
+ .open_file = open_file,
+ .close_file = generic_close_file,
+ .unlink_file = remove_dir,
+ .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+/*
+ * "dirstat": setup_dirs() creates the directories up front, then the
+ * measured operation is the stat call made by stat_file() (stat_type
+ * option). remove_dir is installed as the unlink hook.
+ */
+static struct ioengine_ops ioengine_dirstat = {
+ .name = "dirstat",
+ .version = FIO_IOOPS_VERSION,
+ .setup = setup_dirs,
+ .init = init,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .invalidate = invalidate_do_nothing,
+ .get_file_size = generic_get_file_size,
+ .open_file = stat_file,
+ .unlink_file = remove_dir,
+ .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+ .options = options,
+ .option_struct_size = sizeof(struct filestat_options),
+};
+
+/*
+ * "dirdelete": setup_dirs() creates the directories up front, then the
+ * measured operation is the rmdir done by delete_file() in its directory
+ * flavour.
+ */
+static struct ioengine_ops ioengine_dirdelete = {
+ .name = "dirdelete",
+ .version = FIO_IOOPS_VERSION,
+ .setup = setup_dirs,
+ .init = init,
+ .invalidate = invalidate_do_nothing,
+ .cleanup = cleanup,
+ .queue = queue_io,
+ .get_file_size = get_file_size,
+ .open_file = delete_file,
+ .unlink_file = remove_dir,
+ .flags = FIO_DISKLESSIO | FIO_SYNCIO | FIO_FAKEIO |
+ FIO_NOSTATS | FIO_NOFILEHASH,
+};
+
+/*
+ * Register all six file/directory operation engines with fio, in the
+ * order create, stat, delete for files and then for directories.
+ */
+static void fio_init fio_fileoperations_register(void)
+{
+	static struct ioengine_ops *const engines[] = {
+		&ioengine_filecreate,
+		&ioengine_filestat,
+		&ioengine_filedelete,
+		&ioengine_dircreate,
+		&ioengine_dirstat,
+		&ioengine_dirdelete,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(engines) / sizeof(engines[0]); i++)
+		register_ioengine(engines[i]);
+}
+
+/*
+ * Unregister all six file/directory operation engines, in the same order
+ * in which they were registered.
+ */
+static void fio_exit fio_fileoperations_unregister(void)
+{
+	static struct ioengine_ops *const engines[] = {
+		&ioengine_filecreate,
+		&ioengine_filestat,
+		&ioengine_filedelete,
+		&ioengine_dircreate,
+		&ioengine_dirstat,
+		&ioengine_dirdelete,
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(engines) / sizeof(engines[0]); i++)
+		unregister_ioengine(engines[i]);
+}
+++ /dev/null
-/*
- * filestat engine
- *
- * IO engine that doesn't do any IO, just stat files and tracks the latency
- * of the file stat.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include "../fio.h"
-#include "../optgroup.h"
-#include "../oslib/statx.h"
-
-struct fc_data {
- enum fio_ddir stat_ddir;
-};
-
-struct filestat_options {
- void *pad;
- unsigned int stat_type;
-};
-
-enum {
- FIO_FILESTAT_STAT = 1,
- FIO_FILESTAT_LSTAT = 2,
- FIO_FILESTAT_STATX = 3,
-};
-
-static struct fio_option options[] = {
- {
- .name = "stat_type",
- .lname = "stat_type",
- .type = FIO_OPT_STR,
- .off1 = offsetof(struct filestat_options, stat_type),
- .help = "Specify stat system call type to measure lookup/getattr performance",
- .def = "stat",
- .posval = {
- { .ival = "stat",
- .oval = FIO_FILESTAT_STAT,
- .help = "Use stat(2)",
- },
- { .ival = "lstat",
- .oval = FIO_FILESTAT_LSTAT,
- .help = "Use lstat(2)",
- },
- { .ival = "statx",
- .oval = FIO_FILESTAT_STATX,
- .help = "Use statx(2) if exists",
- },
- },
- .category = FIO_OPT_C_ENGINE,
- .group = FIO_OPT_G_FILESTAT,
- },
- {
- .name = NULL,
- },
-};
-
-static int stat_file(struct thread_data *td, struct fio_file *f)
-{
- struct filestat_options *o = td->eo;
- struct timespec start;
- int do_lat = !td->o.disable_lat;
- struct stat statbuf;
-#ifndef WIN32
- struct statx statxbuf;
- char *abspath;
-#endif
- int ret;
-
- dprint(FD_FILE, "fd stat %s\n", f->file_name);
-
- if (f->filetype != FIO_TYPE_FILE) {
- log_err("fio: only files are supported\n");
- return 1;
- }
- if (!strcmp(f->file_name, "-")) {
- log_err("fio: can't read/write to stdin/out\n");
- return 1;
- }
-
- if (do_lat)
- fio_gettime(&start, NULL);
-
- switch (o->stat_type){
- case FIO_FILESTAT_STAT:
- ret = stat(f->file_name, &statbuf);
- break;
- case FIO_FILESTAT_LSTAT:
- ret = lstat(f->file_name, &statbuf);
- break;
- case FIO_FILESTAT_STATX:
-#ifndef WIN32
- abspath = realpath(f->file_name, NULL);
- if (abspath) {
- ret = statx(-1, abspath, 0, STATX_ALL, &statxbuf);
- free(abspath);
- } else
- ret = -1;
-#else
- ret = -1;
-#endif
- break;
- default:
- ret = -1;
- break;
- }
-
- if (ret == -1) {
- char buf[FIO_VERROR_SIZE];
- int e = errno;
-
- snprintf(buf, sizeof(buf), "stat(%s) type=%u", f->file_name,
- o->stat_type);
- td_verror(td, e, buf);
- return 1;
- }
-
- if (do_lat) {
- struct fc_data *data = td->io_ops_data;
- uint64_t nsec;
-
- nsec = ntime_since_now(&start);
- add_clat_sample(td, data->stat_ddir, nsec, 0, 0, 0);
- }
-
- return 0;
-}
-
-static enum fio_q_status queue_io(struct thread_data *td, struct io_u fio_unused *io_u)
-{
- return FIO_Q_COMPLETED;
-}
-
-static int init(struct thread_data *td)
-{
- struct fc_data *data;
-
- data = calloc(1, sizeof(*data));
-
- if (td_read(td))
- data->stat_ddir = DDIR_READ;
- else if (td_write(td))
- data->stat_ddir = DDIR_WRITE;
-
- td->io_ops_data = data;
- return 0;
-}
-
-static void cleanup(struct thread_data *td)
-{
- struct fc_data *data = td->io_ops_data;
-
- free(data);
-}
-
-static int stat_invalidate(struct thread_data *td, struct fio_file *f)
-{
- /* do nothing because file not opened */
- return 0;
-}
-
-static struct ioengine_ops ioengine = {
- .name = "filestat",
- .version = FIO_IOOPS_VERSION,
- .init = init,
- .cleanup = cleanup,
- .queue = queue_io,
- .invalidate = stat_invalidate,
- .get_file_size = generic_get_file_size,
- .open_file = stat_file,
- .flags = FIO_SYNCIO | FIO_FAKEIO |
- FIO_NOSTATS | FIO_NOFILEHASH,
- .options = options,
- .option_struct_size = sizeof(struct filestat_options),
-};
-
-static void fio_init fio_filestat_register(void)
-{
- register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_filestat_unregister(void)
-{
- unregister_ioengine(&ioengine);
-}
if (td->o.odirect)
flags |= OS_O_DIRECT;
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
dprint(FD_FILE, "fio file %s open mode %s td rw %s\n", f->file_name,
flags & O_RDONLY ? "ro" : "rw", td_read(td) ? "read" : "write");
#include "fio.h"
#include "../optgroup.h"
+/*
+ * Silence OpenSSL 3.0 deprecated function warnings
+ */
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
enum {
FIO_HTTP_WEBDAV = 0,
char *s3_key;
char *s3_keyid;
char *s3_region;
+ char *s3_sse_customer_key;
+ char *s3_sse_customer_algorithm;
+ char *s3_storage_class;
char *swift_auth_token;
int verbose;
unsigned int mode;
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_HTTP,
},
+ {
+ .name = "http_s3_sse_customer_key",
+ .lname = "SSE Customer Key",
+ .type = FIO_OPT_STR_STORE,
+ .help = "S3 SSE Customer Key",
+ .off1 = offsetof(struct http_options, s3_sse_customer_key),
+ .def = "",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_HTTP,
+ },
+ {
+ .name = "http_s3_sse_customer_algorithm",
+ .lname = "SSE Customer Algorithm",
+ .type = FIO_OPT_STR_STORE,
+ .help = "S3 SSE Customer Algorithm",
+ .off1 = offsetof(struct http_options, s3_sse_customer_algorithm),
+ .def = "AES256",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_HTTP,
+ },
+ {
+ .name = "http_s3_storage_class",
+ .lname = "S3 Storage class",
+ .type = FIO_OPT_STR_STORE,
+ .help = "S3 Storage Class",
+ .off1 = offsetof(struct http_options, s3_storage_class),
+ .def = "STANDARD",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_HTTP,
+ },
{
.name = "http_mode",
.lname = "Request mode to use",
for (i = 0; (c = uri[i]); i++) {
if (n > bufsize-5) {
log_err("encoding the URL failed\n");
+ free(r);
return NULL;
}
return _conv_hex(hash, MD5_DIGEST_LENGTH);
}
+/*
+ * Encode @len bytes at @p as standard, '='-padded base64.
+ *
+ * Returns a newly allocated NUL-terminated string that the caller must
+ * free(), or NULL if the allocation fails. An empty input yields an empty
+ * string.
+ */
+static char *_conv_base64_encode(const unsigned char *p, size_t len)
+{
+	static const char sEncodingTable[] =
+		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+		"abcdefghijklmnopqrstuvwxyz"
+		"0123456789+/";
+	size_t out_len = 4 * ((len + 2) / 3);
+	size_t i = 0;
+	char *r, *ret;
+
+	ret = r = malloc(out_len + 1);
+	if (!ret)
+		return NULL;
+
+	/*
+	 * Full 3-byte groups. Written as "i + 2 < len" because "len - 2"
+	 * wraps around in size_t when len is 0 or 1.
+	 */
+	for (; i + 2 < len; i += 3) {
+		*r++ = sEncodingTable[(p[i] >> 2) & 0x3F];
+		*r++ = sEncodingTable[((p[i] & 0x3) << 4) | (p[i + 1] >> 4)];
+		*r++ = sEncodingTable[((p[i + 1] & 0xF) << 2) | (p[i + 2] >> 6)];
+		*r++ = sEncodingTable[p[i + 2] & 0x3F];
+	}
+
+	/* One or two trailing bytes: emit the partial group plus padding. */
+	if (i < len) {
+		*r++ = sEncodingTable[(p[i] >> 2) & 0x3F];
+		if (i + 1 == len) {
+			*r++ = sEncodingTable[(p[i] & 0x3) << 4];
+			*r++ = '=';
+		} else {
+			*r++ = sEncodingTable[((p[i] & 0x3) << 4) | (p[i + 1] >> 4)];
+			*r++ = sEncodingTable[(p[i + 1] & 0xF) << 2];
+		}
+		*r++ = '=';
+	}
+
+	*r = '\0';
+	return ret;
+}
+
+/*
+ * Return the base64 encoding of the MD5 digest of the @len bytes at @p,
+ * as produced by _conv_base64_encode().
+ */
+static char *_gen_base64_md5(const unsigned char *p, size_t len)
+{
+	unsigned char digest[MD5_DIGEST_LENGTH];
+
+	MD5(p, len, digest);
+	return _conv_base64_encode(digest, sizeof(digest));
+}
+
static void _hmac(unsigned char *md, void *key, int key_len, char *data) {
#ifndef CONFIG_HAVE_OPAQUE_HMAC_CTX
HMAC_CTX _ctx;
switch (type) {
case CURLINFO_TEXT:
fprintf(stderr, "== Info: %s", data);
- /* fall through */
+ fio_fallthrough;
default:
case CURLINFO_SSL_DATA_OUT:
- /* fall through */
case CURLINFO_SSL_DATA_IN:
return 0;
char date_iso[32];
char method[8];
char dkey[128];
- char creq[512];
- char sts[256];
+ char creq[4096];
+ char sts[512];
char s[512];
char *uri_encoded = NULL;
char *dsha = NULL;
const char *service = "s3";
const char *aws = "aws4_request";
unsigned char md[SHA256_DIGEST_LENGTH];
+ unsigned char sse_key[33] = {0};
+ char *sse_key_base64 = NULL;
+ char *sse_key_md5_base64 = NULL;
time_t t = time(NULL);
struct tm *gtm = gmtime(&t);
strftime (date_iso, sizeof(date_iso), "%Y%m%dT%H%M%SZ", gtm);
uri_encoded = _aws_uriencode(uri);
+ if (o->s3_sse_customer_key != NULL)
+ strncpy((char*)sse_key, o->s3_sse_customer_key, sizeof(sse_key) - 1);
+
if (op == DDIR_WRITE) {
dsha = _gen_hex_sha256(buf, len);
sprintf(method, "PUT");
}
/* Create the canonical request first */
- snprintf(creq, sizeof(creq),
- "%s\n"
- "%s\n"
- "\n"
- "host:%s\n"
- "x-amz-content-sha256:%s\n"
- "x-amz-date:%s\n"
- "\n"
- "host;x-amz-content-sha256;x-amz-date\n"
- "%s"
- , method
- , uri_encoded, o->host, dsha, date_iso, dsha);
+ if (sse_key[0] != '\0') {
+ sse_key_base64 = _conv_base64_encode(sse_key, sizeof(sse_key) - 1);
+ sse_key_md5_base64 = _gen_base64_md5(sse_key, sizeof(sse_key) - 1);
+ snprintf(creq, sizeof(creq),
+ "%s\n"
+ "%s\n"
+ "\n"
+ "host:%s\n"
+ "x-amz-content-sha256:%s\n"
+ "x-amz-date:%s\n"
+ "x-amz-server-side-encryption-customer-algorithm:%s\n"
+ "x-amz-server-side-encryption-customer-key:%s\n"
+ "x-amz-server-side-encryption-customer-key-md5:%s\n"
+ "x-amz-storage-class:%s\n"
+ "\n"
+ "host;x-amz-content-sha256;x-amz-date;"
+ "x-amz-server-side-encryption-customer-algorithm;"
+ "x-amz-server-side-encryption-customer-key;"
+ "x-amz-server-side-encryption-customer-key-md5;"
+ "x-amz-storage-class\n"
+ "%s"
+ , method
+ , uri_encoded, o->host, dsha, date_iso
+ , o->s3_sse_customer_algorithm, sse_key_base64
+ , sse_key_md5_base64, o->s3_storage_class, dsha);
+ } else {
+ snprintf(creq, sizeof(creq),
+ "%s\n"
+ "%s\n"
+ "\n"
+ "host:%s\n"
+ "x-amz-content-sha256:%s\n"
+ "x-amz-date:%s\n"
+ "x-amz-storage-class:%s\n"
+ "\n"
+ "host;x-amz-content-sha256;x-amz-date;x-amz-storage-class\n"
+ "%s"
+ , method
+ , uri_encoded, o->host, dsha, date_iso, o->s3_storage_class, dsha);
+ }
csha = _gen_hex_sha256(creq, strlen(creq));
snprintf(sts, sizeof(sts), "AWS4-HMAC-SHA256\n%s\n%s/%s/%s/%s\n%s",
- date_iso, date_short, o->s3_region, service, aws, csha);
+ date_iso, date_short, o->s3_region, service, aws, csha);
snprintf((char *)dkey, sizeof(dkey), "AWS4%s", o->s3_key);
_hmac(md, dkey, strlen(dkey), date_short);
signature = _conv_hex(md, SHA256_DIGEST_LENGTH);
- /* Surpress automatic Accept: header */
+ /* Suppress automatic Accept: header */
slist = curl_slist_append(slist, "Accept:");
snprintf(s, sizeof(s), "x-amz-content-sha256: %s", dsha);
snprintf(s, sizeof(s), "x-amz-date: %s", date_iso);
slist = curl_slist_append(slist, s);
- snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
- "SignedHeaders=host;x-amz-content-sha256;x-amz-date,Signature=%s",
- o->s3_keyid, date_short, o->s3_region, signature);
+ if (sse_key[0] != '\0') {
+ snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-algorithm: %s", o->s3_sse_customer_algorithm);
+ slist = curl_slist_append(slist, s);
+ snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key: %s", sse_key_base64);
+ slist = curl_slist_append(slist, s);
+ snprintf(s, sizeof(s), "x-amz-server-side-encryption-customer-key-md5: %s", sse_key_md5_base64);
+ slist = curl_slist_append(slist, s);
+ }
+
+ snprintf(s, sizeof(s), "x-amz-storage-class: %s", o->s3_storage_class);
+ slist = curl_slist_append(slist, s);
+
+ if (sse_key[0] != '\0') {
+ snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
+ "SignedHeaders=host;x-amz-content-sha256;"
+ "x-amz-date;x-amz-server-side-encryption-customer-algorithm;"
+ "x-amz-server-side-encryption-customer-key;"
+ "x-amz-server-side-encryption-customer-key-md5;"
+ "x-amz-storage-class,"
+ "Signature=%s",
+ o->s3_keyid, date_short, o->s3_region, signature);
+ } else {
+ snprintf(s, sizeof(s), "Authorization: AWS4-HMAC-SHA256 Credential=%s/%s/%s/s3/aws4_request,"
+ "SignedHeaders=host;x-amz-content-sha256;x-amz-date;x-amz-storage-class,Signature=%s",
+ o->s3_keyid, date_short, o->s3_region, signature);
+ }
slist = curl_slist_append(slist, s);
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, slist);
free(csha);
free(dsha);
free(signature);
+ if (sse_key_base64 != NULL) {
+ free(sse_key_base64);
+ free(sse_key_md5_base64);
+ }
}
static void _add_swift_header(CURL *curl, struct curl_slist *slist, struct http_options *o,
if (op == DDIR_WRITE) {
dsha = _gen_hex_md5(buf, len);
}
- /* Surpress automatic Accept: header */
+ /* Suppress automatic Accept: header */
slist = curl_slist_append(slist, "Accept:");
snprintf(s, sizeof(s), "etag: %s", dsha);
char url[1024];
long status;
CURLcode res;
- int r = -1;
fio_ro_check(td, io_u);
memset(&_curl_stream, 0, sizeof(_curl_stream));
if (status == 100 || (status >= 200 && status <= 204))
goto out;
log_err("DDIR_WRITE failed with HTTP status code %ld\n", status);
- goto err;
}
+ goto err;
} else if (io_u->ddir == DDIR_READ) {
curl_easy_setopt(http->curl, CURLOPT_READDATA, NULL);
curl_easy_setopt(http->curl, CURLOPT_WRITEDATA, &_curl_stream);
log_err("WARNING: Only DDIR_READ/DDIR_WRITE/DDIR_TRIM are supported!\n");
err:
- io_u->error = r;
+ io_u->error = EIO;
td_verror(td, io_u->error, "transfer");
out:
curl_slist_free_all(slist);
};
struct iovec *iovecs; /* array of queued iovecs */
struct io_u **io_us; /* array of queued io_u pointers */
- struct io_u **event_io_us; /* array of the events retieved afer get_events*/
+ struct io_u **event_io_us; /* array of the events retrieved after get_events*/
unsigned int queued; /* iovecs/io_us in the queue */
unsigned int events; /* number of committed iovecs/io_us */
return 1;
}
- if (td->o.oatomic) {
- td_verror(td, EINVAL, "IME does not support atomic IO");
- return 1;
- }
if (td->o.odirect)
flags |= O_DIRECT;
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
if (td->o.create_on_open && td->o.allow_create)
flags |= O_CREAT;
#include "../lib/memalign.h"
#include "../lib/fls.h"
#include "../lib/roundup.h"
+#include "../verify.h"
#ifdef ARCH_HAVE_IOURING
#include "../lib/types.h"
#include "../os/linux/io_uring.h"
+#include "cmdprio.h"
+#include "zbd.h"
+#include "nvme.h"
+
+#include <sys/stat.h>
+
+enum uring_cmd_type {
+ FIO_URING_CMD_NVME = 1,
+};
struct io_sq_ring {
unsigned *head;
int ring_fd;
struct io_u **io_u_index;
+ char *md_buf;
int *fds;
int queued;
int cq_ring_off;
unsigned iodepth;
- bool ioprio_class_set;
- bool ioprio_set;
int prepped;
struct ioring_mmap mmap[3];
+
+ struct cmdprio cmdprio;
+
+ struct nvme_dsm *dsm;
};
struct ioring_options {
- void *pad;
+ struct thread_data *td;
unsigned int hipri;
- unsigned int cmdprio_percentage;
+ struct cmdprio_options cmdprio_options;
unsigned int fixedbufs;
unsigned int registerfiles;
unsigned int sqpoll_thread;
unsigned int uncached;
unsigned int nowait;
unsigned int force_async;
+ unsigned int md_per_io_size;
+ unsigned int pi_act;
+ unsigned int apptag;
+ unsigned int apptag_mask;
+ unsigned int prchk;
+ char *pi_chk;
+ enum uring_cmd_type cmd_type;
};
static const int ddir_to_op[2][2] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_IOURING,
},
-#ifdef FIO_HAVE_IOPRIO_CLASS
- {
- .name = "cmdprio_percentage",
- .lname = "high priority percentage",
- .type = FIO_OPT_INT,
- .off1 = offsetof(struct ioring_options, cmdprio_percentage),
- .minval = 1,
- .maxval = 100,
- .help = "Send high priority I/O this percentage of the time",
- .category = FIO_OPT_C_ENGINE,
- .group = FIO_OPT_G_IOURING,
- },
-#else
- {
- .name = "cmdprio_percentage",
- .lname = "high priority percentage",
- .type = FIO_OPT_UNSUPPORTED,
- .help = "Your platform does not support I/O priority classes",
- },
-#endif
{
.name = "fixedbufs",
.lname = "Fixed (pre-mapped) IO buffers",
{
.name = "sqthread_poll",
.lname = "Kernel SQ thread polling",
- .type = FIO_OPT_INT,
+ .type = FIO_OPT_STR_SET,
.off1 = offsetof(struct ioring_options, sqpoll_thread),
.help = "Offload submission/completion to kernel thread",
.category = FIO_OPT_C_ENGINE,
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_IOURING,
},
+ {
+ .name = "cmd_type",
+ .lname = "Uring cmd type",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct ioring_options, cmd_type),
+ .help = "Specify uring-cmd type",
+ .def = "nvme",
+ .posval = {
+ { .ival = "nvme",
+ .oval = FIO_URING_CMD_NVME,
+ .help = "Issue nvme-uring-cmd",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
+ CMDPRIO_OPTIONS(struct ioring_options, FIO_OPT_G_IOURING),
+ {
+ .name = "md_per_io_size",
+ .lname = "Separate Metadata Buffer Size per I/O",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct ioring_options, md_per_io_size),
+ .def = "0",
+ .help = "Size of separate metadata buffer per I/O (Default: 0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
+ {
+ .name = "pi_act",
+ .lname = "Protection Information Action",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct ioring_options, pi_act),
+ .def = "1",
+ .help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
+ {
+ .name = "pi_chk",
+ .lname = "Protection Information Check",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct ioring_options, pi_chk),
+ .def = NULL,
+ .help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
+ {
+ .name = "apptag",
+ .lname = "Application Tag used in Protection Information",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct ioring_options, apptag),
+ .def = "0x1234",
+ .help = "Application Tag used in Protection Information field (Default: 0x1234)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
+ {
+ .name = "apptag_mask",
+ .lname = "Application Tag Mask",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct ioring_options, apptag_mask),
+ .def = "0xffff",
+ .help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_IOURING,
+ },
{
.name = NULL,
},
+/*
+ * Issue the io_uring_enter system call on ld->ring_fd, passing to_submit,
+ * min_complete and flags through, with a NULL signal mask argument of
+ * size 0. Uses __do_syscall6 when FIO_ARCH_HAS_SYSCALL is defined and
+ * syscall() otherwise. Returns the result of that call.
+ */
static int io_uring_enter(struct ioring_data *ld, unsigned int to_submit,
unsigned int min_complete, unsigned int flags)
{
+#ifdef FIO_ARCH_HAS_SYSCALL
+ return __do_syscall6(__NR_io_uring_enter, ld->ring_fd, to_submit,
+ min_complete, flags, NULL, 0);
+#else
return syscall(__NR_io_uring_enter, ld->ring_fd, to_submit,
min_complete, flags, NULL, 0);
+#endif
}
static int fio_ioring_prep(struct thread_data *td, struct io_u *io_u)
sqe->flags = IOSQE_FIXED_FILE;
} else {
sqe->fd = f->fd;
+ sqe->flags = 0;
}
if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) {
sqe->len = 1;
}
}
+ sqe->rw_flags = 0;
if (!td->o.odirect && o->uncached)
- sqe->rw_flags = RWF_UNCACHED;
+ sqe->rw_flags |= RWF_UNCACHED;
if (o->nowait)
sqe->rw_flags |= RWF_NOWAIT;
- if (ld->ioprio_class_set)
- sqe->ioprio = td->o.ioprio_class << 13;
- if (ld->ioprio_set)
- sqe->ioprio |= td->o.ioprio;
+
+ /*
+ * Since io_uring can have a submission context (sqthread_poll)
+ * that is different from the process context, we cannot rely on
+ * the IO priority set by ioprio_set() (options prio, prioclass,
+ * and priohint) to be inherited.
+ * td->ioprio will have the value of the "default prio", so set
+ * this unconditionally. This value might get overridden by
+ * fio_ioring_cmdprio_prep() if the option cmdprio_percentage or
+ * cmdprio_bssplit is used.
+ */
+ sqe->ioprio = td->ioprio;
sqe->off = io_u->offset;
- sqe->rw_flags = 0;
} else if (ddir_sync(io_u->ddir)) {
sqe->ioprio = 0;
if (io_u->ddir == DDIR_SYNC_FILE_RANGE) {
return 0;
}
+/*
+ * Prepare the submission queue entry for @io_u as an NVMe uring-cmd
+ * passthrough request. The entry lives at index io_u->index << 1 of the
+ * sqe array, and the NVMe command itself is filled in by
+ * fio_nvme_uring_cmd_prep().
+ *
+ * Returns -EINVAL if cmd_type is not nvme, 0 without touching the sqe for
+ * a trim when the engine has FIO_ASYNCIO_SYNC_TRIM set, and otherwise the
+ * result of fio_nvme_uring_cmd_prep().
+ */
+static int fio_ioring_cmd_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	struct fio_file *f = io_u->file;
+	struct nvme_uring_cmd *cmd;
+	struct io_uring_sqe *sqe;
+	struct nvme_dsm *dsm;
+	unsigned int dsm_size;
+
+	/* only supports nvme_uring_cmd */
+	if (o->cmd_type != FIO_URING_CMD_NVME)
+		return -EINVAL;
+
+	if (io_u->ddir == DDIR_TRIM && td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM)
+		return 0;
+
+	sqe = &ld->sqes[(io_u->index) << 1];
+
+	if (o->registerfiles) {
+		sqe->fd = f->engine_pos;
+		sqe->flags = IOSQE_FIXED_FILE;
+	} else {
+		sqe->fd = f->fd;
+		/*
+		 * The sqe is reused for every submission of this io_u, so
+		 * clear flags left by an earlier one (e.g. IOSQE_ASYNC from
+		 * force_async), as fio_ioring_prep() does.
+		 */
+		sqe->flags = 0;
+	}
+	sqe->rw_flags = 0;
+	if (!td->o.odirect && o->uncached)
+		sqe->rw_flags |= RWF_UNCACHED;
+	if (o->nowait)
+		sqe->rw_flags |= RWF_NOWAIT;
+
+	sqe->opcode = IORING_OP_URING_CMD;
+	sqe->user_data = (unsigned long) io_u;
+	if (o->nonvectored)
+		sqe->cmd_op = NVME_URING_CMD_IO;
+	else
+		sqe->cmd_op = NVME_URING_CMD_IO_VEC;
+	/* mark every force_async-th prepared request as IOSQE_ASYNC */
+	if (o->force_async && ++ld->prepped == o->force_async) {
+		ld->prepped = 0;
+		sqe->flags |= IOSQE_ASYNC;
+	}
+	if (o->fixedbufs) {
+		sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
+		sqe->buf_index = io_u->index;
+	}
+
+	cmd = (struct nvme_uring_cmd *)sqe->cmd;
+
+	/*
+	 * Each io_u owns one slot in ld->dsm: a struct nvme_dsm header
+	 * followed by num_range range entries. Locate it with byte
+	 * arithmetic on char * instead of on void *.
+	 */
+	dsm_size = sizeof(*ld->dsm) + td->o.num_range * sizeof(struct nvme_dsm_range);
+	dsm = (struct nvme_dsm *)((char *)ld->dsm + io_u->index * dsm_size);
+
+	return fio_nvme_uring_cmd_prep(cmd, io_u,
+			o->nonvectored ? NULL : &ld->iovecs[io_u->index],
+			dsm);
+}
+
static struct io_u *fio_ioring_event(struct thread_data *td, int event)
{
struct ioring_data *ld = td->io_ops_data;
return io_u;
}
+/*
+ * Return the io_u belonging to completion number @event of the current
+ * reap batch on the uring-cmd path. For cmd_type nvme the completion index
+ * is doubled before the cqe is looked up.
+ *
+ * io_u->error is set to -cqe->res when the result is non-zero. Otherwise it
+ * is cleared and, for an nvme read with protection information enabled
+ * and pi_act off, replaced by any non-zero result of fio_nvme_pi_verify().
+ */
+static struct io_u *fio_ioring_cmd_event(struct thread_data *td, int event)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	const int is_nvme = o->cmd_type == FIO_URING_CMD_NVME;
+	unsigned slot = (event + ld->cq_ring_off) & ld->cq_ring_mask;
+	struct io_uring_cqe *cqe;
+	struct nvme_data *data;
+	struct io_u *io_u;
+
+	if (is_nvme)
+		slot <<= 1;
+
+	cqe = &ld->cq_ring.cqes[slot];
+	io_u = (struct io_u *) (uintptr_t) cqe->user_data;
+
+	if (cqe->res != 0) {
+		io_u->error = -cqe->res;
+		return io_u;
+	}
+
+	io_u->error = 0;
+	if (!is_nvme)
+		return io_u;
+
+	data = FILE_ENG_DATA(io_u->file);
+	if (data->pi_type && io_u->ddir == DDIR_READ && !o->pi_act) {
+		int verify_err = fio_nvme_pi_verify(data, io_u);
+
+		if (verify_err)
+			io_u->error = verify_err;
+	}
+
+	return io_u;
+}
+
static int fio_ioring_cqring_reap(struct thread_data *td, unsigned int events,
unsigned int max)
{
r = fio_ioring_cqring_reap(td, events, max);
if (r) {
events += r;
+ max -= r;
if (actual_min != 0)
actual_min -= r;
continue;
if (r < 0) {
if (errno == EAGAIN || errno == EINTR)
continue;
+ r = -errno;
td_verror(td, errno, "io_uring_enter");
break;
}
return r < 0 ? r : events;
}
-static void fio_ioring_prio_prep(struct thread_data *td, struct io_u *io_u)
+/*
+ * Fill in the protection information part of the NVMe passthrough command
+ * stored in this io_u's SQE. Trim requests are skipped. When the namespace
+ * reports a PI type, the PRACT/PRCHK flags and application tag settings are
+ * taken from the engine options before being handed to fio_nvme_pi_fill().
+ */
+static inline void fio_ioring_cmd_nvme_pi(struct thread_data *td,
+ struct io_u *io_u)
{
- struct ioring_options *o = td->eo;
struct ioring_data *ld = td->io_ops_data;
- if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) {
- ld->sqes[io_u->index].ioprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT;
- io_u->flags |= IO_U_F_PRIORITY;
+ struct ioring_options *o = td->eo;
+ struct nvme_uring_cmd *cmd;
+ struct io_uring_sqe *sqe;
+ struct nvme_cmd_ext_io_opts ext_opts = {0};
+ struct nvme_data *data = FILE_ENG_DATA(io_u->file);
+
+ if (io_u->ddir == DDIR_TRIM)
+ return;
+
+ /* SQEs are addressed two slots apart here (index << 1) */
+ sqe = &ld->sqes[(io_u->index) << 1];
+ cmd = (struct nvme_uring_cmd *)sqe->cmd;
+
+ if (data->pi_type) {
+ if (o->pi_act)
+ ext_opts.io_flags |= NVME_IO_PRINFO_PRACT;
+ ext_opts.io_flags |= o->prchk;
+ ext_opts.apptag = o->apptag;
+ ext_opts.apptag_mask = o->apptag_mask;
}
- return;
+
+ /* ext_opts stays all-zero when the namespace has no PI type */
+ fio_nvme_pi_fill(cmd, io_u, &ext_opts);
+}
+
+/*
+ * Ask the shared cmdprio helper to choose a priority for this io_u; when it
+ * reports that one was set, copy io_u->ioprio into the io_u's SQE.
+ *
+ * NOTE(review): the SQE is indexed with io_u->index directly, whereas the
+ * NVMe uring_cmd paths in this file use (io_u->index << 1) - confirm this is
+ * intended when the ring is set up with double-sized SQEs.
+ */
+static inline void fio_ioring_cmdprio_prep(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct ioring_data *ld = td->io_ops_data;
+ struct cmdprio *cmdprio = &ld->cmdprio;
+
+ if (fio_cmdprio_set_ioprio(td, cmdprio, io_u))
+ ld->sqes[io_u->index].ioprio = io_u->ioprio;
}
static enum fio_q_status fio_ioring_queue(struct thread_data *td,
struct io_u *io_u)
{
struct ioring_data *ld = td->io_ops_data;
- struct io_sq_ring *ring = &ld->sq_ring;
struct ioring_options *o = td->eo;
+ struct io_sq_ring *ring = &ld->sq_ring;
unsigned tail, next_tail;
fio_ro_check(td, io_u);
if (ld->queued == ld->iodepth)
return FIO_Q_BUSY;
- if (io_u->ddir == DDIR_TRIM) {
+ if (io_u->ddir == DDIR_TRIM && td->io_ops->flags & FIO_ASYNCIO_SYNC_TRIM) {
if (ld->queued)
return FIO_Q_BUSY;
do_io_u_trim(td, io_u);
+
io_u_mark_submit(td, 1);
io_u_mark_complete(td, 1);
return FIO_Q_COMPLETED;
tail = *ring->tail;
next_tail = tail + 1;
- if (next_tail == atomic_load_acquire(ring->head))
+ if (next_tail == atomic_load_relaxed(ring->head))
return FIO_Q_BUSY;
- if (o->cmdprio_percentage)
- fio_ioring_prio_prep(td, io_u);
+ if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+ fio_ioring_cmdprio_prep(td, io_u);
+
+ if (!strcmp(td->io_ops->name, "io_uring_cmd") &&
+ o->cmd_type == FIO_URING_CMD_NVME)
+ fio_ioring_cmd_nvme_pi(td, io_u);
+
ring->array[tail & ld->sq_ring_mask] = io_u->index;
atomic_store_release(ring->tail, next_tail);
start++;
}
+
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &now, sizeof(now));
}
static int fio_ioring_commit(struct thread_data *td)
*/
if (o->sqpoll_thread) {
struct io_sq_ring *ring = &ld->sq_ring;
+ unsigned start = *ld->sq_ring.tail - ld->queued;
unsigned flags;
- flags = atomic_load_acquire(ring->flags);
+ flags = atomic_load_relaxed(ring->flags);
if (flags & IORING_SQ_NEED_WAKEUP)
io_uring_enter(ld, ld->queued, 0,
IORING_ENTER_SQ_WAKEUP);
+ fio_ioring_queued(td, start, ld->queued);
+ io_u_mark_submit(td, ld->queued);
+
ld->queued = 0;
return 0;
}
usleep(1);
continue;
}
+ ret = -errno;
td_verror(td, errno, "io_uring_enter submit");
break;
}
{
int i;
- for (i = 0; i < ARRAY_SIZE(ld->mmap); i++)
+ for (i = 0; i < FIO_ARRAY_SIZE(ld->mmap); i++)
munmap(ld->mmap[i].ptr, ld->mmap[i].len);
close(ld->ring_fd);
}
if (!(td->flags & TD_F_CHILD))
fio_ioring_unmap(ld);
+ fio_cmdprio_cleanup(&ld->cmdprio);
free(ld->io_u_index);
+ free(ld->md_buf);
free(ld->iovecs);
free(ld->fds);
+ free(ld->dsm);
free(ld);
}
}
sring->array = ptr + p->sq_off.array;
ld->sq_ring_mask = *sring->ring_mask;
- ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
+ if (p->flags & IORING_SETUP_SQE128)
+ ld->mmap[1].len = 2 * p->sq_entries * sizeof(struct io_uring_sqe);
+ else
+ ld->mmap[1].len = p->sq_entries * sizeof(struct io_uring_sqe);
ld->sqes = mmap(0, ld->mmap[1].len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, ld->ring_fd,
IORING_OFF_SQES);
ld->mmap[1].ptr = ld->sqes;
- ld->mmap[2].len = p->cq_off.cqes +
- p->cq_entries * sizeof(struct io_uring_cqe);
+ if (p->flags & IORING_SETUP_CQE32) {
+ ld->mmap[2].len = p->cq_off.cqes +
+ 2 * p->cq_entries * sizeof(struct io_uring_cqe);
+ } else {
+ ld->mmap[2].len = p->cq_off.cqes +
+ p->cq_entries * sizeof(struct io_uring_cqe);
+ }
ptr = mmap(0, ld->mmap[2].len, PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE, ld->ring_fd,
IORING_OFF_CQ_RING);
/* default to off, as that's always safe */
o->nonvectored = 0;
- p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
+ p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
if (!p)
return;
- memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
ret = syscall(__NR_io_uring_register, ld->ring_fd,
IORING_REGISTER_PROBE, p, 256);
if (ret < 0)
p.flags |= IORING_SETUP_SQ_AFF;
p.sq_thread_cpu = o->sqpoll_cpu;
}
+
+ /*
+ * Submission latency for sqpoll_thread is just the time it
+ * takes to fill in the SQ ring entries, and any syscall if
+ * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time
+ * separately.
+ */
+ td->o.disable_slat = 1;
}
+ /*
+ * Clamp CQ ring size at our SQ ring size, we don't need more entries
+ * than that.
+ */
+ p.flags |= IORING_SETUP_CQSIZE;
+ p.cq_entries = depth;
+
+ /*
+ * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
+ * completing IO operations.
+ */
+ p.flags |= IORING_SETUP_COOP_TASKRUN;
+
+ /*
+ * io_uring is always a single issuer, and we can defer task_work
+ * runs until we reap events.
+ */
+ p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
+
+retry:
ret = syscall(__NR_io_uring_setup, depth, &p);
- if (ret < 0)
+ if (ret < 0) {
+ if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
+ p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
+ p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
+ goto retry;
+ }
+ if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
+ p.flags &= ~IORING_SETUP_COOP_TASKRUN;
+ goto retry;
+ }
+ if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
+ p.flags &= ~IORING_SETUP_CQSIZE;
+ goto retry;
+ }
return ret;
+ }
+
+ ld->ring_fd = ret;
+
+ fio_ioring_probe(td);
+
+ if (o->fixedbufs) {
+ ret = syscall(__NR_io_uring_register, ld->ring_fd,
+ IORING_REGISTER_BUFFERS, ld->iovecs, depth);
+ if (ret < 0)
+ return ret;
+ }
+
+ return fio_ioring_mmap(ld, &p);
+}
+
+static int fio_ioring_cmd_queue_init(struct thread_data *td)
+{
+ struct ioring_data *ld = td->io_ops_data;
+ struct ioring_options *o = td->eo;
+ int depth = td->o.iodepth;
+ struct io_uring_params p;
+ int ret;
+
+ memset(&p, 0, sizeof(p));
+
+ if (o->hipri)
+ p.flags |= IORING_SETUP_IOPOLL;
+ if (o->sqpoll_thread) {
+ p.flags |= IORING_SETUP_SQPOLL;
+ if (o->sqpoll_set) {
+ p.flags |= IORING_SETUP_SQ_AFF;
+ p.sq_thread_cpu = o->sqpoll_cpu;
+ }
+
+ /*
+ * Submission latency for sqpoll_thread is just the time it
+ * takes to fill in the SQ ring entries, and any syscall if
+ * IORING_SQ_NEED_WAKEUP is set, we don't need to log that time
+ * separately.
+ */
+ td->o.disable_slat = 1;
+ }
+ if (o->cmd_type == FIO_URING_CMD_NVME) {
+ p.flags |= IORING_SETUP_SQE128;
+ p.flags |= IORING_SETUP_CQE32;
+ }
+
+ /*
+ * Clamp CQ ring size at our SQ ring size, we don't need more entries
+ * than that.
+ */
+ p.flags |= IORING_SETUP_CQSIZE;
+ p.cq_entries = depth;
+
+ /*
+ * Setup COOP_TASKRUN as we don't need to get IPI interrupted for
+ * completing IO operations.
+ */
+ p.flags |= IORING_SETUP_COOP_TASKRUN;
+
+ /*
+ * io_uring is always a single issuer, and we can defer task_work
+ * runs until we reap events.
+ */
+ p.flags |= IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
+
+retry:
+ ret = syscall(__NR_io_uring_setup, depth, &p);
+ if (ret < 0) {
+ if (errno == EINVAL && p.flags & IORING_SETUP_DEFER_TASKRUN) {
+ p.flags &= ~IORING_SETUP_DEFER_TASKRUN;
+ p.flags &= ~IORING_SETUP_SINGLE_ISSUER;
+ goto retry;
+ }
+ if (errno == EINVAL && p.flags & IORING_SETUP_COOP_TASKRUN) {
+ p.flags &= ~IORING_SETUP_COOP_TASKRUN;
+ goto retry;
+ }
+ if (errno == EINVAL && p.flags & IORING_SETUP_CQSIZE) {
+ p.flags &= ~IORING_SETUP_CQSIZE;
+ goto retry;
+ }
+ return ret;
+ }
ld->ring_fd = ret;
err = fio_ioring_queue_init(td);
if (err) {
- td_verror(td, errno, "io_queue_init");
+ int init_err = errno;
+
+ if (init_err == ENOSYS)
+ log_err("fio: your kernel doesn't support io_uring\n");
+ td_verror(td, init_err, "io_queue_init");
return 1;
}
return 0;
}
-static int fio_ioring_init(struct thread_data *td)
+/*
+ * post_init hook of the io_uring_cmd engine: point every iovec at its io_u
+ * buffer, create the ring, zero the SQEs and optionally register the files.
+ * Returns 0 on success and 1 (after td_verror()) on failure.
+ */
+static int fio_ioring_cmd_post_init(struct thread_data *td)
{
+ struct ioring_data *ld = td->io_ops_data;
struct ioring_options *o = td->eo;
- struct ioring_data *ld;
- struct thread_options *to = &td->o;
+ struct io_u *io_u;
+ int err, i;
- if (to->io_submit_mode == IO_MODE_OFFLOAD) {
- log_err("fio: io_submit_mode=offload is not compatible (or "
- "useful) with io_uring\n");
+ /* one iovec per io_u, each spanning the largest configured block size */
+ for (i = 0; i < td->o.iodepth; i++) {
+ struct iovec *iov = &ld->iovecs[i];
+
+ io_u = ld->io_u_index[i];
+ iov->iov_base = io_u->buf;
+ iov->iov_len = td_max_bs(td);
+ }
+
+ err = fio_ioring_cmd_queue_init(td);
+ if (err) {
+ /* capture errno before any later call can overwrite it */
+ int init_err = errno;
+
+ td_verror(td, init_err, "io_queue_init");
return 1;
}
+ /* NVMe commands use two consecutive SQE slots per io_u, others one */
+ for (i = 0; i < td->o.iodepth; i++) {
+ struct io_uring_sqe *sqe;
+
+ if (o->cmd_type == FIO_URING_CMD_NVME) {
+ sqe = &ld->sqes[i << 1];
+ memset(sqe, 0, 2 * sizeof(*sqe));
+ } else {
+ sqe = &ld->sqes[i];
+ memset(sqe, 0, sizeof(*sqe));
+ }
+ }
+
+ if (o->registerfiles) {
+ err = fio_ioring_register_files(td);
+ if (err) {
+ td_verror(td, errno, "ioring_register_files");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Translate the textual pi_chk option into PRCHK bits in o->prchk. The
+ * keywords GUARD, REFTAG and APPTAG are matched as substrings; nothing is
+ * touched when pi_chk is unset.
+ */
+static void parse_prchk_flags(struct ioring_options *o)
+{
+	const char *spec = o->pi_chk;
+
+	if (spec == NULL)
+		return;
+
+	/* GUARD assigns rather than ORs, the other two keywords accumulate */
+	if (strstr(spec, "GUARD"))
+		o->prchk = NVME_IO_PRINFO_PRCHK_GUARD;
+	if (strstr(spec, "REFTAG"))
+		o->prchk |= NVME_IO_PRINFO_PRCHK_REF;
+	if (strstr(spec, "APPTAG"))
+		o->prchk |= NVME_IO_PRINFO_PRCHK_APP;
+}
+
+static int fio_ioring_init(struct thread_data *td)
+{
+ struct ioring_options *o = td->eo;
+ struct ioring_data *ld;
+ struct nvme_dsm *dsm;
+ void *ptr;
+ unsigned int dsm_size;
+ unsigned long long md_size;
+ int ret, i;
+
/* sqthread submission requires registered files */
if (o->sqpoll_thread)
o->registerfiles = 1;
/* io_u index */
ld->io_u_index = calloc(td->o.iodepth, sizeof(struct io_u *));
+
+ /*
+ * metadata buffer for nvme command.
+ * We are only supporting iomem=malloc / mem=malloc as of now.
+ */
+ if (!strcmp(td->io_ops->name, "io_uring_cmd") &&
+ (o->cmd_type == FIO_URING_CMD_NVME) && o->md_per_io_size) {
+ md_size = (unsigned long long) o->md_per_io_size
+ * (unsigned long long) td->o.iodepth;
+ md_size += page_mask + td->o.mem_align;
+ if (td->o.mem_align && td->o.mem_align > page_size)
+ md_size += td->o.mem_align - page_size;
+ if (td->o.mem_type == MEM_MALLOC) {
+ ld->md_buf = malloc(md_size);
+ if (!ld->md_buf) {
+ free(ld);
+ return 1;
+ }
+ } else {
+ log_err("fio: Only iomem=malloc or mem=malloc is supported\n");
+ free(ld);
+ return 1;
+ }
+ }
+ parse_prchk_flags(o);
+
ld->iovecs = calloc(td->o.iodepth, sizeof(struct iovec));
td->io_ops_data = ld;
- /*
- * Check for option conflicts
- */
- if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) &&
- o->cmdprio_percentage != 0) {
- log_err("%s: cmdprio_percentage option and mutually exclusive "
- "prio or prioclass option is set, exiting\n", to->name);
- td_verror(td, EINVAL, "fio_io_uring_init");
+ ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
+ if (ret) {
+ td_verror(td, EINVAL, "fio_ioring_init");
return 1;
}
- if (fio_option_is_set(&td->o, ioprio_class))
- ld->ioprio_class_set = true;
- if (fio_option_is_set(&td->o, ioprio))
- ld->ioprio_set = true;
+ /*
+ * For io_uring_cmd, trims are async operations unless we are operating
+ * in zbd mode where trim means zone reset.
+ */
+ if (!strcmp(td->io_ops->name, "io_uring_cmd") && td_trim(td) &&
+ td->o.zone_mode == ZONE_MODE_ZBD) {
+ td->io_ops->flags |= FIO_ASYNCIO_SYNC_TRIM;
+ } else {
+ dsm_size = sizeof(*ld->dsm) +
+ td->o.num_range * sizeof(struct nvme_dsm_range);
+ ld->dsm = calloc(td->o.iodepth, dsm_size);
+ ptr = ld->dsm;
+ for (i = 0; i < td->o.iodepth; i++) {
+ dsm = (struct nvme_dsm *)ptr;
+ dsm->nr_ranges = td->o.num_range;
+ ptr += dsm_size;
+ }
+ }
return 0;
}
+/*
+ * Per-io_u setup: record the io_u in the index table and, for the
+ * io_uring_cmd engine, assign its slice of the metadata buffer and (unless
+ * pi_act is set) allocate the per-io_u protection information state.
+ * Returns 0 on success, 1 if that allocation fails.
+ */
static int fio_ioring_io_u_init(struct thread_data *td, struct io_u *io_u)
{
struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+	struct nvme_pi_data *pi_data;
+	char *p;
ld->io_u_index[io_u->index] = io_u;
+
+	if (!strcmp(td->io_ops->name, "io_uring_cmd")) {
+		/* each io_u owns md_per_io_size bytes of the metadata buffer */
+		p = PTR_ALIGN(ld->md_buf, page_mask) + td->o.mem_align;
+		p += o->md_per_io_size * io_u->index;
+		io_u->mmap_data = p;
+
+		if (!o->pi_act) {
+			pi_data = calloc(1, sizeof(*pi_data));
+			if (!pi_data) {
+				/* fail the init instead of dereferencing NULL */
+				td_verror(td, ENOMEM, "fio_ioring_io_u_init");
+				return 1;
+			}
+			pi_data->io_flags |= o->prchk;
+			pi_data->apptag_mask = o->apptag_mask;
+			pi_data->apptag = o->apptag;
+			io_u->engine_data = pi_data;
+		}
+	}
+
return 0;
}
+/*
+ * Release the per-io_u engine data of an io_uring_cmd NVMe job and clear the
+ * pointer. Any other engine or command type leaves the io_u untouched.
+ */
+static void fio_ioring_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	struct ioring_options *o = td->eo;
+
+	if (strcmp(td->io_ops->name, "io_uring_cmd") != 0)
+		return;
+	if (o->cmd_type != FIO_URING_CMD_NVME)
+		return;
+
+	free(io_u->engine_data);
+	io_u->engine_data = NULL;
+}
+
+
static int fio_ioring_open_file(struct thread_data *td, struct fio_file *f)
{
struct ioring_data *ld = td->io_ops_data;
return 0;
}
+/*
+ * open_file hook of the io_uring_cmd engine.
+ *
+ * For NVMe command type: fetch (or reuse) the namespace information attached
+ * to the file, then validate that the configured block sizes, the metadata
+ * size per I/O and the verify setting are compatible with the namespace
+ * format. Afterwards the file is either opened generically or, when files are
+ * registered with the ring, given its pre-opened descriptor.
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int fio_ioring_cmd_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *data = NULL;
+		unsigned int lba_size = 0;
+		__u64 nlba = 0;
+		int ret;
+
+		/* Store the namespace-id and lba size. */
+		data = FILE_ENG_DATA(f);
+		if (data == NULL) {
+			data = calloc(1, sizeof(struct nvme_data));
+			if (data == NULL) {
+				/* do not hand a NULL buffer to the NVMe helper */
+				td_verror(td, ENOMEM, "fio_ioring_cmd_open_file");
+				return 1;
+			}
+			ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
+			if (ret) {
+				free(data);
+				return ret;
+			}
+
+			FILE_SET_ENG_DATA(f, data);
+		}
+
+		/* with extended LBAs the unit of I/O is data + metadata */
+		lba_size = data->lba_ext ? data->lba_ext : data->lba_size;
+
+		for_each_rw_ddir(ddir) {
+			if (td->o.min_bs[ddir] % lba_size || td->o.max_bs[ddir] % lba_size) {
+				if (data->lba_ext) {
+					log_err("%s: block size must be a multiple of %u "
+						"(LBA data size + Metadata size)\n", f->file_name, lba_size);
+					if (td->o.min_bs[ddir] == td->o.max_bs[ddir] &&
+					    !(td->o.min_bs[ddir] % data->lba_size)) {
+						/* fixed block size is actually a multiple of LBA data size */
+						unsigned long long suggestion = lba_size *
+							(td->o.min_bs[ddir] / data->lba_size);
+						log_err("Did you mean to use a block size of %llu?\n", suggestion);
+					}
+				} else {
+					log_err("%s: block size must be a multiple of LBA data size\n",
+						f->file_name);
+				}
+				td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
+				return 1;
+			}
+			/* separate metadata buffer must cover the largest I/O */
+			if (data->ms && !data->lba_ext && ddir != DDIR_TRIM &&
+			    (o->md_per_io_size < ((td->o.max_bs[ddir] / data->lba_size) *
+						  data->ms))) {
+				log_err("%s: md_per_io_size should be at least %llu bytes\n",
+					f->file_name,
+					((td->o.max_bs[ddir] / data->lba_size) * data->ms));
+				td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
+				return 1;
+			}
+		}
+
+		/*
+		 * For extended logical block sizes we cannot use verify when
+		 * end to end data protection checks are enabled, as the PI
+		 * section of data buffer conflicts with verify.
+		 */
+		if (data->ms && data->pi_type && data->lba_ext &&
+		    td->o.verify != VERIFY_NONE) {
+			log_err("%s: for extended LBA, verify cannot be used when E2E data protection is enabled\n",
+				f->file_name);
+			td_verror(td, EINVAL, "fio_ioring_cmd_open_file");
+			return 1;
+		}
+	}
+	if (!ld || !o->registerfiles)
+		return generic_open_file(td, f);
+
+	/* registered files were opened up front; reuse that descriptor */
+	f->fd = ld->fds[f->engine_pos];
+	return 0;
+}
+
+
static int fio_ioring_close_file(struct thread_data *td, struct fio_file *f)
{
struct ioring_data *ld = td->io_ops_data;
return 0;
}
-static struct ioengine_ops ioengine = {
+/*
+ * close_file hook of the io_uring_cmd engine: drop the NVMe namespace data
+ * attached to the file, then either close it generically or, when files are
+ * registered with the ring, just invalidate the descriptor.
+ */
+static int fio_ioring_cmd_close_file(struct thread_data *td,
+				     struct fio_file *f)
+{
+	struct ioring_data *ld = td->io_ops_data;
+	struct ioring_options *o = td->eo;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *ns_data = FILE_ENG_DATA(f);
+
+		FILE_SET_ENG_DATA(f, NULL);
+		free(ns_data);
+	}
+
+	if (ld && o->registerfiles) {
+		f->fd = -1;
+		return 0;
+	}
+
+	return generic_close_file(td, f);
+}
+
+
+/*
+ * get_file_size hook of the io_uring_cmd engine. For NVMe command type the
+ * size is derived from the namespace information (LBA data size times number
+ * of LBAs) and that information is attached to the file as engine data; any
+ * other command type falls back to the generic implementation.
+ *
+ * Returns 0 on success, -ENOMEM if the namespace data cannot be allocated,
+ * or the error reported by fio_nvme_get_info().
+ */
+static int fio_ioring_cmd_get_file_size(struct thread_data *td,
+					struct fio_file *f)
+{
+	struct ioring_options *o = td->eo;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (o->cmd_type == FIO_URING_CMD_NVME) {
+		struct nvme_data *data = NULL;
+		__u64 nlba = 0;
+		int ret;
+
+		data = calloc(1, sizeof(struct nvme_data));
+		if (!data)
+			return -ENOMEM;
+
+		ret = fio_nvme_get_info(f, &nlba, o->pi_act, data);
+		if (ret) {
+			free(data);
+			return ret;
+		}
+
+		f->real_file_size = data->lba_size * nlba;
+		fio_file_set_size_known(f);
+
+		FILE_SET_ENG_DATA(f, data);
+		return 0;
+	}
+	return generic_get_file_size(td, f);
+}
+
+
+/* Zoned-model query: forwarded unchanged to the NVMe helper. */
+static int fio_ioring_cmd_get_zoned_model(struct thread_data *td,
+					  struct fio_file *f,
+					  enum zbd_zoned_model *model)
+{
+	int rc = fio_nvme_get_zoned_model(td, f, model);
+
+	return rc;
+}
+
+/* Zone report: forwarded unchanged to the NVMe helper. */
+static int fio_ioring_cmd_report_zones(struct thread_data *td,
+				       struct fio_file *f, uint64_t offset,
+				       struct zbd_zone *zbdz,
+				       unsigned int nr_zones)
+{
+	int rc = fio_nvme_report_zones(td, f, offset, zbdz, nr_zones);
+
+	return rc;
+}
+
+/* Write-pointer reset: forwarded unchanged to the NVMe helper. */
+static int fio_ioring_cmd_reset_wp(struct thread_data *td, struct fio_file *f,
+				   uint64_t offset, uint64_t length)
+{
+	int rc = fio_nvme_reset_wp(td, f, offset, length);
+
+	return rc;
+}
+
+/* Max-open-zones query: forwarded unchanged to the NVMe helper. */
+static int fio_ioring_cmd_get_max_open_zones(struct thread_data *td,
+					     struct fio_file *f,
+					     unsigned int *max_open_zones)
+{
+	int rc = fio_nvme_get_max_open_zones(td, f, max_open_zones);
+
+	return rc;
+}
+
+/*
+ * Fetch the reclaim unit handle status from the device and copy the
+ * placement identifiers into fruhs_info.
+ *
+ * The status buffer has room for FDP_MAX_RUHS descriptors, so the
+ * device-reported descriptor count is clamped to that bound before it is used
+ * to walk the buffer.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * error returned by fio_nvme_iomgmt_ruhs().
+ */
+static int fio_ioring_cmd_fetch_ruhs(struct thread_data *td, struct fio_file *f,
+				     struct fio_ruhs_info *fruhs_info)
+{
+	struct nvme_fdp_ruh_status *ruhs;
+	unsigned int nr, i;
+	int bytes, ret;
+
+	bytes = sizeof(*ruhs) + FDP_MAX_RUHS * sizeof(struct nvme_fdp_ruh_status_desc);
+	ruhs = scalloc(1, bytes);
+	if (!ruhs)
+		return -ENOMEM;
+
+	ret = fio_nvme_iomgmt_ruhs(td, f, ruhs, bytes);
+	if (ret)
+		goto free;
+
+	/* never trust the reported count beyond what the buffer can hold */
+	nr = le16_to_cpu(ruhs->nruhsd);
+	if (nr > FDP_MAX_RUHS)
+		nr = FDP_MAX_RUHS;
+
+	fruhs_info->nr_ruhs = nr;
+	for (i = 0; i < nr; i++)
+		fruhs_info->plis[i] = le16_to_cpu(ruhs->ruhss[i].pid);
+free:
+	sfree(ruhs);
+	return ret;
+}
+
+
+static struct ioengine_ops ioengine_uring = {
.name = "io_uring",
.version = FIO_IOOPS_VERSION,
- .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD,
+ .flags = FIO_ASYNCIO_SYNC_TRIM | FIO_NO_OFFLOAD |
+ FIO_ASYNCIO_SETS_ISSUE_TIME,
.init = fio_ioring_init,
.post_init = fio_ioring_post_init,
.io_u_init = fio_ioring_io_u_init,
.option_struct_size = sizeof(struct ioring_options),
};
+/*
+ * Engine descriptor registered under the name "io_uring_cmd". It shares the
+ * init/queue/commit/getevents/cleanup paths with the io_uring descriptor and
+ * supplies its own prep, event, file and zoned-device hooks.
+ */
+static struct ioengine_ops ioengine_uring_cmd = {
+	/* identity and capabilities */
+	.name			= "io_uring_cmd",
+	.version		= FIO_IOOPS_VERSION,
+	.flags			= FIO_NO_OFFLOAD | FIO_MEMALIGN | FIO_RAWIO |
+				  FIO_ASYNCIO_SETS_ISSUE_TIME |
+				  FIO_MULTI_RANGE_TRIM,
+	.options		= options,
+	.option_struct_size	= sizeof(struct ioring_options),
+
+	/* engine and io_u lifetime */
+	.init			= fio_ioring_init,
+	.post_init		= fio_ioring_cmd_post_init,
+	.io_u_init		= fio_ioring_io_u_init,
+	.io_u_free		= fio_ioring_io_u_free,
+	.cleanup		= fio_ioring_cleanup,
+
+	/* I/O path */
+	.prep			= fio_ioring_cmd_prep,
+	.queue			= fio_ioring_queue,
+	.commit			= fio_ioring_commit,
+	.getevents		= fio_ioring_getevents,
+	.event			= fio_ioring_cmd_event,
+
+	/* file handling */
+	.open_file		= fio_ioring_cmd_open_file,
+	.close_file		= fio_ioring_cmd_close_file,
+	.get_file_size		= fio_ioring_cmd_get_file_size,
+
+	/* zoned devices and flexible data placement */
+	.get_zoned_model	= fio_ioring_cmd_get_zoned_model,
+	.report_zones		= fio_ioring_cmd_report_zones,
+	.reset_wp		= fio_ioring_cmd_reset_wp,
+	.get_max_open_zones	= fio_ioring_cmd_get_max_open_zones,
+	.fdp_fetch_ruhs		= fio_ioring_cmd_fetch_ruhs,
+};
+
+
+/* Register both engine descriptors of this file: io_uring and io_uring_cmd. */
static void fio_init fio_ioring_register(void)
{
- register_ioengine(&ioengine);
+ register_ioengine(&ioengine_uring);
+ register_ioengine(&ioengine_uring_cmd);
}
+/* Unregister both engine descriptors of this file: io_uring and io_uring_cmd. */
static void fio_exit fio_ioring_unregister(void)
{
- unregister_ioengine(&ioengine);
+ unregister_ioengine(&ioengine_uring);
+ unregister_ioengine(&ioengine_uring_cmd);
}
#endif
#include "../lib/pow2.h"
#include "../optgroup.h"
#include "../lib/memalign.h"
+#include "cmdprio.h"
/* Should be defined in newest aio_abi.h */
#ifndef IOCB_FLAG_IOPRIO
unsigned int queued;
unsigned int head;
unsigned int tail;
+
+ struct cmdprio cmdprio;
};
+/*
+ * Engine-specific options of the libaio engine.
+ * NOTE(review): the leading pointer member presumably exists so that no
+ * option field sits at offset 0, like the 'pad' member it replaces - confirm.
+ */
struct libaio_options {
- void *pad;
+ struct thread_data *td;
unsigned int userspace_reap;
- unsigned int cmdprio_percentage;
+ /* option storage consumed by fio_cmdprio_init() */
+ struct cmdprio_options cmdprio_options;
unsigned int nowait;
};
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_LIBAIO,
},
-#ifdef FIO_HAVE_IOPRIO_CLASS
- {
- .name = "cmdprio_percentage",
- .lname = "high priority percentage",
- .type = FIO_OPT_INT,
- .off1 = offsetof(struct libaio_options, cmdprio_percentage),
- .minval = 1,
- .maxval = 100,
- .help = "Send high priority I/O this percentage of the time",
- .category = FIO_OPT_C_ENGINE,
- .group = FIO_OPT_G_LIBAIO,
- },
-#else
- {
- .name = "cmdprio_percentage",
- .lname = "high priority percentage",
- .type = FIO_OPT_UNSUPPORTED,
- .help = "Your platform does not support I/O priority classes",
- },
-#endif
{
.name = "nowait",
.lname = "RWF_NOWAIT",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_LIBAIO,
},
+ CMDPRIO_OPTIONS(struct libaio_options, FIO_OPT_G_LIBAIO),
{
.name = NULL,
},
return 0;
}
-static void fio_libaio_prio_prep(struct thread_data *td, struct io_u *io_u)
+/*
+ * Ask the shared cmdprio helper to choose a priority for this io_u; when it
+ * reports that one was set, copy io_u->ioprio into the iocb and flag the iocb
+ * as carrying an I/O priority.
+ */
+static inline void fio_libaio_cmdprio_prep(struct thread_data *td,
+ struct io_u *io_u)
{
- struct libaio_options *o = td->eo;
- if (rand_between(&td->prio_state, 0, 99) < o->cmdprio_percentage) {
- io_u->iocb.aio_reqprio = IOPRIO_CLASS_RT << IOPRIO_CLASS_SHIFT;
+ struct libaio_data *ld = td->io_ops_data;
+ struct cmdprio *cmdprio = &ld->cmdprio;
+
+ if (fio_cmdprio_set_ioprio(td, cmdprio, io_u)) {
+ io_u->iocb.aio_reqprio = io_u->ioprio;
io_u->iocb.u.c.flags |= IOCB_FLAG_IOPRIO;
- io_u->flags |= IO_U_F_PRIORITY;
}
- return;
}
static struct io_u *fio_libaio_event(struct thread_data *td, int event)
&& actual_min == 0
&& ((struct aio_ring *)(ld->aio_ctx))->magic
== AIO_RING_MAGIC) {
- r = user_io_getevents(ld->aio_ctx, max,
+ r = user_io_getevents(ld->aio_ctx, max - events,
ld->aio_events + events);
} else {
r = io_getevents(ld->aio_ctx, actual_min,
- max, ld->aio_events + events, lt);
+ max - events, ld->aio_events + events, lt);
}
- if (r > 0)
+ if (r > 0) {
events += r;
+ actual_min -= min((unsigned int)events, actual_min);
+ }
else if ((min && r == 0) || r == -EAGAIN) {
fio_libaio_commit(td);
if (actual_min)
struct io_u *io_u)
{
struct libaio_data *ld = td->io_ops_data;
- struct libaio_options *o = td->eo;
fio_ro_check(td, io_u);
return FIO_Q_COMPLETED;
}
- if (o->cmdprio_percentage)
- fio_libaio_prio_prep(td, io_u);
+ if (ld->cmdprio.mode != CMDPRIO_MODE_NONE)
+ fio_libaio_cmdprio_prep(td, io_u);
ld->iocbs[ld->head] = &io_u->iocb;
ld->io_us[ld->head] = io_u;
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
}
+
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &now, sizeof(now));
}
static int fio_libaio_commit(struct thread_data *td)
*/
if (!(td->flags & TD_F_CHILD))
io_destroy(ld->aio_ctx);
+
+ fio_cmdprio_cleanup(&ld->cmdprio);
free(ld->aio_events);
free(ld->iocbs);
free(ld->io_us);
static int fio_libaio_init(struct thread_data *td)
{
struct libaio_data *ld;
- struct thread_options *to = &td->o;
struct libaio_options *o = td->eo;
+ int ret;
ld = calloc(1, sizeof(*ld));
ld->io_us = calloc(ld->entries, sizeof(struct io_u *));
td->io_ops_data = ld;
- /*
- * Check for option conflicts
- */
- if ((fio_option_is_set(to, ioprio) || fio_option_is_set(to, ioprio_class)) &&
- o->cmdprio_percentage != 0) {
- log_err("%s: cmdprio_percentage option and mutually exclusive "
- "prio or prioclass option is set, exiting\n", to->name);
+
+ ret = fio_cmdprio_init(td, &ld->cmdprio, &o->cmdprio_options);
+ if (ret) {
td_verror(td, EINVAL, "fio_libaio_init");
return 1;
}
+
return 0;
}
FIO_STATIC struct ioengine_ops ioengine = {
.name = "libaio",
.version = FIO_IOOPS_VERSION,
- .flags = FIO_ASYNCIO_SYNC_TRIM,
+ .flags = FIO_ASYNCIO_SYNC_TRIM |
+ FIO_ASYNCIO_SETS_ISSUE_TIME,
.init = fio_libaio_init,
.post_init = fio_libaio_post_init,
.prep = fio_libaio_prep,
--- /dev/null
+/*
+ * libblkio engine
+ *
+ * IO engine using libblkio to access various block I/O interfaces:
+ * https://gitlab.com/libblkio/libblkio
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <blkio.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+#include "../options.h"
+#include "../parse.h"
+
+/*
+ * per-process state
+ *
+ * 'mutex' is taken and released through fio_blkio_proc_lock() and
+ * fio_blkio_proc_unlock().
+ * NOTE(review): the two counters presumably track how many threaded subjobs
+ * (regular and hipri) have been initialized, and 'b' the blkio instance they
+ * share - confirm against the users further down the file.
+ */
+static struct {
+ pthread_mutex_t mutex;
+ int initted_threads;
+ int initted_hipri_threads;
+ struct blkio *b;
+} proc_state = { PTHREAD_MUTEX_INITIALIZER, 0, 0, NULL };
+
+/* Take the per-process mutex; a locking failure trips an assertion. */
+static void fio_blkio_proc_lock(void)
+{
+	const int rc = pthread_mutex_lock(&proc_state.mutex);
+
+	assert(rc == 0);
+}
+
+/* Release the per-process mutex; an unlocking failure trips an assertion. */
+static void fio_blkio_proc_unlock(void)
+{
+	const int rc = pthread_mutex_unlock(&proc_state.mutex);
+
+	assert(rc == 0);
+}
+
+/*
+ * per-thread state
+ *
+ * One instance per fio thread: the libblkio queue it submits to plus the
+ * scratch arrays used for vectored requests and for collecting completions.
+ */
+struct fio_blkio_data {
+ struct blkioq *q;
+ int completion_fd; /* may be -1 if not FIO_BLKIO_WAIT_MODE_EVENTFD */
+
+ bool has_mem_region; /* whether mem_region is valid */
+ struct blkio_mem_region mem_region; /* only if allocated by libblkio */
+
+ struct iovec *iovecs; /* for vectored requests */
+ struct blkio_completion *completions;
+};
+
+/* How to wait for completions; selected with the libblkio_wait_mode option. */
+enum fio_blkio_wait_mode {
+ /* blocking blkioq_do_io() */
+ FIO_BLKIO_WAIT_MODE_BLOCK,
+ /* blocking read() on the completion eventfd */
+ FIO_BLKIO_WAIT_MODE_EVENTFD,
+ /* busy loop with non-blocking blkioq_do_io() */
+ FIO_BLKIO_WAIT_MODE_LOOP,
+};
+
+/*
+ * Engine-specific options of the libblkio engine; each member is the storage
+ * target of one entry in the options[] table.
+ */
+struct fio_blkio_options {
+ void *pad; /* option fields must not have offset 0 */
+
+ char *driver; /* libblkio_driver */
+
+ char *path; /* libblkio_path */
+ char *pre_connect_props; /* libblkio_pre_connect_props */
+
+ int num_entries; /* libblkio_num_entries */
+ int queue_size; /* libblkio_queue_size */
+ char *pre_start_props; /* libblkio_pre_start_props */
+
+ unsigned int hipri; /* hipri */
+ unsigned int vectored; /* libblkio_vectored */
+ unsigned int write_zeroes_on_trim; /* libblkio_write_zeroes_on_trim */
+ enum fio_blkio_wait_mode wait_mode; /* libblkio_wait_mode */
+ /* libblkio_force_enable_completion_eventfd */
+ unsigned int force_enable_completion_eventfd;
+};
+
+/*
+ * Option table of the libblkio engine. Every entry stores its value in the
+ * struct fio_blkio_options member named by .off1; the table ends with an
+ * entry whose .name is NULL.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "libblkio_driver",
+ .lname = "libblkio driver name",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct fio_blkio_options, driver),
+ .help = "Name of the driver to be used by libblkio",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_path",
+ .lname = "libblkio \"path\" property",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct fio_blkio_options, path),
+ .help = "Value to set the \"path\" property to",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_pre_connect_props",
+ .lname = "Additional properties to be set before blkio_connect()",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct fio_blkio_options, pre_connect_props),
+ .help = "",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_num_entries",
+ .lname = "libblkio \"num-entries\" property",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct fio_blkio_options, num_entries),
+ .help = "Value to set the \"num-entries\" property to",
+ .minval = 1,
+ .interval = 1,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_queue_size",
+ .lname = "libblkio \"queue-size\" property",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct fio_blkio_options, queue_size),
+ .help = "Value to set the \"queue-size\" property to",
+ .minval = 1,
+ .interval = 1,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_pre_start_props",
+ .lname = "Additional properties to be set before blkio_start()",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct fio_blkio_options, pre_start_props),
+ .help = "",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "hipri",
+ .lname = "Use poll queues",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct fio_blkio_options, hipri),
+ .help = "Use poll queues",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_vectored",
+ .lname = "Use blkioq_{readv,writev}()",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct fio_blkio_options, vectored),
+ .help = "Use blkioq_{readv,writev}() instead of blkioq_{read,write}()",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_write_zeroes_on_trim",
+ .lname = "Use blkioq_write_zeroes() for TRIM",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct fio_blkio_options,
+ write_zeroes_on_trim),
+ .help = "Use blkioq_write_zeroes() for TRIM instead of blkioq_discard()",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_wait_mode",
+ .lname = "How to wait for completions",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct fio_blkio_options, wait_mode),
+ .help = "How to wait for completions",
+ .def = "block",
+ .posval = {
+ { .ival = "block",
+ .oval = FIO_BLKIO_WAIT_MODE_BLOCK,
+ .help = "Blocking blkioq_do_io()",
+ },
+ { .ival = "eventfd",
+ .oval = FIO_BLKIO_WAIT_MODE_EVENTFD,
+ .help = "Blocking read() on the completion eventfd",
+ },
+ { .ival = "loop",
+ .oval = FIO_BLKIO_WAIT_MODE_LOOP,
+ .help = "Busy loop with non-blocking blkioq_do_io()",
+ },
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = "libblkio_force_enable_completion_eventfd",
+ .lname = "Force enable the completion eventfd, even if unused",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct fio_blkio_options,
+ force_enable_completion_eventfd),
+ .help = "This can impact performance",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBBLKIO,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Apply a list of "name=value" property assignments to a blkio instance.
+ *
+ * @b:        instance to configure
+ * @opt_name: name of the fio option the string came from (for error messages)
+ * @str:      property list, or NULL for "nothing to set"
+ *
+ * The string is split with get_next_str(); surrounding blanks are stripped
+ * from each name and value. Returns 0 on success and 1 on the first error
+ * (allocation failure, missing '=', empty name, or a rejected property).
+ */
+static int fio_blkio_set_props_from_str(struct blkio *b, const char *opt_name,
+					const char *str) {
+	int ret = 0;
+	char *buf, *cursor, *name, *value;
+
+	if (!str)
+		return 0;
+
+	/* iteration can mutate string, so copy it */
+	buf = strdup(str);
+	if (!buf) {
+		log_err("fio: strdup() failed\n");
+		return 1;
+	}
+
+	/*
+	 * get_next_str() is handed the cursor by address and may move it, so
+	 * iterate with a separate pointer and keep 'buf' for free().
+	 */
+	cursor = buf;
+
+	/* iterate over property name-value pairs */
+	while ((name = get_next_str(&cursor))) {
+		/* split into property name and value */
+		value = strchr(name, '=');
+		if (!value) {
+			log_err("fio: missing '=' in option %s\n", opt_name);
+			ret = 1;
+			break;
+		}
+
+		*value = '\0';
+		++value;
+
+		/* strip whitespace from property name */
+		strip_blank_front(&name);
+		strip_blank_end(name);
+
+		if (name[0] == '\0') {
+			log_err("fio: empty property name in option %s\n",
+				opt_name);
+			ret = 1;
+			break;
+		}
+
+		/* strip whitespace from property value */
+		strip_blank_front(&value);
+		strip_blank_end(value);
+
+		/* set property */
+		if (blkio_set_str(b, name, value) != 0) {
+			log_err("fio: error setting property '%s' to '%s': %s\n",
+				name, value, blkio_get_error_msg());
+			ret = 1;
+			break;
+		}
+	}
+
+	free(buf);
+	return ret;
+}
+
+/*
+ * Log the failure of a libblkio function, appending the message returned by
+ * blkio_get_error_msg().
+ *
+ * `func` is the bare function identifier; it is stringified for the message.
+ * `(void)func` is to ensure `func` exists and prevent typos
+ */
+#define fio_blkio_log_err(func) \
+ ({ \
+ (void)func; \
+ log_err("fio: %s() failed: %s\n", #func, \
+ blkio_get_error_msg()); \
+ })
+
+/*
+ * NULL-tolerant string equality: two NULL pointers compare equal, a NULL and
+ * a non-NULL pointer compare unequal, and two non-NULL pointers are compared
+ * with strcmp().
+ */
+static bool possibly_null_strs_equal(const char *a, const char *b)
+{
+	if (!a || !b)
+		return !a && !b;
+
+	return strcmp(a, b) == 0;
+}
+
+/*
+ * Count the subjobs in the whole workload that use the 'libblkio' ioengine,
+ * set the 'thread' option, and whose 'hipri' option matches the given value.
+ */
+static int total_threaded_subjobs(bool hipri)
+{
+	int nr_matching = 0;
+
+	for_each_td(td) {
+		const struct fio_blkio_options *eo = td->eo;
+		/* eo is only dereferenced for threaded libblkio subjobs */
+		const bool is_match = !strcmp(td->o.ioengine, "libblkio") &&
+				      td->o.use_thread &&
+				      (bool)eo->hipri == hipri;
+
+		if (is_match)
+			nr_matching++;
+	} end_for_each();
+
+	return nr_matching;
+}
+
+/*
+ * Snapshot of the first subjob seen with the 'thread' option: its direct
+ * setting and a copy of its engine options. fio_blkio_check_opt_compat()
+ * fills it in once ('set_up') and compares every later threaded subjob
+ * against it.
+ */
+static struct {
+ bool set_up;
+ bool direct;
+ struct fio_blkio_options opts;
+} first_threaded_subjob = { 0 };
+
+/*
+ * Report that libblkio jobs sharing a process disagree on an option.
+ * option_name is inserted verbatim into the message.
+ */
+static void fio_blkio_log_opt_compat_err(const char *option_name)
+{
+	log_err("fio: jobs using engine libblkio and sharing a process must "
+		"agree on the %s option\n", option_name);
+}
+
+/*
+ * If td represents a subjob with option 'thread', check if its options are
+ * compatible with those of other threaded subjobs that were already set up.
+ *
+ * The first threaded subjob seen has its settings recorded in
+ * first_threaded_subjob. Every later one must match it on direct/buffered
+ * mode, driver, path, num_entries, queue_size and both property strings.
+ *
+ * Returns 0 if td is not threaded or is compatible; otherwise logs which
+ * option differs and returns 1.
+ */
+static int fio_blkio_check_opt_compat(struct thread_data *td)
+{
+	const struct fio_blkio_options *options = td->eo, *prev_options;
+
+	if (!td->o.use_thread)
+		return 0; /* subjob doesn't use 'thread' */
+
+	if (!first_threaded_subjob.set_up) {
+		/* first subjob using 'thread', store options for later */
+		first_threaded_subjob.set_up = true;
+		first_threaded_subjob.direct = td->o.odirect;
+		first_threaded_subjob.opts = *options;
+		return 0;
+	}
+
+	/* not first subjob using 'thread', check option compatibility */
+	prev_options = &first_threaded_subjob.opts;
+
+	if (td->o.odirect != first_threaded_subjob.direct) {
+		fio_blkio_log_opt_compat_err("direct/buffered");
+		return 1;
+	}
+
+	/*
+	 * The driver may still be NULL here: fio_blkio_setup() runs this
+	 * check before fio_blkio_create_and_connect() rejects an unset
+	 * driver, so it must not be passed to strcmp() directly.
+	 */
+	if (!possibly_null_strs_equal(options->driver, prev_options->driver)) {
+		fio_blkio_log_opt_compat_err("libblkio_driver");
+		return 1;
+	}
+
+	if (!possibly_null_strs_equal(options->path, prev_options->path)) {
+		fio_blkio_log_opt_compat_err("libblkio_path");
+		return 1;
+	}
+
+	if (!possibly_null_strs_equal(options->pre_connect_props,
+				      prev_options->pre_connect_props)) {
+		fio_blkio_log_opt_compat_err("libblkio_pre_connect_props");
+		return 1;
+	}
+
+	if (options->num_entries != prev_options->num_entries) {
+		fio_blkio_log_opt_compat_err("libblkio_num_entries");
+		return 1;
+	}
+
+	if (options->queue_size != prev_options->queue_size) {
+		fio_blkio_log_opt_compat_err("libblkio_queue_size");
+		return 1;
+	}
+
+	if (!possibly_null_strs_equal(options->pre_start_props,
+				      prev_options->pre_start_props)) {
+		fio_blkio_log_opt_compat_err("libblkio_pre_start_props");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Create a blkio instance for td's driver, apply the settings that precede
+ * connecting, connect it, then apply the settings that follow connecting.
+ * blkio_start() is not called here.
+ *
+ * Returns 0 and stores the instance in *out_blkio on success. Returns 1 on
+ * failure, after destroying any instance that was created; *out_blkio is not
+ * written in that case.
+ */
+static int fio_blkio_create_and_connect(struct thread_data *td,
+					struct blkio **out_blkio)
+{
+	const struct fio_blkio_options *opts = td->eo;
+	struct blkio *blk;
+	int rc;
+
+	if (!opts->driver) {
+		log_err("fio: engine libblkio requires option libblkio_driver to be set\n");
+		return 1;
+	}
+
+	if (blkio_create(opts->driver, &blk) != 0) {
+		fio_blkio_log_err(blkio_create);
+		return 1;
+	}
+
+	/* -ENOENT is tolerated: the driver may have no "direct" property */
+	rc = blkio_set_bool(blk, "direct", td->o.odirect);
+	if (rc != 0 && rc != -ENOENT) {
+		fio_blkio_log_err(blkio_set_bool);
+		goto err_blkio_destroy;
+	}
+
+	if (blkio_set_bool(blk, "read-only", read_only) != 0) {
+		fio_blkio_log_err(blkio_set_bool);
+		goto err_blkio_destroy;
+	}
+
+	if (opts->path && blkio_set_str(blk, "path", opts->path) != 0) {
+		fio_blkio_log_err(blkio_set_str);
+		goto err_blkio_destroy;
+	}
+
+	if (fio_blkio_set_props_from_str(blk, "libblkio_pre_connect_props",
+					 opts->pre_connect_props) != 0)
+		goto err_blkio_destroy;
+
+	if (blkio_connect(blk) != 0) {
+		fio_blkio_log_err(blkio_connect);
+		goto err_blkio_destroy;
+	}
+
+	/* a value of 0 leaves the corresponding property untouched */
+	if (opts->num_entries != 0 &&
+	    blkio_set_int(blk, "num-entries", opts->num_entries) != 0) {
+		fio_blkio_log_err(blkio_set_int);
+		goto err_blkio_destroy;
+	}
+
+	if (opts->queue_size != 0 &&
+	    blkio_set_int(blk, "queue-size", opts->queue_size) != 0) {
+		fio_blkio_log_err(blkio_set_int);
+		goto err_blkio_destroy;
+	}
+
+	if (fio_blkio_set_props_from_str(blk, "libblkio_pre_start_props",
+					 opts->pre_start_props) != 0)
+		goto err_blkio_destroy;
+
+	*out_blkio = blk;
+	return 0;
+
+err_blkio_destroy:
+	blkio_destroy(&blk);
+	return 1;
+}
+
+/*
+ * Set by fio_blkio_setup() when a threaded subjob fails the option
+ * compatibility check; fio_blkio_init() then fails every threaded subjob.
+ */
+static bool incompatible_threaded_subjob_options = false;
+
+/*
+ * Determine the device/file size. Doing so requires a connected blkio
+ * instance, so one is created here. However, this callback is invoked from
+ * the main thread of the original fio process rather than from the processes
+ * in which jobs will actually run, so the instance is destroyed again before
+ * returning and is recreated in the init() callback.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int fio_blkio_setup(struct thread_data *td)
+{
+	const struct fio_blkio_options *opts = td->eo;
+	struct blkio *blk;
+	uint64_t capacity;
+	int ret;
+
+	assert(td->files_index == 1);
+
+	if (fio_blkio_check_opt_compat(td) != 0) {
+		incompatible_threaded_subjob_options = true;
+		return 1;
+	}
+
+	if (opts->hipri) {
+		if (opts->wait_mode == FIO_BLKIO_WAIT_MODE_EVENTFD) {
+			log_err("fio: option hipri is incompatible with option libblkio_wait_mode=eventfd\n");
+			return 1;
+		}
+
+		if (opts->force_enable_completion_eventfd) {
+			log_err("fio: option hipri is incompatible with option libblkio_force_enable_completion_eventfd\n");
+			return 1;
+		}
+	}
+
+	if (fio_blkio_create_and_connect(td, &blk) != 0)
+		return 1;
+
+	if (blkio_get_uint64(blk, "capacity", &capacity) == 0) {
+		td->files[0]->real_file_size = capacity;
+		fio_file_set_size_known(td->files[0]);
+		ret = 0;
+	} else {
+		fio_blkio_log_err(blkio_get_uint64);
+		ret = 1;
+	}
+
+	/* the instance was only needed to query the capacity */
+	blkio_destroy(&blk);
+	return ret;
+}
+
+/*
+ * Per-subjob initialization.
+ *
+ * Allocates this subjob's fio_blkio_data and its iovec/completion arrays
+ * (one slot per unit of iodepth). Then, under the process lock: if no subjob
+ * has been initialized in this process yet (proc_state.initted_threads == 0),
+ * creates, configures and starts the shared instance proc_state.b; picks this
+ * subjob's queue from it; optionally enables the queue's completion fd and
+ * clears O_NONBLOCK on it; bumps the proc_state counters; and finally stores
+ * the data in td->io_ops_data.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int fio_blkio_init(struct thread_data *td)
+{
+ const struct fio_blkio_options *options = td->eo;
+ struct fio_blkio_data *data;
+ int flags;
+
+ if (td->o.use_thread && incompatible_threaded_subjob_options) {
+ /*
+ * Different subjobs using option 'thread' specified
+ * incompatible options. We don't know which configuration
+ * should win, so we just fail all such subjobs.
+ */
+ return 1;
+ }
+
+ /*
+ * Request enqueueing is fast, and it's not possible to know exactly
+ * when a request is submitted, so never report submission latencies.
+ */
+ td->o.disable_slat = 1;
+
+ data = calloc(1, sizeof(*data));
+ if (!data) {
+ log_err("fio: calloc() failed\n");
+ return 1;
+ }
+
+ data->iovecs = calloc(td->o.iodepth, sizeof(data->iovecs[0]));
+ data->completions = calloc(td->o.iodepth, sizeof(data->completions[0]));
+ if (!data->iovecs || !data->completions) {
+ log_err("fio: calloc() failed\n");
+ goto err_free;
+ }
+
+ fio_blkio_proc_lock();
+
+ if (proc_state.initted_threads == 0) {
+ /* initialize per-process blkio */
+ int num_queues, num_poll_queues;
+
+ if (td->o.use_thread) {
+ /* one queue per threaded libblkio subjob in the workload */
+ num_queues = total_threaded_subjobs(false);
+ num_poll_queues = total_threaded_subjobs(true);
+ } else {
+ /* exactly one queue, of the kind selected by hipri */
+ num_queues = options->hipri ? 0 : 1;
+ num_poll_queues = options->hipri ? 1 : 0;
+ }
+
+ if (fio_blkio_create_and_connect(td, &proc_state.b) != 0)
+ goto err_unlock;
+
+ if (blkio_set_int(proc_state.b, "num-queues",
+ num_queues) != 0) {
+ fio_blkio_log_err(blkio_set_int);
+ goto err_blkio_destroy;
+ }
+
+ if (blkio_set_int(proc_state.b, "num-poll-queues",
+ num_poll_queues) != 0) {
+ fio_blkio_log_err(blkio_set_int);
+ goto err_blkio_destroy;
+ }
+
+ if (blkio_start(proc_state.b) != 0) {
+ fio_blkio_log_err(blkio_start);
+ goto err_blkio_destroy;
+ }
+ }
+
+ /* queue index = number of subjobs of the same kind initialized so far */
+ if (options->hipri) {
+ int i = proc_state.initted_hipri_threads;
+ data->q = blkio_get_poll_queue(proc_state.b, i);
+ } else {
+ int i = proc_state.initted_threads -
+ proc_state.initted_hipri_threads;
+ data->q = blkio_get_queue(proc_state.b, i);
+ }
+
+ if (options->wait_mode == FIO_BLKIO_WAIT_MODE_EVENTFD ||
+ options->force_enable_completion_eventfd) {
+ /* enable completion fd and make it blocking */
+ blkioq_set_completion_fd_enabled(data->q, true);
+ data->completion_fd = blkioq_get_completion_fd(data->q);
+
+ flags = fcntl(data->completion_fd, F_GETFL);
+ if (flags < 0) {
+ log_err("fio: fcntl(F_GETFL) failed: %s\n",
+ strerror(errno));
+ goto err_blkio_destroy;
+ }
+
+ if (fcntl(data->completion_fd, F_SETFL,
+ flags & ~O_NONBLOCK) != 0) {
+ log_err("fio: fcntl(F_SETFL) failed: %s\n",
+ strerror(errno));
+ goto err_blkio_destroy;
+ }
+ } else {
+ data->completion_fd = -1;
+ }
+
+ ++proc_state.initted_threads;
+ if (options->hipri)
+ ++proc_state.initted_hipri_threads;
+
+ /* Set data last so cleanup() does nothing if init() fails. */
+ td->io_ops_data = data;
+
+ fio_blkio_proc_unlock();
+
+ return 0;
+
+ /* the shared blkio is only torn down if no subjob has been counted yet */
+err_blkio_destroy:
+ if (proc_state.initted_threads == 0)
+ blkio_destroy(&proc_state.b);
+err_unlock:
+ if (proc_state.initted_threads == 0)
+ proc_state.b = NULL;
+ fio_blkio_proc_unlock();
+err_free:
+ free(data->completions);
+ free(data->iovecs);
+ free(data);
+ return 1;
+}
+
+/*
+ * Make sure the subjob's I/O buffer is registered with the blkio instance as
+ * a memory region. Nothing is done when data->has_mem_region is already set,
+ * which fio_blkio_iomem_alloc() does after mapping its own region.
+ *
+ * Returns 0 on success and 1 if mapping the region fails.
+ */
+static int fio_blkio_post_init(struct thread_data *td)
+{
+	struct fio_blkio_data *data = td->io_ops_data;
+
+	if (!data->has_mem_region) {
+		/*
+		 * Memory was allocated by the fio core and not iomem_alloc(),
+		 * so we need to register it as a memory region here.
+		 *
+		 * `td->orig_buffer_size` is computed like `len` below, but then
+		 * fio can add some padding to it to make sure it is
+		 * sufficiently aligned to the page size and the mem_align
+		 * option. However, this can make it become unaligned to the
+		 * "mem-region-alignment" property in ways that the user can't
+		 * control, so we essentially recompute `td->orig_buffer_size`
+		 * here but without adding that padding.
+		 */
+
+		unsigned long long max_block_size;
+		struct blkio_mem_region region;
+
+		max_block_size = max(td->o.max_bs[DDIR_READ],
+				     max(td->o.max_bs[DDIR_WRITE],
+					 td->o.max_bs[DDIR_TRIM]));
+
+		region = (struct blkio_mem_region) {
+			.addr	= td->orig_buffer,
+			.len	= (size_t)max_block_size *
+					(size_t)td->o.iodepth,
+			.fd	= -1,
+		};
+
+		/* pass the address of the local descriptor built above */
+		if (blkio_map_mem_region(proc_state.b, &region) != 0) {
+			fio_blkio_log_err(blkio_map_mem_region);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release one subjob's engine data and drop its reference on the per-process
+ * blkio instance.
+ *
+ * Subjobs from different jobs can be terminated at different times, so this
+ * callback may be invoked for one subjob while another is still doing I/O.
+ * Those subjobs may share the process, so the blkio is only destroyed once
+ * the last subjob in the process cleans up.
+ */
+static void fio_blkio_cleanup(struct thread_data *td)
+{
+	struct fio_blkio_data *data = td->io_ops_data;
+
+	/* nothing was stored in td->io_ops_data: nothing to undo */
+	if (!data)
+		return;
+
+	free(data->completions);
+	free(data->iovecs);
+	free(data);
+
+	fio_blkio_proc_lock();
+	proc_state.initted_threads--;
+	if (proc_state.initted_threads == 0) {
+		blkio_destroy(&proc_state.b);
+		proc_state.b = NULL;
+	}
+	fio_blkio_proc_unlock();
+}
+
+/* Round x up to the next multiple of y. Both arguments are evaluated twice. */
+#define align_up(x, y) ((((x) + (y) - 1) / (y)) * (y))
+
+/*
+ * Allocate the subjob's I/O buffer as a libblkio memory region and map it.
+ *
+ * The requested size is first rounded up to the instance's
+ * "mem-region-alignment" property. On success td->orig_buffer points at the
+ * region and data->has_mem_region is set.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int fio_blkio_iomem_alloc(struct thread_data *td, size_t size)
+{
+	struct fio_blkio_data *data = td->io_ops_data;
+	uint64_t alignment;
+	int ret = 1;
+
+	if (blkio_get_uint64(proc_state.b, "mem-region-alignment",
+			     &alignment) != 0) {
+		fio_blkio_log_err(blkio_get_uint64);
+		return 1;
+	}
+
+	/* round up size to satisfy mem-region-alignment */
+	size = align_up(size, (size_t)alignment);
+
+	fio_blkio_proc_lock();
+
+	if (blkio_alloc_mem_region(proc_state.b, &data->mem_region,
+				   size) != 0) {
+		fio_blkio_log_err(blkio_alloc_mem_region);
+	} else if (blkio_map_mem_region(proc_state.b,
+					&data->mem_region) != 0) {
+		fio_blkio_log_err(blkio_map_mem_region);
+		/* give back the region that could not be mapped */
+		blkio_free_mem_region(proc_state.b, &data->mem_region);
+	} else {
+		td->orig_buffer = data->mem_region.addr;
+		data->has_mem_region = true;
+		ret = 0;
+	}
+
+	fio_blkio_proc_unlock();
+	return ret;
+}
+
+/*
+ * Unmap and free the memory region set up by fio_blkio_iomem_alloc(). Does
+ * nothing if the subjob has no engine data or no such region.
+ */
+static void fio_blkio_iomem_free(struct thread_data *td)
+{
+	struct fio_blkio_data *data = td->io_ops_data;
+
+	if (!data || !data->has_mem_region)
+		return;
+
+	fio_blkio_proc_lock();
+	blkio_unmap_mem_region(proc_state.b, &data->mem_region);
+	blkio_free_mem_region(proc_state.b, &data->mem_region);
+	fio_blkio_proc_unlock();
+
+	data->has_mem_region = false;
+}
+
+/* open_file callback: performs no work and always reports success. */
+static int fio_blkio_open_file(struct thread_data *td, struct fio_file *f)
+{
+ return 0;
+}
+
+/* Point the iovec slot belonging to io_u at its transfer buffer. */
+static struct iovec *fio_blkio_io_u_iovec(struct fio_blkio_data *data,
+					  struct io_u *io_u)
+{
+	struct iovec *iov = &data->iovecs[io_u->index];
+
+	iov->iov_base = io_u->xfer_buf;
+	iov->iov_len = (size_t)io_u->xfer_buflen;
+
+	return iov;
+}
+
+/*
+ * Enqueue one io_u on the subjob's queue.
+ *
+ * Reads and writes use the single-element vectored calls when the vectored
+ * option is set and the plain calls otherwise. Trims become write-zeroes
+ * requests when write_zeroes_on_trim is set and discards otherwise. Both
+ * sync directions become a flush. The io_u itself is passed as the request's
+ * user data.
+ *
+ * Returns FIO_Q_QUEUED, or FIO_Q_COMPLETED with io_u->error = ENOTSUP for any
+ * other direction.
+ */
+static enum fio_q_status fio_blkio_queue(struct thread_data *td,
+					 struct io_u *io_u)
+{
+	const struct fio_blkio_options *opts = td->eo;
+	struct fio_blkio_data *data = td->io_ops_data;
+
+	fio_ro_check(td, io_u);
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		if (opts->vectored)
+			blkioq_readv(data->q, io_u->offset,
+				     fio_blkio_io_u_iovec(data, io_u), 1,
+				     io_u, 0);
+		else
+			blkioq_read(data->q, io_u->offset, io_u->xfer_buf,
+				    (size_t)io_u->xfer_buflen, io_u, 0);
+		break;
+	case DDIR_WRITE:
+		if (opts->vectored)
+			blkioq_writev(data->q, io_u->offset,
+				      fio_blkio_io_u_iovec(data, io_u), 1,
+				      io_u, 0);
+		else
+			blkioq_write(data->q, io_u->offset, io_u->xfer_buf,
+				     (size_t)io_u->xfer_buflen, io_u, 0);
+		break;
+	case DDIR_TRIM:
+		if (opts->write_zeroes_on_trim)
+			blkioq_write_zeroes(data->q, io_u->offset,
+					    io_u->xfer_buflen, io_u, 0);
+		else
+			blkioq_discard(data->q, io_u->offset,
+				       io_u->xfer_buflen, io_u, 0);
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+		blkioq_flush(data->q, io_u, 0);
+		break;
+	default:
+		io_u->error = ENOTSUP;
+		io_u_log_error(td, io_u);
+		return FIO_Q_COMPLETED;
+	}
+
+	return FIO_Q_QUEUED;
+}
+
+/*
+ * Collect between min and max completions into data->completions, using the
+ * strategy selected by the libblkio_wait_mode option:
+ *
+ *  - block:   a single blkioq_do_io() call that is handed min and max;
+ *  - eventfd: non-waiting blkioq_do_io() calls (minimum 0), with a read() on
+ *             the completion fd before each retry while fewer than min
+ *             completions have been gathered;
+ *  - loop:    non-waiting blkioq_do_io() calls repeated until min
+ *             completions have been gathered.
+ *
+ * The timeout argument t is not used. Returns the number of completions
+ * gathered, or -1 on failure or for an unknown wait mode.
+ */
+static int fio_blkio_getevents(struct thread_data *td, unsigned int min,
+			       unsigned int max, const struct timespec *t)
+{
+	const struct fio_blkio_options *opts = td->eo;
+	struct fio_blkio_data *data = td->io_ops_data;
+	const int want = (int)min;
+	const int room = (int)max;
+	uint64_t counter;
+	int got, rc;
+
+	switch (opts->wait_mode) {
+	case FIO_BLKIO_WAIT_MODE_BLOCK:
+		got = blkioq_do_io(data->q, data->completions, want, room,
+				   NULL);
+		if (got < 0) {
+			fio_blkio_log_err(blkioq_do_io);
+			return -1;
+		}
+		return got;
+
+	case FIO_BLKIO_WAIT_MODE_EVENTFD:
+		got = blkioq_do_io(data->q, data->completions, 0, room, NULL);
+		if (got < 0) {
+			fio_blkio_log_err(blkioq_do_io);
+			return -1;
+		}
+
+		while (got < want) {
+			rc = read(data->completion_fd, &counter,
+				  sizeof(counter));
+			if (rc != sizeof(counter)) {
+				log_err("fio: read() on the completion fd returned %d\n",
+					rc);
+				return -1;
+			}
+
+			/* append after the completions already gathered */
+			rc = blkioq_do_io(data->q, data->completions + got, 0,
+					  room - got, NULL);
+			if (rc < 0) {
+				fio_blkio_log_err(blkioq_do_io);
+				return -1;
+			}
+
+			got += rc;
+		}
+		return got;
+
+	case FIO_BLKIO_WAIT_MODE_LOOP:
+		got = 0;
+		while (got < want) {
+			rc = blkioq_do_io(data->q, data->completions + got, 0,
+					  room - got, NULL);
+			if (rc < 0) {
+				fio_blkio_log_err(blkioq_do_io);
+				return -1;
+			}
+
+			got += rc;
+		}
+		return got;
+
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Return the io_u behind the event-th completion gathered by the last
+ * getevents() call, after storing the negated completion result in its error
+ * field.
+ */
+static struct io_u *fio_blkio_event(struct thread_data *td, int event)
+{
+	struct fio_blkio_data *data = td->io_ops_data;
+	const struct blkio_completion *done = data->completions + event;
+	struct io_u *io_u = done->user_data;
+
+	io_u->error = -done->ret;
+	return io_u;
+}
+
+/*
+ * Engine descriptor for "libblkio": ties the engine name, its flags, its
+ * option table and the callbacks defined in this file together. It is handed
+ * to register_ioengine()/unregister_ioengine() below.
+ */
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "libblkio",
+ .version = FIO_IOOPS_VERSION,
+ .flags = FIO_DISKLESSIO | FIO_NOEXTEND |
+ FIO_NO_OFFLOAD | FIO_SKIPPABLE_IOMEM_ALLOC,
+
+ .setup = fio_blkio_setup,
+ .init = fio_blkio_init,
+ .post_init = fio_blkio_post_init,
+ .cleanup = fio_blkio_cleanup,
+
+ .iomem_alloc = fio_blkio_iomem_alloc,
+ .iomem_free = fio_blkio_iomem_free,
+
+ .open_file = fio_blkio_open_file,
+
+ .queue = fio_blkio_queue,
+ .getevents = fio_blkio_getevents,
+ .event = fio_blkio_event,
+
+ .options = options,
+ .option_struct_size = sizeof(struct fio_blkio_options),
+};
+
+/* Register the libblkio engine descriptor with fio. */
+static void fio_init fio_blkio_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+/* Remove the libblkio engine descriptor from fio again. */
+static void fio_exit fio_blkio_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
--- /dev/null
+/*
+ * Copyright (c)2020 System Fabric Works, Inc. All Rights Reserved.
+ * mailto:info@systemfabricworks.com
+ *
+ * License: GPLv2, see COPYING.
+ *
+ * libcufile engine
+ *
+ * fio I/O engine using the NVIDIA cuFile API.
+ *
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <cufile.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <pthread.h>
+
+#include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
+#include "../lib/memalign.h"
+
+#define ALIGNED_4KB(v) (((v) & 0x0fff) == 0) /* true when the low 12 bits of v are zero */
+
+#define LOGGED_BUFLEN_NOT_ALIGNED 0x01 /* bit in libcufile_options.logged */
+#define LOGGED_GPU_OFFSET_NOT_ALIGNED 0x02 /* bit in libcufile_options.logged */
+#define GPU_ID_SEP ":" /* separator between the entries of gpu_dev_ids */
+
+enum { /* values stored in libcufile_options.cuda_io */
+ IO_CUFILE = 1, /* selected by cuda_io=cufile */
+ IO_POSIX = 2 /* selected by cuda_io=posix */
+};
+
+/*
+ * Per-job options and working state of the libcufile engine. gpu_ids and
+ * cuda_io are the targets of the option table below; my_gpu_id is assigned in
+ * fio_libcufile_init(), and cu_mem_ptr, junk_buf, total_mem and logged are
+ * (re)initialized in fio_libcufile_iomem_alloc().
+ */
+struct libcufile_options {
+ struct thread_data *td;
+ char *gpu_ids; /* colon-separated list of GPU ids,
+ one per job */
+ void *cu_mem_ptr; /* GPU memory */
+ void *junk_buf; /* buffer to simulate cudaMemcpy with
+ posix I/O write */
+ int my_gpu_id; /* GPU id to use for this job */
+ unsigned int cuda_io; /* Type of I/O to use with CUDA */
+ size_t total_mem; /* size for cu_mem_ptr and junk_buf */
+ int logged; /* bitmask of log messages that have
+ been output, prevent flood */
+};
+
+/*
+ * Per-file engine data, allocated in fio_libcufile_open_file() when
+ * cuda_io=cufile and released in fio_libcufile_close_file().
+ */
+struct fio_libcufile_data {
+ CUfileDescr_t cf_descr; /* descriptor filled with the file's fd */
+ CUfileHandle_t cf_handle; /* handle produced by cuFileHandleRegister() */
+};
+
+/*
+ * Engine-specific options: gpu_dev_ids (stored as a string in gpu_ids) and
+ * cuda_io (one of the IO_* values, default "cufile"). The entry with a NULL
+ * name terminates the table.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "gpu_dev_ids",
+ .lname = "libcufile engine gpu dev ids",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct libcufile_options, gpu_ids),
+ .help = "GPU IDs, one per subjob, separated by " GPU_ID_SEP,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = "cuda_io",
+ .lname = "libcufile cuda io",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct libcufile_options, cuda_io),
+ .help = "Type of I/O to use with CUDA",
+ .def = "cufile",
+ .posval = {
+ { .ival = "cufile",
+ .oval = IO_CUFILE,
+ .help = "libcufile nvidia-fs"
+ },
+ { .ival = "posix",
+ .oval = IO_POSIX,
+ .help = "POSIX I/O"
+ }
+ },
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+static int running = 0; /* incremented by init(), decremented by cleanup() */
+static int cufile_initialized = 0; /* 1 after cuFileDriverOpen() succeeded */
+static pthread_mutex_t running_lock = PTHREAD_MUTEX_INITIALIZER; /* held while touching the two counters above */
+
+/*
+ * Evaluate the CUDA runtime call fn once and store 0 in rc if it returned
+ * cudaSuccess. Otherwise log the call text, the line number and the error
+ * code and name, and store -1 in rc.
+ */
+#define check_cudaruntimecall(fn, rc) \
+	do { \
+		cudaError_t res = (fn); \
+		if (res == cudaSuccess) { \
+			(rc) = 0; \
+		} else { \
+			const char *str = cudaGetErrorName(res); \
+			log_err("cuda runtime api call failed %s:%d : err=%d:%s\n", \
+				#fn, __LINE__, res, str); \
+			(rc) = -1; \
+		} \
+	} while (0)
+
+/*
+ * Return the text cufileop_status_error() gives for st.err when
+ * IS_CUFILE_ERR() accepts that code, and "unknown" otherwise.
+ */
+static const char *fio_libcufile_get_cuda_error(CUfileError_t st)
+{
+	return IS_CUFILE_ERR(st.err) ? cufileop_status_error(st.err)
+				     : "unknown";
+}
+
+/*
+ * Assign a GPU to the subjob round-robin, similar to how multiple entries in
+ * 'directory' are handled by fio: the subjob number modulo the number of
+ * entries in gpu_dev_ids selects the entry, which is converted with atoi().
+ *
+ * Returns the GPU id, 0 when gpu_dev_ids is not set, or -1 if the option
+ * string could not be duplicated.
+ */
+static int fio_libcufile_find_gpu_id(struct thread_data *td)
+{
+	struct libcufile_options *o = td->eo;
+	char *copy, *pos, *cur;
+	int i, id_count, gpu_idx;
+	int gpu_id = 0;
+
+	if (o->gpu_ids == NULL)
+		return gpu_id;
+
+	/* entries = separators + 1 */
+	id_count = 0;
+	cur = o->gpu_ids;
+	while (cur != NULL) {
+		cur = strchr(cur, GPU_ID_SEP[0]);
+		if (cur != NULL)
+			cur++;
+		id_count++;
+	}
+
+	gpu_idx = td->subjob_number % id_count;
+
+	/* strsep() modifies its input, so tokenize a private copy */
+	copy = strdup(o->gpu_ids);
+	if (copy == NULL) {
+		log_err("strdup(gpu_ids): err=%d\n", errno);
+		return -1;
+	}
+
+	/* step through the list until entry number gpu_idx has been read */
+	pos = copy;
+	for (i = 0; pos != NULL && i <= gpu_idx; i++)
+		cur = strsep(&pos, GPU_ID_SEP);
+
+	if (cur)
+		gpu_id = atoi(cur);
+
+	free(copy);
+
+	return gpu_id;
+}
+
+/*
+ * Per-subjob initialization: the first caller opens the cuFile driver when
+ * cuda_io=cufile, every caller is counted in 'running', and then the subjob's
+ * GPU is chosen and made current with cudaSetDevice().
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int fio_libcufile_init(struct thread_data *td)
+{
+	struct libcufile_options *o = td->eo;
+	const int want_cufile = (o->cuda_io == IO_CUFILE);
+	CUfileError_t status;
+	int driver_ready;
+	int rc;
+
+	pthread_mutex_lock(&running_lock);
+	if (running == 0) {
+		assert(cufile_initialized == 0);
+		/* only open the driver if this is the first worker thread */
+		if (want_cufile) {
+			status = cuFileDriverOpen();
+			if (status.err == CU_FILE_SUCCESS)
+				cufile_initialized = 1;
+			else
+				log_err("cuFileDriverOpen: err=%d:%s\n", status.err,
+					fio_libcufile_get_cuda_error(status));
+		}
+	}
+	running++;
+	/* snapshot the flag while the lock is still held */
+	driver_ready = cufile_initialized;
+	pthread_mutex_unlock(&running_lock);
+
+	if (want_cufile && !driver_ready)
+		return 1;
+
+	o->my_gpu_id = fio_libcufile_find_gpu_id(td);
+	if (o->my_gpu_id < 0)
+		return 1;
+
+	dprint(FD_MEM, "Subjob %d uses GPU %d\n", td->subjob_number, o->my_gpu_id);
+	check_cudaruntimecall(cudaSetDevice(o->my_gpu_id), rc);
+
+	return rc != 0 ? 1 : 0;
+}
+
+/*
+ * Prepare a write before the data is transferred to the file.
+ *
+ * cuda_io=cufile: when verify is enabled, copy the io_u buffer into GPU
+ * memory at gpu_offset so the verifiable pattern is what gets written.
+ * cuda_io=posix: copy io_u->xfer_buflen bytes from GPU memory into the junk
+ * buffer, purely to incur the cudaMemcpy() cost.
+ *
+ * Returns 0 on success. If a copy fails, returns -1 and sets io_u->error to
+ * EIO. An unknown cuda_io value trips an assertion and returns EINVAL.
+ */
+static inline int fio_libcufile_pre_write(struct thread_data *td,
+					  struct libcufile_options *o,
+					  struct io_u *io_u,
+					  size_t gpu_offset)
+{
+	int rc = 0;
+
+	if (o->cuda_io == IO_CUFILE) {
+		if (td->o.verify) {
+			/*
+			  Data is being verified, copy the io_u buffer to GPU memory.
+			  This isn't done in the non-verify case because the data would
+			  already be in GPU memory in a normal cuFile application.
+			*/
+			check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+							 io_u->xfer_buf,
+							 io_u->xfer_buflen,
+							 cudaMemcpyHostToDevice), rc);
+			if (rc != 0) {
+				log_err("DDIR_WRITE cudaMemcpy H2D failed\n");
+				io_u->error = EIO;
+			}
+		}
+	} else if (o->cuda_io == IO_POSIX) {
+
+		/*
+		  POSIX I/O is being used, the data has to be copied out of the
+		  GPU into a CPU buffer. GPU memory doesn't contain the actual
+		  data to write, copy the data to the junk buffer. The purpose
+		  of this is to add the overhead of cudaMemcpy() that would be
+		  present in a POSIX I/O CUDA application.
+		*/
+		/* junk_buf is a void *, so cast it before offsetting */
+		check_cudaruntimecall(cudaMemcpy(((char*) o->junk_buf) + gpu_offset,
+						 ((char*) o->cu_mem_ptr) + gpu_offset,
+						 io_u->xfer_buflen,
+						 cudaMemcpyDeviceToHost), rc);
+		if (rc != 0) {
+			log_err("DDIR_WRITE cudaMemcpy D2H failed\n");
+			io_u->error = EIO;
+		}
+	} else {
+		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+		assert(0);
+		rc = EINVAL;
+	}
+
+	return rc;
+}
+
+/*
+ * Finish a read after the data has been transferred from the file.
+ *
+ * cuda_io=cufile: when verify is enabled, copy the data from GPU memory at
+ * gpu_offset into the io_u buffer. cuda_io=posix: copy the io_u buffer into
+ * GPU memory at gpu_offset.
+ *
+ * Returns 0 on success. If a copy fails, returns -1 and sets io_u->error to
+ * EIO. An unknown cuda_io value trips an assertion and returns EINVAL.
+ */
+static inline int fio_libcufile_post_read(struct thread_data *td,
+					  struct libcufile_options *o,
+					  struct io_u *io_u,
+					  size_t gpu_offset)
+{
+	int rc = 0;
+
+	switch (o->cuda_io) {
+	case IO_CUFILE:
+		if (!td->o.verify)
+			break;
+
+		/* Copy GPU memory to CPU buffer for verify */
+		check_cudaruntimecall(cudaMemcpy(io_u->xfer_buf,
+						 ((char*) o->cu_mem_ptr) + gpu_offset,
+						 io_u->xfer_buflen,
+						 cudaMemcpyDeviceToHost), rc);
+		if (rc != 0) {
+			log_err("DDIR_READ cudaMemcpy D2H failed\n");
+			io_u->error = EIO;
+		}
+		break;
+
+	case IO_POSIX:
+		/* POSIX I/O read, copy the CPU buffer to GPU memory */
+		check_cudaruntimecall(cudaMemcpy(((char*) o->cu_mem_ptr) + gpu_offset,
+						 io_u->xfer_buf,
+						 io_u->xfer_buflen,
+						 cudaMemcpyHostToDevice), rc);
+		if (rc != 0) {
+			log_err("DDIR_READ cudaMemcpy H2D failed\n");
+			io_u->error = EIO;
+		}
+		break;
+
+	default:
+		log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+		assert(0);
+		rc = EINVAL;
+		break;
+	}
+
+	return rc;
+}
+
+/*
+ * Synchronously execute one io_u.
+ *
+ * DDIR_SYNC and DDIR_DATASYNC call fsync()/fdatasync() on the file's fd.
+ * DDIR_READ and DDIR_WRITE move io_u->xfer_buflen bytes at io_u->offset,
+ * either between the file and GPU memory with cuFileRead()/cuFileWrite()
+ * (cuda_io=cufile) or between the file and io_u->xfer_buf with
+ * pread()/pwrite() (cuda_io=posix), continuing after short transfers until
+ * everything has been moved. fio_libcufile_pre_write() runs before a write
+ * and fio_libcufile_post_read() after a successful read. Any other direction
+ * fails with EINVAL.
+ *
+ * A transfer call that moves zero bytes fails the io_u with EIO, since
+ * retrying it could never complete the request.
+ *
+ * Always returns FIO_Q_COMPLETED; failures are reported through io_u->error
+ * and td_verror().
+ */
+static enum fio_q_status fio_libcufile_queue(struct thread_data *td,
+					     struct io_u *io_u)
+{
+	struct libcufile_options *o = td->eo;
+	struct fio_libcufile_data *fcd = FILE_ENG_DATA(io_u->file);
+	unsigned long long io_offset;
+	ssize_t sz;
+	ssize_t remaining;
+	size_t xfered;
+	size_t gpu_offset;
+	int rc;
+
+	/* cuFile I/O needs the handle registered by open_file() */
+	if (o->cuda_io == IO_CUFILE && fcd == NULL) {
+		io_u->error = EINVAL;
+		td_verror(td, EINVAL, "xfer");
+		return FIO_Q_COMPLETED;
+	}
+
+	fio_ro_check(td, io_u);
+
+	switch(io_u->ddir) {
+	case DDIR_SYNC:
+		rc = fsync(io_u->file->fd);
+		if (rc != 0) {
+			io_u->error = errno;
+			log_err("fsync: err=%d\n", errno);
+		}
+		break;
+
+	case DDIR_DATASYNC:
+		rc = fdatasync(io_u->file->fd);
+		if (rc != 0) {
+			io_u->error = errno;
+			log_err("fdatasync: err=%d\n", errno);
+		}
+		break;
+
+	case DDIR_READ:
+	case DDIR_WRITE:
+		/*
+		  There may be a better way to calculate gpu_offset. The intent is
+		  that gpu_offset equals the difference between io_u->xfer_buf and
+		  the page-aligned base address for io_u buffers.
+		*/
+		gpu_offset = io_u->index * io_u->xfer_buflen;
+		io_offset = io_u->offset;
+		remaining = io_u->xfer_buflen;
+
+		xfered = 0;
+		sz = 0;
+
+		assert(gpu_offset + io_u->xfer_buflen <= o->total_mem);
+
+		if (o->cuda_io == IO_CUFILE) {
+			/* each alignment complaint is logged once per job */
+			if (!(ALIGNED_4KB(io_u->xfer_buflen) ||
+			      (o->logged & LOGGED_BUFLEN_NOT_ALIGNED))) {
+				log_err("buflen not 4KB-aligned: %llu\n", io_u->xfer_buflen);
+				o->logged |= LOGGED_BUFLEN_NOT_ALIGNED;
+			}
+
+			if (!(ALIGNED_4KB(gpu_offset) ||
+			      (o->logged & LOGGED_GPU_OFFSET_NOT_ALIGNED))) {
+				log_err("gpu_offset not 4KB-aligned: %zu\n", gpu_offset);
+				o->logged |= LOGGED_GPU_OFFSET_NOT_ALIGNED;
+			}
+		}
+
+		if (io_u->ddir == DDIR_WRITE)
+			rc = fio_libcufile_pre_write(td, o, io_u, gpu_offset);
+
+		if (io_u->error != 0)
+			break;
+
+		while (remaining > 0) {
+			assert(gpu_offset + xfered <= o->total_mem);
+			if (io_u->ddir == DDIR_READ) {
+				if (o->cuda_io == IO_CUFILE) {
+					sz = cuFileRead(fcd->cf_handle, o->cu_mem_ptr, remaining,
+							io_offset + xfered, gpu_offset + xfered);
+					if (sz == -1) {
+						io_u->error = errno;
+						log_err("cuFileRead: err=%d\n", errno);
+					} else if (sz < 0) {
+						io_u->error = EIO;
+						log_err("cuFileRead: err=%zd:%s\n", sz,
+							cufileop_status_error(-sz));
+					}
+				} else if (o->cuda_io == IO_POSIX) {
+					sz = pread(io_u->file->fd, ((char*) io_u->xfer_buf) + xfered,
+						   remaining, io_offset + xfered);
+					if (sz < 0) {
+						io_u->error = errno;
+						log_err("pread: err=%d\n", errno);
+					}
+				} else {
+					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+					io_u->error = -1;
+					assert(0);
+				}
+			} else if (io_u->ddir == DDIR_WRITE) {
+				if (o->cuda_io == IO_CUFILE) {
+					sz = cuFileWrite(fcd->cf_handle, o->cu_mem_ptr, remaining,
+							 io_offset + xfered, gpu_offset + xfered);
+					if (sz == -1) {
+						io_u->error = errno;
+						log_err("cuFileWrite: err=%d\n", errno);
+					} else if (sz < 0) {
+						io_u->error = EIO;
+						log_err("cuFileWrite: err=%zd:%s\n", sz,
+							cufileop_status_error(-sz));
+					}
+				} else if (o->cuda_io == IO_POSIX) {
+					sz = pwrite(io_u->file->fd,
+						    ((char*) io_u->xfer_buf) + xfered,
+						    remaining, io_offset + xfered);
+					if (sz < 0) {
+						io_u->error = errno;
+						log_err("pwrite: err=%d\n", errno);
+					}
+				} else {
+					log_err("Illegal CUDA IO type: %d\n", o->cuda_io);
+					io_u->error = -1;
+					assert(0);
+				}
+			} else {
+				log_err("not DDIR_READ or DDIR_WRITE: %d\n", io_u->ddir);
+				io_u->error = -1;
+				assert(0);
+				break;
+			}
+
+			if (io_u->error != 0)
+				break;
+
+			/*
+			 * A call that moved nothing (e.g. a read at end of
+			 * file) would leave 'remaining' unchanged and make
+			 * this loop spin forever, so fail the request.
+			 */
+			if (sz == 0) {
+				log_err("%s made no progress: %zd bytes remaining\n",
+					io_u->ddir == DDIR_READ ? "read" : "write",
+					remaining);
+				io_u->error = EIO;
+				break;
+			}
+
+			remaining -= sz;
+			xfered += sz;
+
+			if (remaining != 0)
+				log_info("Incomplete %s: %zd bytes remaining\n",
+					 io_u->ddir == DDIR_READ? "read" : "write", remaining);
+		}
+
+		if (io_u->error != 0)
+			break;
+
+		if (io_u->ddir == DDIR_READ)
+			rc = fio_libcufile_post_read(td, o, io_u, gpu_offset);
+		break;
+
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	if (io_u->error != 0) {
+		log_err("IO failed\n");
+		td_verror(td, io_u->error, "xfer");
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * Open f with generic_open_file() and, when cuda_io=cufile, register its fd
+ * with cuFile and attach the resulting per-file data to f. For other cuda_io
+ * values the file's engine data is set to NULL.
+ *
+ * Returns 0 on success. On failure the file is closed again and the result
+ * is the generic_open_file() error, ENOMEM or EINVAL.
+ */
+static int fio_libcufile_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct libcufile_options *o = td->eo;
+	struct fio_libcufile_data *fcd = NULL;
+	CUfileError_t status;
+	int rc, close_rc;
+
+	rc = generic_open_file(td, f);
+	if (rc)
+		return rc;
+
+	if (o->cuda_io != IO_CUFILE) {
+		/* fcd is still NULL here */
+		FILE_SET_ENG_DATA(f, fcd);
+		return 0;
+	}
+
+	fcd = calloc(1, sizeof(*fcd));
+	if (fcd == NULL) {
+		rc = ENOMEM;
+		goto exit_err;
+	}
+
+	fcd->cf_descr.handle.fd = f->fd;
+	fcd->cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
+	status = cuFileHandleRegister(&fcd->cf_handle, &fcd->cf_descr);
+	if (status.err != CU_FILE_SUCCESS) {
+		log_err("cufile register: err=%d:%s\n", status.err,
+			fio_libcufile_get_cuda_error(status));
+		rc = EINVAL;
+		goto exit_err;
+	}
+
+	FILE_SET_ENG_DATA(f, fcd);
+	return 0;
+
+exit_err:
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(fcd);
+	if (f) {
+		close_rc = generic_close_file(td, f);
+		if (close_rc)
+			log_err("generic_close_file: err=%d\n", close_rc);
+	}
+	return rc;
+}
+
+/*
+ * Deregister and free the per-file cuFile data, if f has any, then close the
+ * file with generic_close_file() and return its result.
+ */
+static int fio_libcufile_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_libcufile_data *fcd = FILE_ENG_DATA(f);
+
+	if (fcd != NULL) {
+		cuFileHandleDeregister(fcd->cf_handle);
+		FILE_SET_ENG_DATA(f, NULL);
+		free(fcd);
+	}
+
+	return generic_close_file(td, f);
+}
+
+/*
+ * Allocate the job's buffers: a zeroed host buffer for td->orig_buffer, a
+ * zeroed junk buffer when cuda_io=posix, and total_mem bytes of GPU memory
+ * filled with 0xab. When cuda_io=cufile the GPU memory is also registered
+ * with cuFileBufRegister().
+ *
+ * Returns 0 on success. On failure everything allocated so far is released
+ * and 1 is returned.
+ */
+static int fio_libcufile_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+	struct libcufile_options *o = td->eo;
+	CUfileError_t status;
+	int rc;
+
+	o->total_mem = total_mem;
+	o->logged = 0;
+	o->cu_mem_ptr = NULL;
+	o->junk_buf = NULL;
+
+	td->orig_buffer = calloc(1, total_mem);
+	if (td->orig_buffer == NULL) {
+		log_err("orig_buffer calloc failed: err=%d\n", errno);
+		goto exit_error;
+	}
+
+	if (o->cuda_io == IO_POSIX) {
+		o->junk_buf = calloc(1, total_mem);
+		if (!o->junk_buf) {
+			log_err("junk_buf calloc failed: err=%d\n", errno);
+			goto exit_error;
+		}
+	}
+
+	dprint(FD_MEM, "Alloc %zu for GPU %d\n", total_mem, o->my_gpu_id);
+
+	check_cudaruntimecall(cudaMalloc(&o->cu_mem_ptr, total_mem), rc);
+	if (rc != 0)
+		goto exit_error;
+
+	check_cudaruntimecall(cudaMemset(o->cu_mem_ptr, 0xab, total_mem), rc);
+	if (rc != 0)
+		goto exit_error;
+
+	if (o->cuda_io == IO_CUFILE) {
+		status = cuFileBufRegister(o->cu_mem_ptr, total_mem, 0);
+		if (status.err != CU_FILE_SUCCESS) {
+			log_err("cuFileBufRegister: err=%d:%s\n", status.err,
+				fio_libcufile_get_cuda_error(status));
+			goto exit_error;
+		}
+	}
+
+	return 0;
+
+exit_error:
+	/* free(NULL) is a no-op, so the host buffers need no guard */
+	free(td->orig_buffer);
+	td->orig_buffer = NULL;
+
+	free(o->junk_buf);
+	o->junk_buf = NULL;
+
+	if (o->cu_mem_ptr != NULL) {
+		cudaFree(o->cu_mem_ptr);
+		o->cu_mem_ptr = NULL;
+	}
+
+	return 1;
+}
+
+/*
+ * Release the buffers set up by fio_libcufile_iomem_alloc(): the junk buffer,
+ * the GPU memory (deregistered from cuFile first when cuda_io=cufile) and the
+ * host buffer behind td->orig_buffer. Each pointer is reset to NULL.
+ */
+static void fio_libcufile_iomem_free(struct thread_data *td)
+{
+	struct libcufile_options *o = td->eo;
+
+	/* free(NULL) is a no-op, so the host buffers need no guard */
+	free(o->junk_buf);
+	o->junk_buf = NULL;
+
+	if (o->cu_mem_ptr != NULL) {
+		if (o->cuda_io == IO_CUFILE)
+			cuFileBufDeregister(o->cu_mem_ptr);
+		cudaFree(o->cu_mem_ptr);
+		o->cu_mem_ptr = NULL;
+	}
+
+	free(td->orig_buffer);
+	td->orig_buffer = NULL;
+}
+
+/*
+ * Drop this subjob from the 'running' count. When the count reaches zero the
+ * cuFile driver is closed, provided cuda_io=cufile and the driver had been
+ * opened, and the initialized flag is cleared.
+ */
+static void fio_libcufile_cleanup(struct thread_data *td)
+{
+	struct libcufile_options *o = td->eo;
+
+	pthread_mutex_lock(&running_lock);
+
+	running--;
+	assert(running >= 0);
+
+	if (running == 0) {
+		/* only the last worker thread closes an opened driver */
+		const int close_driver = (o->cuda_io == IO_CUFILE) &&
+					 cufile_initialized;
+
+		cufile_initialized = 0;
+		if (close_driver)
+			cuFileDriverClose();
+	}
+
+	pthread_mutex_unlock(&running_lock);
+}
+
+/*
+ * Engine descriptor for "libcufile": ties the engine name, its flags, its
+ * option table and the callbacks defined in this file together. File sizes
+ * come from generic_get_file_size(). It is handed to
+ * register_ioengine()/unregister_ioengine() below.
+ */
+FIO_STATIC struct ioengine_ops ioengine = {
+ .name = "libcufile",
+ .version = FIO_IOOPS_VERSION,
+ .init = fio_libcufile_init,
+ .queue = fio_libcufile_queue,
+ .get_file_size = generic_get_file_size,
+ .open_file = fio_libcufile_open_file,
+ .close_file = fio_libcufile_close_file,
+ .iomem_alloc = fio_libcufile_iomem_alloc,
+ .iomem_free = fio_libcufile_iomem_free,
+ .cleanup = fio_libcufile_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = options,
+ .option_struct_size = sizeof(struct libcufile_options)
+};
+
+/* Register the libcufile engine descriptor with fio. */
+void fio_init fio_libcufile_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+/* Remove the libcufile engine descriptor from fio again. */
+void fio_exit fio_libcufile_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
};
struct hdfsio_options {
- void *pad; /* needed because offset can't be 0 for a option defined used offsetof */
+ void *pad; /* needed because offset can't be 0 for an option defined used offsetof */
char *host;
char *directory;
unsigned int port;
return 0;
}
-static int fio_hdfsio_init(struct thread_data *td)
+static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u)
{
struct hdfsio_options *options = td->eo;
struct hdfsio_data *hd = td->io_ops_data;
uint64_t file_size, total_file_size;
if (!td->io_ops_data) {
- hd = malloc(sizeof(*hd));
- memset(hd, 0, sizeof(*hd));
+ hd = calloc(1, sizeof(*hd));
hd->curr_file_id = -1;
return 0;
}
-static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u)
+static int fio_hdfsio_init(struct thread_data *td)
{
struct hdfsio_data *hd = td->io_ops_data;
struct hdfsio_options *options = td->eo;
struct scsi_readcapacity16 *rc16 = NULL;
int ret = 0;
- iscsi_lun = malloc(sizeof(struct iscsi_lun));
- memset(iscsi_lun, 0, sizeof(struct iscsi_lun));
+ iscsi_lun = calloc(1, sizeof(struct iscsi_lun));
iscsi_lun->iscsi_info = iscsi_info;
* libpmem: IO engine that uses PMDK libpmem to read and write data
*
* Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
- * Copyright 2018-2020, Intel Corporation
+ * Copyright 2018-2021, Intel Corporation
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License,
/*
* libpmem engine
*
- * IO engine that uses libpmem to write data (and memcpy to read)
+ * IO engine that uses libpmem (part of PMDK collection) to write data
+ * and libc's memcpy to read. It requires PMDK >= 1.5.
*
* To use:
* ioengine=libpmem
* mkdir /mnt/pmem0
* mount -o dax /dev/pmem0 /mnt/pmem0
*
- * See examples/libpmem.fio for more.
- *
- *
- * libpmem.so
- * By default, the libpmem engine will let the system find the libpmem.so
- * that it uses. You can use an alternative libpmem by setting the
- * FIO_PMEM_LIB environment variable to the full path to the desired
- * libpmem.so. This engine requires PMDK >= 1.5.
+ * See examples/libpmem.fio for complete usage example.
*/
#include <stdio.h>
-#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/sysmacros.h>
-#include <libgen.h>
#include <libpmem.h>
#include "../fio.h"
{
struct thread_options *o = &td->o;
- dprint(FD_IO,"o->rw_min_bs %llu \n o->fsync_blocks %u \n o->fdatasync_blocks %u \n",
- o->rw_min_bs,o->fsync_blocks,o->fdatasync_blocks);
+ dprint(FD_IO, "o->rw_min_bs %llu\n o->fsync_blocks %u\n o->fdatasync_blocks %u\n",
+ o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks);
dprint(FD_IO, "DEBUG fio_libpmem_init\n");
if ((o->rw_min_bs & page_mask) &&
}
/*
- * This is the pmem_map_file execution function
+ * This is the pmem_map_file execution function, a helper to
+ * fio_libpmem_open_file function.
*/
static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
size_t length, off_t off)
{
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
- mode_t mode = 0;
+ mode_t mode = S_IWUSR | S_IRUSR;
size_t mapped_len;
int is_pmem;
- if(td_rw(td))
- mode = S_IWUSR | S_IRUSR;
- else if (td_write(td))
- mode = S_IWUSR;
- else
- mode = S_IRUSR;
-
dprint(FD_IO, "DEBUG fio_libpmem_file\n");
dprint(FD_IO, "f->file_name = %s td->o.verify = %d \n", f->file_name,
td->o.verify);
{
struct fio_libpmem_data *fdd;
- dprint(FD_IO,"DEBUG fio_libpmem_open_file\n");
- dprint(FD_IO,"f->io_size=%ld \n",f->io_size);
- dprint(FD_IO,"td->o.size=%lld \n",td->o.size);
- dprint(FD_IO,"td->o.iodepth=%d\n",td->o.iodepth);
- dprint(FD_IO,"td->o.iodepth_batch=%d \n",td->o.iodepth_batch);
+ dprint(FD_IO, "DEBUG fio_libpmem_open_file\n");
+ dprint(FD_IO, "f->io_size=%ld\n", f->io_size);
+ dprint(FD_IO, "td->o.size=%lld\n", td->o.size);
+ dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth);
+ dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch);
if (fio_file_open(f))
td_io_close_file(td, f);
struct fio_file *f = io_u->file;
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
- dprint(FD_IO, "DEBUG fio_libpmem_prep\n" );
- dprint(FD_IO," io_u->offset %llu : fdd->libpmem_off %ld : "
+ dprint(FD_IO, "DEBUG fio_libpmem_prep\n");
+ dprint(FD_IO, "io_u->offset %llu : fdd->libpmem_off %ld : "
"io_u->buflen %llu : fdd->libpmem_sz %ld\n",
io_u->offset, fdd->libpmem_off,
io_u->buflen, fdd->libpmem_sz);
io_u->error = 0;
dprint(FD_IO, "DEBUG fio_libpmem_queue\n");
- dprint(FD_IO,"td->o.odirect %d td->o.sync_io %d \n",td->o.odirect, td->o.sync_io);
+ dprint(FD_IO, "td->o.odirect %d td->o.sync_io %d\n",
+ td->o.odirect, td->o.sync_io);
+ /* map both O_SYNC / DSYNC to not use NODRAIN */
flags = td->o.sync_io ? 0 : PMEM_F_MEM_NODRAIN;
flags |= td->o.odirect ? PMEM_F_MEM_NONTEMPORAL : PMEM_F_MEM_TEMPORAL;
break;
case DDIR_WRITE:
dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
- io_u->mmap_data, io_u->xfer_buf );
+ io_u->mmap_data, io_u->xfer_buf);
pmem_memcpy(io_u->mmap_data,
io_u->xfer_buf,
io_u->xfer_buflen,
struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
int ret = 0;
- dprint(FD_IO,"DEBUG fio_libpmem_close_file\n");
- dprint(FD_IO,"td->o.odirect %d \n",td->o.odirect);
-
- if (!td->o.odirect) {
- dprint(FD_IO,"pmem_drain\n");
- pmem_drain();
- }
+ dprint(FD_IO, "DEBUG fio_libpmem_close_file\n");
if (fdd->libpmem_ptr)
ret = pmem_unmap(fdd->libpmem_ptr, fdd->libpmem_sz);
.open_file = fio_libpmem_open_file,
.close_file = fio_libpmem_close_file,
.get_file_size = generic_get_file_size,
+ .prepopulate_file = generic_prepopulate_file,
.flags = FIO_SYNCIO | FIO_RAWIO | FIO_DISKLESSIO | FIO_NOEXTEND |
FIO_NODISKUTIL | FIO_BARRIER | FIO_MEMALIGN,
};
--- /dev/null
+/*
+ * librpma_apm: IO engine that uses PMDK librpma to read and write data,
+ * based on Appliance Persistency Method
+ *
+ * Copyright 2020-2021, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+/* client side implementation */
+
+static inline int client_io_flush(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index);
+
+/*
+ * Initialize the APM client engine for one thread.
+ *
+ * Sizes the send and completion queues from the workload type, establishes
+ * the connection through librpma_fio_client_init(), configures the peer for
+ * Direct Write to PMem when the server's memory requires a persistent flush
+ * and finally installs the engine-specific flush and io_u-index callbacks.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int client_init(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd;
+ unsigned int sq_size;
+ uint32_t cq_size;
+ struct rpma_conn_cfg *cfg = NULL;
+ struct rpma_peer_cfg *pcfg = NULL;
+ int ret;
+
+ /* not supported readwrite = trim / randtrim / trimwrite */
+ if (td_trim(td)) {
+ td_verror(td, EINVAL, "Not supported mode.");
+ return -1;
+ }
+
+ /*
+ * Calculate the required queue sizes where:
+ * - the send queue (SQ) has to be big enough to accommodate
+ * all io_us (WRITEs) and all flush requests (FLUSHes)
+ * - the completion queue (CQ) has to be big enough to accommodate all
+ * success and error completions (cq_size = sq_size)
+ */
+ if (td_random(td) || td_rw(td)) {
+ /*
+ * sq_size = max(rand_read_sq_size, rand_write_sq_size)
+ * where rand_read_sq_size < rand_write_sq_size because read
+ * does not require flush afterwards
+ * rand_write_sq_size = N * (WRITE + FLUSH)
+ *
+ * Note: rw is no different from random write since having
+ * interleaved reads with writes in extreme forces you to flush
+ * as often as when the writes are random.
+ */
+ sq_size = 2 * td->o.iodepth;
+ } else if (td_write(td)) {
+ /* sequential TD_DDIR_WRITE only */
+ if (td->o.sync_io) {
+ sq_size = 2; /* WRITE + FLUSH */
+ } else {
+ /*
+ * N * WRITE + B * FLUSH where:
+ * - B == ceil(iodepth / iodepth_batch)
+ * which is the number of batches for N writes
+ */
+ sq_size = td->o.iodepth + LIBRPMA_FIO_CEIL(td->o.iodepth,
+ td->o.iodepth_batch);
+ }
+ } else {
+ /* TD_DDIR_READ only */
+ if (td->o.sync_io) {
+ sq_size = 1; /* READ */
+ } else {
+ sq_size = td->o.iodepth; /* N x READ */
+ }
+ }
+ cq_size = sq_size;
+
+ /* create a connection configuration object */
+ if ((ret = rpma_conn_cfg_new(&cfg))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+ return -1;
+ }
+
+ /* apply queue sizes */
+ if ((ret = rpma_conn_cfg_set_sq_size(cfg, sq_size))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+ goto err_cfg_delete;
+ }
+ if ((ret = rpma_conn_cfg_set_cq_size(cfg, cq_size))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+ goto err_cfg_delete;
+ }
+
+ /* connect to the server; on success td->io_ops_data holds the client data */
+ if (librpma_fio_client_init(td, cfg))
+ goto err_cfg_delete;
+
+ ccd = td->io_ops_data;
+
+ if (ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT) {
+ /* a persistent flush is refused unless the server declared Direct Write to PMem */
+ if (!ccd->ws->direct_write_to_pmem) {
+ /* print the explanation once, from the first thread only */
+ if (td->thread_number == 1)
+ log_err(
+ "Fio librpma engine will not work until the Direct Write to PMem on the server side is possible (direct_write_to_pmem)\n");
+ goto err_cleanup_common;
+ }
+
+ /* configure peer's direct write to pmem support */
+ if ((ret = rpma_peer_cfg_new(&pcfg))) {
+ librpma_td_verror(td, ret, "rpma_peer_cfg_new");
+ goto err_cleanup_common;
+ }
+
+ if ((ret = rpma_peer_cfg_set_direct_write_to_pmem(pcfg, true))) {
+ librpma_td_verror(td, ret,
+ "rpma_peer_cfg_set_direct_write_to_pmem");
+ (void) rpma_peer_cfg_delete(&pcfg);
+ goto err_cleanup_common;
+ }
+
+ if ((ret = rpma_conn_apply_remote_peer_cfg(ccd->conn, pcfg))) {
+ librpma_td_verror(td, ret,
+ "rpma_conn_apply_remote_peer_cfg");
+ (void) rpma_peer_cfg_delete(&pcfg);
+ goto err_cleanup_common;
+ }
+
+ /* the peer configuration is no longer needed once applied */
+ (void) rpma_peer_cfg_delete(&pcfg);
+ } else if (td->thread_number == 1) {
+ /* XXX log_info mixes with the JSON output */
+ log_err(
+ "Note: Direct Write to PMem is not supported by default nor required if you use DRAM instead of PMem on the server side (direct_write_to_pmem).\n"
+ "Remember that flushing to DRAM does not make your data persistent and may be used only for experimental purposes.\n");
+ }
+
+ if ((ret = rpma_conn_cfg_delete(&cfg))) {
+ librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+ /* non fatal error - continue */
+ }
+
+ /* hooks called by the common client code in librpma_fio */
+ ccd->flush = client_io_flush;
+ ccd->get_io_u_index = client_get_io_u_index;
+
+ return 0;
+
+err_cleanup_common:
+ /* undo librpma_fio_client_init() */
+ librpma_fio_client_cleanup(td);
+
+err_cfg_delete:
+ (void) rpma_conn_cfg_delete(&cfg);
+
+ return -1;
+}
+
+/*
+ * Release the engine-specific client data and then tear down the common
+ * client state. Does nothing when the thread has no client data attached.
+ */
+static void client_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_client_data *client = td->io_ops_data;
+
+	if (client != NULL) {
+		free(client->client_data);
+
+		librpma_fio_client_cleanup(td);
+	}
+}
+
+/*
+ * Post a flush of len bytes of the server's memory, starting at the offset
+ * of first_io_u. The index of last_io_u is handed over as the operation
+ * context and a completion is always requested.
+ *
+ * Returns 0 on success and -1 when rpma_flush() fails.
+ */
+static inline int client_io_flush(struct thread_data *td,
+		struct io_u *first_io_u, struct io_u *last_io_u,
+		unsigned long long int len)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	void *op_context = (void *)(uintptr_t)last_io_u->index;
+	size_t flush_offset = first_io_u->offset;
+	int rc;
+
+	rc = rpma_flush(ccd->conn, ccd->server_mr, flush_offset, len,
+			ccd->server_mr_flush_type, RPMA_F_COMPLETION_ALWAYS,
+			op_context);
+	if (rc == 0)
+		return 0;
+
+	librpma_td_verror(td, rc, "rpma_flush");
+	return -1;
+}
+
+/*
+ * Extract the io_u index carried by a work completion.
+ *
+ * client_io_flush() passes the io_u index, widened through uintptr_t, as the
+ * operation context (presumably librpma stores that context verbatim in
+ * wc->wr_id - confirm against the librpma documentation). wr_id is 64 bits
+ * wide, so copying its first sizeof(unsigned int) bytes yields the index only
+ * on little-endian hosts. A narrowing conversion takes the low-order bits
+ * regardless of the byte order.
+ *
+ * Always returns 1, i.e. one io_u index has been stored in *io_u_index.
+ */
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index)
+{
+	*io_u_index = (unsigned int)wc->wr_id;
+
+	return 1;
+}
+
+/*
+ * Client engine descriptor, registered under the name "librpma_apm_client".
+ * Only init and cleanup are specific to this engine; all the other hooks
+ * come from the common librpma_fio code. Opening and closing a file are
+ * no-ops.
+ */
+FIO_STATIC struct ioengine_ops ioengine_client = {
+ .name = "librpma_apm_client",
+ .version = FIO_IOOPS_VERSION,
+ .init = client_init,
+ .post_init = librpma_fio_client_post_init,
+ .get_file_size = librpma_fio_client_get_file_size,
+ .open_file = librpma_fio_file_nop,
+ .queue = librpma_fio_client_queue,
+ .commit = librpma_fio_client_commit,
+ .getevents = librpma_fio_client_getevents,
+ .event = librpma_fio_client_event,
+ .errdetails = librpma_fio_client_errdetails,
+ .close_file = librpma_fio_file_nop,
+ .cleanup = client_cleanup,
+ .flags = FIO_DISKLESSIO | FIO_ASYNCIO_SETS_ISSUE_TIME,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+/*
+ * Open the file on the server side by delegating to the common
+ * librpma_fio_server_open_file() with no connection configuration (NULL).
+ * Returns whatever the common helper returns.
+ */
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+ return librpma_fio_server_open_file(td, f, NULL);
+}
+
+/*
+ * Server-side queue hook: performs no work and reports every io_u as
+ * completed right away.
+ */
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+ return FIO_Q_COMPLETED;
+}
+
+/*
+ * Server engine descriptor, registered under the name "librpma_apm_server".
+ * It is a synchronous engine (FIO_SYNCIO) which shares its options with the
+ * client engine; the invalidate hook is a no-op.
+ */
+FIO_STATIC struct ioengine_ops ioengine_server = {
+ .name = "librpma_apm_server",
+ .version = FIO_IOOPS_VERSION,
+ .init = librpma_fio_server_init,
+ .open_file = server_open_file,
+ .close_file = librpma_fio_server_close_file,
+ .queue = server_queue,
+ .invalidate = librpma_fio_file_nop,
+ .cleanup = librpma_fio_server_cleanup,
+ .flags = FIO_SYNCIO,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* register both engines */
+
+/* Register the client and the server engine; marked with fio_init. */
+static void fio_init fio_librpma_apm_register(void)
+{
+ register_ioengine(&ioengine_client);
+ register_ioengine(&ioengine_server);
+}
+
+/* Unregister the client and the server engine; marked with fio_exit. */
+static void fio_exit fio_librpma_apm_unregister(void)
+{
+ unregister_ioengine(&ioengine_client);
+ unregister_ioengine(&ioengine_server);
+}
--- /dev/null
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common part.
+ *
+ * Copyright 2021-2022, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifdef CONFIG_LIBPMEM2_INSTALLED
+#include "librpma_fio_pmem2.h"
+#else
+#include "librpma_fio_pmem.h"
+#endif /* CONFIG_LIBPMEM2_INSTALLED */
+
+/*
+ * Options shared by the librpma engines. Each entry stores its value in
+ * struct librpma_fio_options_values at the offset given by .off1. The table
+ * is terminated by an entry whose .name is NULL.
+ */
+struct fio_option librpma_fio_options[] = {
+ {
+ /* address of the server; stored as a string */
+ .name = "serverip",
+ .lname = "rpma_server_ip",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct librpma_fio_options_values, server_ip),
+ .help = "IP address the server is listening on",
+ .def = "",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBRPMA,
+ },
+ {
+ /*
+ * base port, stored as a string; librpma_fio_td_port() derives
+ * the per-thread port from it
+ */
+ .name = "port",
+ .lname = "rpma_server port",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct librpma_fio_options_values, port),
+ .help = "port the server is listening on",
+ .def = "7204",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBRPMA,
+ },
+ {
+ /* boolean; the default is left as an empty string */
+ .name = "direct_write_to_pmem",
+ .lname = "Direct Write to PMem (via RDMA) from the remote host is possible",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct librpma_fio_options_values,
+ direct_write_to_pmem),
+ .help = "Set to true ONLY when Direct Write to PMem from the remote host is possible (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)",
+ .def = "",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBRPMA,
+ },
+ {
+ /* boolean; enabled by default */
+ .name = "busy_wait_polling",
+ .lname = "Set to 0 to wait for completion instead of busy-wait polling completion.",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct librpma_fio_options_values,
+ busy_wait_polling),
+ .help = "Set to false if you want to reduce CPU usage",
+ .def = "1",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_LIBRPMA,
+ },
+ {
+ /* end-of-table marker */
+ .name = NULL,
+ },
+};
+
+/*
+ * Compute the port used by a thread: the base port plus the zero-based
+ * thread number.
+ *
+ * port_base_str - decimal string holding the base port
+ * td            - the thread; td->thread_number selects the offset
+ * port_out      - output buffer of at least LIBRPMA_FIO_PORT_STR_LEN_MAX
+ *                 bytes; set to an empty string on failure
+ *
+ * Returns 0 on success and -1 when the base port is not a number, is out of
+ * range for strtoul() or the resulting port does not fit below UINT_MAX.
+ */
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+		char *port_out)
+{
+	unsigned long int port_ul;
+	unsigned int port_new;
+	char *end = NULL;
+
+	port_out[0] = '\0';
+
+	/* strtoul() reports an overflow only through errno so clear it first */
+	errno = 0;
+	port_ul = strtoul(port_base_str, &end, 10);
+
+	/*
+	 * No digit has been consumed. Without this check a non-numeric
+	 * string would be silently accepted as port 0.
+	 */
+	if (end == port_base_str) {
+		td_verror(td, EINVAL, "strtoul");
+		return -1;
+	}
+
+	if (errno == ERANGE || port_ul == ULONG_MAX) {
+		td_verror(td, ERANGE, "strtoul");
+		return -1;
+	}
+
+	/* each thread gets its own port, counting from the base one */
+	port_ul += td->thread_number - 1;
+	if (port_ul >= UINT_MAX) {
+		log_err("[%u] port number (%lu) bigger than UINT_MAX\n",
+			td->thread_number, port_ul);
+		return -1;
+	}
+
+	port_new = port_ul;
+	snprintf(port_out, LIBRPMA_FIO_PORT_STR_LEN_MAX - 1, "%u", port_new);
+
+	return 0;
+}
+
+/*
+ * Allocate a page-aligned DRAM buffer of the given size and record it in mem.
+ * mem->size_mmap is set to 0, which makes librpma_fio_free() release the
+ * buffer with free().
+ *
+ * Returns the buffer or NULL when posix_memalign() fails.
+ */
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+		struct librpma_fio_mem *mem)
+{
+	char *buf = NULL;
+	int rc;
+
+	rc = posix_memalign((void **)&buf, page_size, size);
+	if (rc != 0) {
+		log_err("fio: posix_memalign() failed\n");
+		td_verror(td, rc, "posix_memalign");
+		return NULL;
+	}
+
+	mem->size_mmap = 0;
+	mem->mem_ptr = buf;
+
+	return buf;
+}
+
+/*
+ * Map the thread's workspace from the file f and record the mapping in mem.
+ *
+ * The size has to be a multiple of the page size. For a character device
+ * every thread gets its own slice of the mapping, selected by the thread
+ * number; otherwise the mapping is used from its beginning.
+ *
+ * Returns the address of the thread's workspace or NULL on failure.
+ */
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
+		size_t size, struct librpma_fio_mem *mem)
+{
+	/* Each thread uses a separate FileSystemDAX file. No offset is needed. */
+	size_t ws_offset = 0;
+
+	mem->mem_ptr = NULL;
+
+	if (size % page_size != 0) {
+		log_err("fio: size (%zu) is not aligned to page size (%zu)\n",
+			size, page_size);
+		return NULL;
+	}
+
+	/* Each thread uses a separate offset within DeviceDAX. */
+	if (f->filetype == FIO_TYPE_CHAR)
+		ws_offset = (td->thread_number - 1) * size;
+
+	if (f->file_name == NULL) {
+		log_err("fio: filename is not set\n");
+		return NULL;
+	}
+
+	if (librpma_fio_pmem_map_file(f, size, mem, ws_offset) != 0) {
+		log_err("fio: librpma_fio_pmem_map_file(%s) failed\n",
+			f->file_name);
+		return NULL;
+	}
+
+	log_info("fio: size of memory mapped from the file %s: %zu\n",
+		f->file_name, mem->size_mmap);
+
+	log_info("fio: library used to map PMem from file: %s\n", RPMA_PMEM_USED);
+
+	if (mem->mem_ptr == NULL)
+		return NULL;
+
+	return mem->mem_ptr + ws_offset;
+}
+
+/*
+ * Release the memory described by mem: a zero size_mmap means the buffer is
+ * released with free(), any other value means it is unmapped.
+ */
+void librpma_fio_free(struct librpma_fio_mem *mem)
+{
+	if (mem->size_mmap == 0) {
+		free(mem->mem_ptr);
+		return;
+	}
+
+	librpma_fio_unmap(mem);
+}
+
+#define LIBRPMA_FIO_RETRY_MAX_NO 10
+#define LIBRPMA_FIO_RETRY_DELAY_S 5
+
+/*
+ * Establish the client side of a connection (common part of the engines).
+ *
+ * Allocates the client data and its in-memory io_u queues, creates the peer,
+ * connects to the server (retrying up to LIBRPMA_FIO_RETRY_MAX_NO times when
+ * the connection is rejected), and reads the server's workspace description
+ * from the connection's private data. On success the client data is stored
+ * in td->io_ops_data and td->o.mem_align is forced to the page size.
+ *
+ * td  - the thread being initialized; td->eo holds the engine options
+ * cfg - the connection configuration used for the connection request
+ *
+ * Returns 0 on success and -1 on failure; on failure everything acquired
+ * here is released again and td->io_ops_data is left untouched.
+ */
+int librpma_fio_client_init(struct thread_data *td,
+		struct rpma_conn_cfg *cfg)
+{
+	struct librpma_fio_client_data *ccd;
+	struct librpma_fio_options_values *o = td->eo;
+	struct ibv_context *dev = NULL;
+	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+	struct rpma_conn_req *req = NULL;
+	enum rpma_conn_event event;
+	struct rpma_conn_private_data pdata;
+	enum rpma_log_level log_level_aux = RPMA_LOG_LEVEL_WARNING;
+	int remote_flush_type;
+	int retry;
+	int ret;
+
+	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+	if ((1UL << FD_NET) & fio_debug)
+		log_level_aux = RPMA_LOG_LEVEL_INFO;
+#endif
+
+	/* configure logging thresholds to see more details */
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, log_level_aux);
+
+	/* obtain an IBV context for a remote IP address */
+	if ((ret = rpma_utils_get_ibv_context(o->server_ip,
+			RPMA_UTIL_IBV_CONTEXT_REMOTE, &dev))) {
+		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+		return -1;
+	}
+
+	/* allocate client's data */
+	ccd = calloc(1, sizeof(*ccd));
+	if (ccd == NULL) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/* allocate all in-memory queues */
+	ccd->io_us_queued = calloc(td->o.iodepth, sizeof(*ccd->io_us_queued));
+	if (ccd->io_us_queued == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_ccd;
+	}
+
+	ccd->io_us_flight = calloc(td->o.iodepth, sizeof(*ccd->io_us_flight));
+	if (ccd->io_us_flight == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_io_u_queues;
+	}
+
+	ccd->io_us_completed = calloc(td->o.iodepth,
+			sizeof(*ccd->io_us_completed));
+	if (ccd->io_us_completed == NULL) {
+		td_verror(td, errno, "calloc");
+		goto err_free_io_u_queues;
+	}
+
+	/* create a new peer object */
+	if ((ret = rpma_peer_new(dev, &ccd->peer))) {
+		librpma_td_verror(td, ret, "rpma_peer_new");
+		goto err_free_io_u_queues;
+	}
+
+	/* create a connection request */
+	if (librpma_fio_td_port(o->port, td, port_td))
+		goto err_peer_delete;
+
+	for (retry = 0; retry < LIBRPMA_FIO_RETRY_MAX_NO; retry++) {
+		if ((ret = rpma_conn_req_new(ccd->peer, o->server_ip, port_td,
+				cfg, &req))) {
+			librpma_td_verror(td, ret, "rpma_conn_req_new");
+			goto err_peer_delete;
+		}
+
+		/*
+		 * Connect the connection request
+		 * and obtain the connection object.
+		 */
+		if ((ret = rpma_conn_req_connect(&req, NULL, &ccd->conn))) {
+			librpma_td_verror(td, ret, "rpma_conn_req_connect");
+			goto err_req_delete;
+		}
+
+		/* wait for the connection to establish */
+		if ((ret = rpma_conn_next_event(ccd->conn, &event))) {
+			librpma_td_verror(td, ret, "rpma_conn_next_event");
+			goto err_conn_delete;
+		} else if (event == RPMA_CONN_ESTABLISHED) {
+			break;
+		} else if (event == RPMA_CONN_REJECTED) {
+			/* drop the rejected connection before the next attempt */
+			(void) rpma_conn_disconnect(ccd->conn);
+			(void) rpma_conn_delete(&ccd->conn);
+			if (retry < LIBRPMA_FIO_RETRY_MAX_NO - 1) {
+				log_err("Thread [%d]: Retrying (#%i) ...\n",
+					td->thread_number, retry + 1);
+				sleep(LIBRPMA_FIO_RETRY_DELAY_S);
+			} else {
+				log_err(
+					"Thread [%d]: The maximum number of retries exceeded. Closing.\n",
+					td->thread_number);
+			}
+		} else {
+			log_err(
+				"rpma_conn_next_event returned an unexptected event: (%s != RPMA_CONN_ESTABLISHED)\n",
+				rpma_utils_conn_event_2str(event));
+			goto err_conn_delete;
+		}
+	}
+
+	/*
+	 * All the attempts have been rejected. This has to be checked before
+	 * reporting the success below, otherwise "Connected after retry" is
+	 * logged for a connection which has never been established.
+	 */
+	if (ccd->conn == NULL)
+		goto err_peer_delete;
+
+	if (retry > 0)
+		log_err("Thread [%d]: Connected after retry #%i\n",
+			td->thread_number, retry);
+
+	/* get the connection's main CQ */
+	if ((ret = rpma_conn_get_cq(ccd->conn, &ccd->cq))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_cq");
+		goto err_conn_delete;
+	}
+
+	/* get the connection's private data sent from the server */
+	if ((ret = rpma_conn_get_private_data(ccd->conn, &pdata))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_private_data");
+		goto err_conn_delete;
+	}
+
+	/* get the server's workspace representation */
+	ccd->ws = pdata.ptr;
+
+	/* create the server's memory representation */
+	if ((ret = rpma_mr_remote_from_descriptor(&ccd->ws->descriptor[0],
+			ccd->ws->mr_desc_size, &ccd->server_mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_from_descriptor");
+		goto err_conn_delete;
+	}
+
+	/*
+	 * From now on the remote memory representation exists and has to be
+	 * deleted on failure as well.
+	 */
+
+	/* get the total size of the shared server memory */
+	if ((ret = rpma_mr_remote_get_size(ccd->server_mr, &ccd->ws_size))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_get_size");
+		goto err_mr_remote_delete;
+	}
+
+	/* get flush type of the remote node */
+	if ((ret = rpma_mr_remote_get_flush_type(ccd->server_mr,
+			&remote_flush_type))) {
+		librpma_td_verror(td, ret, "rpma_mr_remote_get_flush_type");
+		goto err_mr_remote_delete;
+	}
+
+	ccd->server_mr_flush_type =
+		(remote_flush_type & RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT) ?
+		RPMA_FLUSH_TYPE_PERSISTENT : RPMA_FLUSH_TYPE_VISIBILITY;
+
+	/*
+	 * Assure an io_us buffer allocation is page-size-aligned which is required
+	 * to register for RDMA. User-provided value is intentionally ignored.
+	 */
+	td->o.mem_align = page_size;
+
+	td->io_ops_data = ccd;
+
+	return 0;
+
+err_mr_remote_delete:
+	(void) rpma_mr_remote_delete(&ccd->server_mr);
+
+err_conn_delete:
+	(void) rpma_conn_disconnect(ccd->conn);
+	(void) rpma_conn_delete(&ccd->conn);
+
+err_req_delete:
+	(void) rpma_conn_req_delete(&req);
+
+err_peer_delete:
+	(void) rpma_peer_delete(&ccd->peer);
+
+err_free_io_u_queues:
+	/* queues not allocated yet are NULL thanks to calloc() of ccd */
+	free(ccd->io_us_queued);
+	free(ccd->io_us_flight);
+	free(ccd->io_us_completed);
+
+err_free_ccd:
+	free(ccd);
+
+	return -1;
+}
+
+/*
+ * Tear down the client state created by librpma_fio_client_init() (and the
+ * memory registration stored in ccd->orig_mr): deregister the memory, delete
+ * the remote memory representation, disconnect and delete the connection,
+ * delete the peer and free the in-memory queues together with the client
+ * data. Failures are reported but do not stop the teardown.
+ * Does nothing when td->io_ops_data is NULL.
+ */
+void librpma_fio_client_cleanup(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ enum rpma_conn_event ev;
+ int ret;
+
+ if (ccd == NULL)
+ return;
+
+ /* delete the iou's memory registration */
+ if ((ret = rpma_mr_dereg(&ccd->orig_mr)))
+ librpma_td_verror(td, ret, "rpma_mr_dereg");
+ /* delete the server's remote memory representation */
+ if ((ret = rpma_mr_remote_delete(&ccd->server_mr)))
+ librpma_td_verror(td, ret, "rpma_mr_remote_delete");
+ /* initiate disconnection */
+ if ((ret = rpma_conn_disconnect(ccd->conn)))
+ librpma_td_verror(td, ret, "rpma_conn_disconnect");
+ /* wait for the disconnection to complete */
+ if ((ret = rpma_conn_next_event(ccd->conn, &ev))) {
+ librpma_td_verror(td, ret, "rpma_conn_next_event");
+ } else if (ev != RPMA_CONN_CLOSED) {
+ log_err(
+ "client_cleanup received an unexpected event (%s != RPMA_CONN_CLOSED)\n",
+ rpma_utils_conn_event_2str(ev));
+ }
+ /* delete the connection */
+ if ((ret = rpma_conn_delete(&ccd->conn)))
+ librpma_td_verror(td, ret, "rpma_conn_delete");
+ /* delete the peer */
+ if ((ret = rpma_peer_delete(&ccd->peer)))
+ librpma_td_verror(td, ret, "rpma_peer_delete");
+ /* free the software queues */
+ free(ccd->io_us_queued);
+ free(ccd->io_us_flight);
+ free(ccd->io_us_completed);
+ free(ccd);
+ td->io_ops_data = NULL; /* zero ccd */
+}
+
+/*
+ * File hook which does nothing and always returns 0. Used wherever an engine
+ * has no per-file work to do.
+ */
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f)
+{
+ /* NOP */
+ return 0;
+}
+
+/*
+ * Register the thread's io_us buffer for RDMA. The registration covers the
+ * aligned part of td->orig_buffer, td_max_bs(td) * iodepth bytes long, and
+ * is stored in ccd->orig_mr.
+ *
+ * Returns 0 on success or the error returned by rpma_mr_reg().
+ */
+int librpma_fio_client_post_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	const int usage =
+		RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+		RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+		RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
+	size_t io_us_size;
+	int ret;
+
+	/*
+	 * FIO aligns the unaligned td->orig_buffer up with this very formula
+	 * so the engine has to repeat it to find the first io_u.
+	 */
+	ccd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+			td->o.mem_align;
+
+	/*
+	 * Only the space really consumed by io_us is registered; the paddings
+	 * included in td->orig_buffer_size are left out.
+	 */
+	io_us_size = (unsigned long long)td_max_bs(td) *
+			(unsigned long long)td->o.iodepth;
+
+	ret = rpma_mr_reg(ccd->peer, ccd->orig_buffer_aligned, io_us_size,
+			usage, &ccd->orig_mr);
+	if (ret != 0)
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+
+	return ret;
+}
+
+/*
+ * Report the size of the server's shared memory (ccd->ws_size) as the real
+ * size of the file and mark the size as known. Always returns 0.
+ */
+int librpma_fio_client_get_file_size(struct thread_data *td,
+ struct fio_file *f)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+
+ f->real_file_size = ccd->ws_size;
+ fio_file_set_size_known(f);
+
+ return 0;
+}
+
+/*
+ * Execute a single io_u synchronously: post the read, or the write followed
+ * by a flush, then poll the completion queue until the completion of the
+ * operation arrives and verify it belongs to this io_u.
+ *
+ * Always returns FIO_Q_COMPLETED; a failure is reported by setting
+ * io_u->error to -1.
+ */
+static enum fio_q_status client_queue_sync(struct thread_data *td,
+ struct io_u *io_u)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ struct ibv_wc wc;
+ unsigned io_u_index;
+ int ret;
+
+ /* execute io_u */
+ if (io_u->ddir == DDIR_READ) {
+ /* post an RDMA read operation */
+ if (librpma_fio_client_io_read(td, io_u,
+ RPMA_F_COMPLETION_ALWAYS))
+ goto err;
+ } else if (io_u->ddir == DDIR_WRITE) {
+ /* post an RDMA write operation */
+ if (librpma_fio_client_io_write(td, io_u))
+ goto err;
+ /* flush just this one write through the engine-specific hook */
+ if (ccd->flush(td, io_u, io_u, io_u->xfer_buflen))
+ goto err;
+ } else {
+ log_err("unsupported IO mode: %s\n", io_ddir_name(io_u->ddir));
+ goto err;
+ }
+
+ do {
+ /* get a completion */
+ ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL);
+ if (ret == RPMA_E_NO_COMPLETION) {
+ /* lack of completion is not an error */
+ continue;
+ } else if (ret != 0) {
+ /* an error occurred */
+ librpma_td_verror(td, ret, "rpma_cq_get_wc");
+ goto err;
+ }
+
+ /* if io_us has completed with an error */
+ if (wc.status != IBV_WC_SUCCESS)
+ goto err;
+
+ /* completions of SENDs are only counted; anything else ends the wait */
+ if (wc.opcode == IBV_WC_SEND)
+ ++ccd->op_send_completed;
+ else {
+ if (wc.opcode == IBV_WC_RECV)
+ ++ccd->op_recv_completed;
+
+ break;
+ }
+ } while (1);
+
+ /* the engine-specific hook has to deliver exactly one io_u index */
+ if (ccd->get_io_u_index(&wc, &io_u_index) != 1)
+ goto err;
+
+ if (io_u->index != io_u_index) {
+ log_err(
+ "no matching io_u for received completion found (io_u_index=%u)\n",
+ io_u_index);
+ goto err;
+ }
+
+ /* make sure all SENDs are completed before exit - clean up SQ */
+ if (librpma_fio_client_io_complete_all_sends(td))
+ goto err;
+
+ return FIO_Q_COMPLETED;
+
+err:
+ io_u->error = -1;
+ return FIO_Q_COMPLETED;
+}
+
+/*
+ * Queue an io_u. Returns FIO_Q_BUSY when the queue already holds iodepth
+ * io_us. With sync_io set the io_u is executed at once and the result of
+ * client_queue_sync() is returned; otherwise the io_u is appended to the
+ * queue, to be executed by the commit hook, and FIO_Q_QUEUED is returned.
+ */
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+		struct io_u *io_u)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	int depth = (int)td->o.iodepth;
+
+	/* no room left in queued[] */
+	if (ccd->io_u_queued_nr == depth)
+		return FIO_Q_BUSY;
+
+	if (td->o.sync_io)
+		return client_queue_sync(td, io_u);
+
+	/* io_u -> queued[] */
+	ccd->io_us_queued[ccd->io_u_queued_nr++] = io_u;
+
+	return FIO_Q_QUEUED;
+}
+
+/*
+ * Execute all the queued io_us: post a read or a write for each of them,
+ * post flushes covering the writes, then move the io_us from queued[] to
+ * flight[] and do the accounting FIO leaves to engines having a commit hook.
+ *
+ * Returns 0 on success and -1 when the queue is not allocated, posting an
+ * operation fails or an io_u has an unsupported direction.
+ */
+int librpma_fio_client_commit(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ int flags = RPMA_F_COMPLETION_ON_ERROR;
+ struct timespec now;
+ bool fill_time;
+ int i;
+ struct io_u *flush_first_io_u = NULL;
+ unsigned long long int flush_len = 0;
+
+ if (!ccd->io_us_queued)
+ return -1;
+
+ /* execute all io_us from queued[] */
+ for (i = 0; i < ccd->io_u_queued_nr; i++) {
+ struct io_u *io_u = ccd->io_us_queued[i];
+
+ if (io_u->ddir == DDIR_READ) {
+ /*
+ * request a completion for the last read of the batch and
+ * for a read directly followed by a write
+ *
+ * NOTE(review): flags is never set back to
+ * RPMA_F_COMPLETION_ON_ERROR, so every later read of the
+ * same batch requests a completion as well - confirm this
+ * is intended.
+ */
+ if (i + 1 == ccd->io_u_queued_nr ||
+ ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE)
+ flags = RPMA_F_COMPLETION_ALWAYS;
+ /* post an RDMA read operation */
+ if (librpma_fio_client_io_read(td, io_u, flags))
+ return -1;
+ } else if (io_u->ddir == DDIR_WRITE) {
+ /* post an RDMA write operation */
+ if (librpma_fio_client_io_write(td, io_u))
+ return -1;
+
+ /* cache the first io_u in the sequence */
+ if (flush_first_io_u == NULL)
+ flush_first_io_u = io_u;
+
+ /*
+ * the flush length is the sum of all io_u's creating
+ * the sequence
+ */
+ flush_len += io_u->xfer_buflen;
+
+ /*
+ * if io_u's are random the rpma_flush is required
+ * after each one of them
+ */
+ if (!td_random(td)) {
+ /*
+ * When the io_u's are sequential and
+ * the current io_u is not the last one and
+ * the next one is also a write operation
+ * the flush can be postponed by one io_u and
+ * cover all of them which build a continuous
+ * sequence.
+ */
+ if ((i + 1 < ccd->io_u_queued_nr) &&
+ (ccd->io_us_queued[i + 1]->ddir == DDIR_WRITE))
+ continue;
+ }
+
+ /* flush all writes which build a continuous sequence */
+ if (ccd->flush(td, flush_first_io_u, io_u, flush_len))
+ return -1;
+
+ /*
+ * reset the flush parameters in preparation for
+ * the next one
+ */
+ flush_first_io_u = NULL;
+ flush_len = 0;
+ } else {
+ log_err("unsupported IO mode: %s\n",
+ io_ddir_name(io_u->ddir));
+ return -1;
+ }
+ }
+
+ /* now is read only when fill_time is true, so it is set only then */
+ if ((fill_time = fio_fill_issue_time(td))) {
+ fio_gettime(&now, NULL);
+
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &now, sizeof(now));
+
+ }
+ /* move executed io_us from queued[] to flight[] */
+ for (i = 0; i < ccd->io_u_queued_nr; i++) {
+ struct io_u *io_u = ccd->io_us_queued[i];
+
+ /* FIO does not do this if the engine is asynchronous */
+ if (fill_time)
+ memcpy(&io_u->issue_time, &now, sizeof(now));
+
+ /* move executed io_us from queued[] to flight[] */
+ ccd->io_us_flight[ccd->io_u_flight_nr] = io_u;
+ ccd->io_u_flight_nr++;
+
+ /*
+ * FIO says:
+ * If an engine has the commit hook
+ * it has to call io_u_queued() itself.
+ */
+ io_u_queued(td, io_u);
+ }
+
+ /* FIO does not do this if an engine has the commit hook. */
+ io_u_mark_submit(td, ccd->io_u_queued_nr);
+ ccd->io_u_queued_nr = 0;
+
+ return 0;
+}
+
+/*
+ * Process at most one work completion: find the io_u it belongs to in
+ * flight[] and move that io_u, together with all the io_us preceding it in
+ * flight[], to completed[].
+ *
+ * RETURN VALUE
+ * - > 0 - a number of completed io_us
+ * - 0 - when no completions received
+ * - (-1) - when an error occurred
+ *
+ * When the engine's get_io_u_index hook returns a value other than 1, that
+ * value is returned as is.
+ */
+static int client_getevent_process(struct thread_data *td)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ struct ibv_wc wc;
+ /* io_u->index of completed io_u (wc.wr_id) */
+ unsigned int io_u_index;
+ /* # of completed io_us */
+ int cmpl_num = 0;
+ /* helpers */
+ struct io_u *io_u;
+ int i;
+ int ret;
+
+ /* get a completion */
+ if ((ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL))) {
+ /* lack of completion is not an error */
+ if (ret == RPMA_E_NO_COMPLETION) {
+ /* nothing to process this time */
+ return 0;
+ }
+
+ /* an error occurred */
+ librpma_td_verror(td, ret, "rpma_cq_get_wc");
+ return -1;
+ }
+
+ /* if io_us has completed with an error */
+ if (wc.status != IBV_WC_SUCCESS) {
+ td->error = wc.status;
+ return -1;
+ }
+
+ /* keep track of the completed SENDs and RECVs */
+ if (wc.opcode == IBV_WC_SEND)
+ ++ccd->op_send_completed;
+ else if (wc.opcode == IBV_WC_RECV)
+ ++ccd->op_recv_completed;
+
+ if ((ret = ccd->get_io_u_index(&wc, &io_u_index)) != 1)
+ return ret;
+
+ /* look for an io_u being completed */
+ for (i = 0; i < ccd->io_u_flight_nr; ++i) {
+ if (ccd->io_us_flight[i]->index == io_u_index) {
+ /* this io_u and all the preceding ones are completed */
+ cmpl_num = i + 1;
+ break;
+ }
+ }
+
+ /* if no matching io_u has been found */
+ if (cmpl_num == 0) {
+ log_err(
+ "no matching io_u for received completion found (io_u_index=%u)\n",
+ io_u_index);
+ return -1;
+ }
+
+ /* move completed io_us to the completed in-memory queue */
+ for (i = 0; i < cmpl_num; ++i) {
+ /* get and prepare io_u */
+ io_u = ccd->io_us_flight[i];
+
+ /* append to the queue */
+ ccd->io_us_completed[ccd->io_u_completed_nr] = io_u;
+ ccd->io_u_completed_nr++;
+ }
+
+ /* remove completed io_us from the flight queue */
+ for (i = cmpl_num; i < ccd->io_u_flight_nr; ++i)
+ ccd->io_us_flight[i - cmpl_num] = ccd->io_us_flight[i];
+ ccd->io_u_flight_nr -= cmpl_num;
+
+ return cmpl_num;
+}
+
+/*
+ * Collect completions by busy-polling client_getevent_process() until at
+ * least min io_us are completed and the number of completed SENDs has caught
+ * up with the number of completed RECVs. The timeout t is not used.
+ *
+ * Returns the number of completed io_us, which may exceed max, or -1 when
+ * processing a completion fails.
+ */
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
+{
+ struct librpma_fio_client_data *ccd = td->io_ops_data;
+ /* total # of completed io_us */
+ int cmpl_num_total = 0;
+ /* # of completed io_us from a single event */
+ int cmpl_num;
+
+ do {
+ cmpl_num = client_getevent_process(td);
+ if (cmpl_num > 0) {
+ /* new completions collected */
+ cmpl_num_total += cmpl_num;
+ } else if (cmpl_num == 0) {
+ /*
+ * It is required to make sure that CQEs for SENDs
+ * will flow at least at the same pace as CQEs for RECVs.
+ */
+ if (cmpl_num_total >= min &&
+ ccd->op_send_completed >= ccd->op_recv_completed)
+ break;
+
+ /*
+ * To reduce CPU consumption one can use
+ * the rpma_cq_wait() function.
+ * Note this greatly increase the latency
+ * and make the results less stable.
+ * The bandwidth stays more or less the same.
+ */
+ } else {
+ /* an error occurred */
+ return -1;
+ }
+
+ /*
+ * The expected max can be exceeded if CQEs for RECVs will come up
+ * faster than CQEs for SENDs. But it is required to make sure CQEs for
+ * SENDs will flow at least at the same pace as CQEs for RECVs.
+ */
+ } while (cmpl_num_total < max ||
+ ccd->op_send_completed < ccd->op_recv_completed);
+
+ /*
+ * All posted SENDs are completed and RECVs for them (responses) are
+ * completed. This is the initial situation so the counters are reset.
+ */
+ if (ccd->op_send_posted == ccd->op_send_completed &&
+ ccd->op_send_completed == ccd->op_recv_completed) {
+ ccd->op_send_posted = 0;
+ ccd->op_send_completed = 0;
+ ccd->op_recv_completed = 0;
+ }
+
+ return cmpl_num_total;
+}
+
+/*
+ * Take the oldest io_u out of the completed queue and return it. The event
+ * number is not used - the io_us are handed over in the order they have been
+ * completed.
+ */
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct io_u *io_u = ccd->io_us_completed[0];
+	int src;
+
+	/* shift the remaining io_us by one towards the head of the queue */
+	for (src = 1; src < ccd->io_u_completed_nr; src++)
+		ccd->io_us_completed[src - 1] = ccd->io_us_completed[src];
+	ccd->io_u_completed_nr -= 1;
+
+	dprint_io_u(io_u, "client_event");
+
+	return io_u;
+}
+
+/*
+ * Return a heap-allocated copy of the description of io_u->error treated as
+ * a work completion status. The process is aborted when the copy cannot
+ * be allocated.
+ */
+char *librpma_fio_client_errdetails(struct io_u *io_u)
+{
+	/* io_u->error is interpreted as an ibv_wc_status value */
+	enum ibv_wc_status wc_status = io_u->error;
+	const char *text = ibv_wc_status_str(wc_status);
+	char *copy;
+
+	copy = strdup(text);
+	if (!copy) {
+		fprintf(stderr, "Error: %s\n", text);
+		fprintf(stderr, "Fatal error: out of memory. Aborting.\n");
+		abort();
+	}
+
+	/* FIO frees the returned string when it becomes obsolete */
+	return copy;
+}
+
+/*
+ * Common part of the server's initialization: sets up the librpma logging
+ * thresholds, allocates the server's data and creates the peer object.
+ *
+ * On success the server's data is stored in td->io_ops_data and 0 is
+ * returned. Returns -1 on error.
+ */
+int librpma_fio_server_init(struct thread_data *td)
+{
+	struct librpma_fio_options_values *opts = td->eo;
+	struct librpma_fio_server_data *csd;
+	struct ibv_context *ibv_ctx = NULL;
+	enum rpma_log_level aux_level = RPMA_LOG_LEVEL_WARNING;
+	int ret;
+
+	/* --debug=net sets RPMA_LOG_THRESHOLD_AUX to RPMA_LOG_LEVEL_INFO */
+#ifdef FIO_INC_DEBUG
+	if (fio_debug & (1UL << FD_NET))
+		aux_level = RPMA_LOG_LEVEL_INFO;
+#endif
+
+	/* configure logging thresholds to see more details */
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD, RPMA_LOG_LEVEL_INFO);
+	rpma_log_set_threshold(RPMA_LOG_THRESHOLD_AUX, aux_level);
+
+	/* obtain an IBV context for the server's (local) IP address */
+	ret = rpma_utils_get_ibv_context(opts->server_ip,
+			RPMA_UTIL_IBV_CONTEXT_LOCAL, &ibv_ctx);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_utils_get_ibv_context");
+		return -1;
+	}
+
+	/* allocate zeroed server's data */
+	csd = calloc(1, sizeof(*csd));
+	if (!csd) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/* create a new peer object */
+	ret = rpma_peer_new(ibv_ctx, &csd->peer);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_peer_new");
+		free(csd);
+		return -1;
+	}
+
+	td->io_ops_data = csd;
+
+	return 0;
+}
+
+/*
+ * Delete the peer object and free the server's data.
+ * It is a no-op when td->io_ops_data is NULL.
+ */
+void librpma_fio_server_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+
+	if (csd != NULL) {
+		/* a failure is reported but it does not stop the cleanup */
+		int ret = rpma_peer_delete(&csd->peer);
+
+		if (ret != 0)
+			librpma_td_verror(td, ret, "rpma_peer_delete");
+
+		free(csd);
+	}
+}
+
+/*
+ * librpma_fio_server_open_file -- prepare the server's workspace and accept
+ * a single incoming connection
+ *
+ * The function starts listening at o->server_ip and the port calculated for
+ * this thread, allocates the workspace (from DRAM when the file name is
+ * "malloc", from PMem otherwise), registers it for RDMA, waits for
+ * a connection request and accepts it passing the workspace description
+ * as the connection's private data.
+ *
+ * td  - the thread; td->io_ops_data points to the common server's data
+ * f   - the file describing the workspace
+ * cfg - a connection configuration; may be NULL
+ *
+ * Returns 0 on success and -1 on error. Every error path releases the
+ * resources acquired so far and the common server's data is updated only
+ * after all steps have succeeded (except csd->mem and csd->cq which are
+ * written by the callees).
+ */
+int librpma_fio_server_open_file(struct thread_data *td, struct fio_file *f,
+		struct rpma_conn_cfg *cfg)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct librpma_fio_options_values *o = td->eo;
+	enum rpma_conn_event conn_event = RPMA_CONN_UNDEFINED;
+	struct librpma_fio_workspace ws = {0};
+	struct rpma_conn_private_data pdata;
+	uint32_t max_msg_num;
+	struct rpma_conn_req *conn_req;
+	struct rpma_conn *conn;
+	struct rpma_mr_local *mr;
+	char port_td[LIBRPMA_FIO_PORT_STR_LEN_MAX];
+	struct rpma_ep *ep;
+	size_t mem_size = td->o.size;
+	size_t mr_desc_size;
+	void *ws_ptr;
+	bool is_dram;
+	int usage_mem_type;
+	int ret;
+
+	if (!f->file_name) {
+		log_err("fio: filename is not set\n");
+		return -1;
+	}
+
+	/* start a listening endpoint at addr:port */
+	if (librpma_fio_td_port(o->port, td, port_td))
+		return -1;
+
+	if ((ret = rpma_ep_listen(csd->peer, o->server_ip, port_td, &ep))) {
+		librpma_td_verror(td, ret, "rpma_ep_listen");
+		return -1;
+	}
+
+	is_dram = !strcmp(f->file_name, "malloc");
+	if (is_dram) {
+		/* allocation from DRAM using posix_memalign() */
+		ws_ptr = librpma_fio_allocate_dram(td, mem_size, &csd->mem);
+		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_VISIBILITY;
+	} else {
+		/* allocation from PMEM using pmem_map_file() */
+		ws_ptr = librpma_fio_allocate_pmem(td, f, mem_size, &csd->mem);
+		usage_mem_type = RPMA_MR_USAGE_FLUSH_TYPE_PERSISTENT;
+	}
+
+	if (ws_ptr == NULL)
+		goto err_ep_shutdown;
+
+	f->real_file_size = mem_size;
+
+	if ((ret = rpma_mr_reg(csd->peer, ws_ptr, mem_size,
+			RPMA_MR_USAGE_READ_DST | RPMA_MR_USAGE_READ_SRC |
+			RPMA_MR_USAGE_WRITE_DST | RPMA_MR_USAGE_WRITE_SRC |
+			usage_mem_type, &mr))) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		goto err_free;
+	}
+
+	if (!is_dram && f->filetype == FIO_TYPE_FILE) {
+		ret = rpma_mr_advise(mr, 0, mem_size,
+				IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+				IBV_ADVISE_MR_FLAG_FLUSH);
+		if (ret) {
+			librpma_td_verror(td, ret, "rpma_mr_advise");
+			/* an invalid argument is an error */
+			if (ret == RPMA_E_INVAL)
+				goto err_mr_dereg;
+
+			/* log_err used instead of log_info to avoid corruption of the JSON output */
+			log_err("Note: having rpma_mr_advise(3) failed because of RPMA_E_NOSUPP or RPMA_E_PROVIDER may come with a performance penalty, but it is not a blocker for running the benchmark.\n");
+		}
+	}
+
+	/* get size of the memory region's descriptor */
+	if ((ret = rpma_mr_get_descriptor_size(mr, &mr_desc_size))) {
+		librpma_td_verror(td, ret, "rpma_mr_get_descriptor_size");
+		goto err_mr_dereg;
+	}
+
+	/* verify size of the memory region's descriptor */
+	if (mr_desc_size > LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE) {
+		log_err(
+			"size of the memory region's descriptor is too big (max=%i)\n",
+			LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE);
+		goto err_mr_dereg;
+	}
+
+	/* get the memory region's descriptor */
+	if ((ret = rpma_mr_get_descriptor(mr, &ws.descriptor[0]))) {
+		librpma_td_verror(td, ret, "rpma_mr_get_descriptor");
+		goto err_mr_dereg;
+	}
+
+	if (cfg != NULL) {
+		if ((ret = rpma_conn_cfg_get_rq_size(cfg, &max_msg_num))) {
+			librpma_td_verror(td, ret, "rpma_conn_cfg_get_rq_size");
+			goto err_mr_dereg;
+		}
+
+		/* verify whether iodepth fits into uint16_t */
+		if (max_msg_num > UINT16_MAX) {
+			log_err("fio: iodepth too big (%u > %u)\n",
+				max_msg_num, UINT16_MAX);
+			/*
+			 * The memory registration, the workspace and the
+			 * endpoint are already acquired so they have to be
+			 * released instead of returning straight away.
+			 */
+			goto err_mr_dereg;
+		}
+
+		ws.max_msg_num = max_msg_num;
+	}
+
+	/* prepare a workspace description */
+	ws.direct_write_to_pmem = o->direct_write_to_pmem;
+	ws.mr_desc_size = mr_desc_size;
+	pdata.ptr = &ws;
+	pdata.len = sizeof(ws);
+
+	/* receive an incoming connection request */
+	if ((ret = rpma_ep_next_conn_req(ep, cfg, &conn_req))) {
+		librpma_td_verror(td, ret, "rpma_ep_next_conn_req");
+		goto err_mr_dereg;
+	}
+
+	/* an optional engine-specific hook; non-zero means a failure */
+	if (csd->prepare_connection && csd->prepare_connection(td, conn_req))
+		goto err_req_delete;
+
+	/* accept the connection request and obtain the connection object */
+	if ((ret = rpma_conn_req_connect(&conn_req, &pdata, &conn))) {
+		librpma_td_verror(td, ret, "rpma_conn_req_connect");
+		goto err_req_delete;
+	}
+
+	/* wait for the connection to be established */
+	if ((ret = rpma_conn_next_event(conn, &conn_event))) {
+		librpma_td_verror(td, ret, "rpma_conn_next_event");
+		goto err_conn_delete;
+	} else if (conn_event != RPMA_CONN_ESTABLISHED) {
+		log_err("rpma_conn_next_event returned an unexptected event\n");
+		goto err_conn_delete;
+	}
+
+	/*
+	 * Get the connection's main CQ before publishing anything in csd so
+	 * a failure here does not leave csd pointing at released resources.
+	 */
+	if ((ret = rpma_conn_get_cq(conn, &csd->cq))) {
+		librpma_td_verror(td, ret, "rpma_conn_get_cq");
+		goto err_conn_delete;
+	}
+
+	/* end-point is no longer needed */
+	(void) rpma_ep_shutdown(&ep);
+
+	csd->ws_mr = mr;
+	csd->ws_ptr = ws_ptr;
+	csd->conn = conn;
+
+	return 0;
+
+err_conn_delete:
+	(void) rpma_conn_delete(&conn);
+
+err_req_delete:
+	(void) rpma_conn_req_delete(&conn_req);
+
+err_mr_dereg:
+	(void) rpma_mr_dereg(&mr);
+
+err_free:
+	librpma_fio_free(&csd->mem);
+
+err_ep_shutdown:
+	(void) rpma_ep_shutdown(&ep);
+
+	return -1;
+}
+
+/*
+ * Wait for the connection's next event, then disconnect and delete
+ * the connection, deregister the workspace and free its memory.
+ *
+ * All the steps are always attempted. Returns 0 if all of them succeeded and
+ * the received event was RPMA_CONN_CLOSED, -1 otherwise. Note a failure of
+ * rpma_conn_next_event() itself does not affect the returned value.
+ */
+int librpma_fio_server_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	enum rpma_conn_event event = RPMA_CONN_UNDEFINED;
+	int status = 0;
+	int ret;
+
+	/* wait for the connection to be closed */
+	ret = rpma_conn_next_event(csd->conn, &event);
+	if (ret == 0 && event != RPMA_CONN_CLOSED) {
+		log_err("rpma_conn_next_event returned an unexptected event\n");
+		status = -1;
+	}
+
+	ret = rpma_conn_disconnect(csd->conn);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_disconnect");
+		status = -1;
+	}
+
+	ret = rpma_conn_delete(&csd->conn);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_delete");
+		status = -1;
+	}
+
+	ret = rpma_mr_dereg(&csd->ws_mr);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+		status = -1;
+	}
+
+	/* release the workspace memory */
+	librpma_fio_free(&csd->mem);
+
+	return status;
+}
--- /dev/null
+/*
+ * librpma_fio: librpma_apm and librpma_gpspm engines' common header.
+ *
+ * Copyright 2021-2022, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef LIBRPMA_FIO_H
+#define LIBRPMA_FIO_H 1
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+#include <librpma.h>
+
+/* servers' and clients' common */
+
+/*
+ * Report a librpma error 'err' returned by 'func' for the thread 'td'.
+ * The error description comes from rpma_err_2str().
+ * Note: 'err' is expanded twice so it should be free of side effects.
+ */
+#define librpma_td_verror(td, err, func) \
+ td_vmsg((td), (err), rpma_err_2str(err), (func))
+
+/*
+ * ceil(a / b) = (a + b - 1) / b
+ * Note: 'b' is expanded twice so it should be free of side effects.
+ */
+#define LIBRPMA_FIO_CEIL(a, b) (((a) + (b) - 1) / (b))
+
+/* option values shared by the server and the client engines */
+struct librpma_fio_options_values {
+ /*
+ * FIO considers .off1 == 0 absent so the first meaningful field has to
+ * have padding ahead of it.
+ */
+ void *pad;
+ /* IP address of the server; the server side listens on it */
+ char *server_ip;
+ /* base server listening port */
+ char *port;
+ /* Direct Write to PMem is possible */
+ unsigned int direct_write_to_pmem;
+ /* Set to 0 to wait for completion instead of busy-wait polling completion. */
+ unsigned int busy_wait_polling;
+};
+
+extern struct fio_option librpma_fio_options[];
+
+/*
+ * Limited by the maximum length of the private data
+ * for rdma_connect() in case of RDMA_PS_TCP (28 bytes).
+ * The fixed-size fields of struct librpma_fio_workspace take 4 bytes
+ * (uint16_t + 2 * uint8_t) so 24 bytes are left for the descriptor.
+ */
+#define LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE 24
+
+/*
+ * A workspace description. The server passes it as the connection's private
+ * data when accepting a connection request.
+ */
+struct librpma_fio_workspace {
+ uint16_t max_msg_num; /* # of RQ slots */
+ uint8_t direct_write_to_pmem; /* Direct Write to PMem is possible */
+ uint8_t mr_desc_size; /* size of mr_desc in descriptor[] */
+ /* buffer containing mr_desc */
+ char descriptor[LIBRPMA_FIO_DESCRIPTOR_MAX_SIZE];
+};
+
+#define LIBRPMA_FIO_PORT_STR_LEN_MAX 12
+
+int librpma_fio_td_port(const char *port_base_str, struct thread_data *td,
+ char *port_out);
+
+/*
+ * A description of the memory allocated by librpma_fio_allocate_dram() or
+ * librpma_fio_allocate_pmem() and released by librpma_fio_free().
+ */
+struct librpma_fio_mem {
+ /* memory buffer */
+ char *mem_ptr;
+
+ /* size of the mapped persistent memory */
+ size_t size_mmap;
+
+#ifdef CONFIG_LIBPMEM2_INSTALLED
+ /* libpmem2 structure used for mapping PMem */
+ struct pmem2_map *map;
+#endif
+};
+
+char *librpma_fio_allocate_dram(struct thread_data *td, size_t size,
+ struct librpma_fio_mem *mem);
+
+char *librpma_fio_allocate_pmem(struct thread_data *td, struct fio_file *f,
+ size_t size, struct librpma_fio_mem *mem);
+
+void librpma_fio_free(struct librpma_fio_mem *mem);
+
+/* clients' common */
+
+/*
+ * An engine-specific flush callback stored in
+ * struct librpma_fio_client_data.flush.
+ *
+ * NOTE(review): based on the GPSPM implementation it requests flushing 'len'
+ * bytes starting at first_io_u->offset and it is tagged with
+ * last_io_u->index - confirm for other engines. Expected to return 0 on
+ * success and -1 on error, as the GPSPM implementation does.
+ */
+typedef int (*librpma_fio_flush_t)(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+/*
+ * An engine-specific callback translating a work completion 'wc' into
+ * an index of an io_u (stored under 'io_u_index').
+ *
+ * RETURN VALUE
+ * - ( 1) - on success
+ * - ( 0) - skip
+ * - (-1) - on error
+ */
+typedef int (*librpma_fio_get_io_u_index_t)(struct ibv_wc *wc,
+ unsigned int *io_u_index);
+
+/* the client's data shared by all librpma-based client engines */
+struct librpma_fio_client_data {
+ struct rpma_peer *peer;
+ struct rpma_conn *conn;
+ /* the completion queue polled for completions of posted operations */
+ struct rpma_cq *cq;
+
+ /* aligned td->orig_buffer */
+ char *orig_buffer_aligned;
+
+ /* io_us' base address memory registration (ccd->orig_buffer_aligned) */
+ struct rpma_mr_local *orig_mr;
+
+ /* the workspace description; presumably received from the server */
+ struct librpma_fio_workspace *ws;
+
+ /* a server's memory representation */
+ struct rpma_mr_remote *server_mr;
+ enum rpma_flush_type server_mr_flush_type;
+
+ /* remote workspace description */
+ size_t ws_size;
+
+ /* in-memory queues (arrays of io_u pointers along with their lengths) */
+ struct io_u **io_us_queued;
+ int io_u_queued_nr;
+ struct io_u **io_us_flight;
+ int io_u_flight_nr;
+ /* consumed from the head by librpma_fio_client_event() */
+ struct io_u **io_us_completed;
+ int io_u_completed_nr;
+
+ /*
+ * SQ control. Note: all of them have to be kept in sync.
+ * All three are reset to 0 as soon as they are equal to each other.
+ */
+ uint32_t op_send_posted;
+ uint32_t op_send_completed;
+ uint32_t op_recv_completed;
+
+ /* engine-specific callbacks */
+ librpma_fio_flush_t flush;
+ librpma_fio_get_io_u_index_t get_io_u_index;
+
+ /* engine-specific client data */
+ void *client_data;
+};
+
+int librpma_fio_client_init(struct thread_data *td,
+ struct rpma_conn_cfg *cfg);
+void librpma_fio_client_cleanup(struct thread_data *td);
+
+int librpma_fio_file_nop(struct thread_data *td, struct fio_file *f);
+int librpma_fio_client_get_file_size(struct thread_data *td,
+ struct fio_file *f);
+
+int librpma_fio_client_post_init(struct thread_data *td);
+
+enum fio_q_status librpma_fio_client_queue(struct thread_data *td,
+ struct io_u *io_u);
+
+int librpma_fio_client_commit(struct thread_data *td);
+
+int librpma_fio_client_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t);
+
+struct io_u *librpma_fio_client_event(struct thread_data *td, int event);
+
+char *librpma_fio_client_errdetails(struct io_u *io_u);
+
+/*
+ * Post an RDMA read of io_u->xfer_buflen bytes from the server's memory
+ * (at io_u->offset) into the io_u's buffer. The io_u's index is attached
+ * to the operation as its context.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static inline int librpma_fio_client_io_read(struct thread_data *td,
+		struct io_u *io_u, int flags)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	/* position of the io_u's buffer within the registered memory */
+	size_t local_off = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+	/* position within the server's memory */
+	size_t remote_off = io_u->offset;
+	void *op_ctx = (void *)(uintptr_t)io_u->index;
+	int ret;
+
+	ret = rpma_read(ccd->conn, ccd->orig_mr, local_off,
+			ccd->server_mr, remote_off, io_u->xfer_buflen,
+			flags, op_ctx);
+	if (ret == 0)
+		return 0;
+
+	librpma_td_verror(td, ret, "rpma_read");
+
+	return -1;
+}
+
+/*
+ * Post an RDMA write of io_u->xfer_buflen bytes from the io_u's buffer
+ * into the server's memory (at io_u->offset). A completion is requested
+ * only on error and the io_u's index is attached as the operation's context.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static inline int librpma_fio_client_io_write(struct thread_data *td,
+		struct io_u *io_u)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	/* position of the io_u's buffer within the registered memory */
+	size_t local_off = (char *)(io_u->xfer_buf) - ccd->orig_buffer_aligned;
+	/* position within the server's memory */
+	size_t remote_off = io_u->offset;
+	void *op_ctx = (void *)(uintptr_t)io_u->index;
+	int ret;
+
+	ret = rpma_write(ccd->conn, ccd->server_mr, remote_off,
+			ccd->orig_mr, local_off, io_u->xfer_buflen,
+			RPMA_F_COMPLETION_ON_ERROR, op_ctx);
+	if (ret == 0)
+		return 0;
+
+	librpma_td_verror(td, ret, "rpma_write");
+
+	return -1;
+}
+
+/*
+ * Busy-poll the completion queue until a completion is collected for each
+ * posted SEND.
+ *
+ * Returns 0 when all posted SENDs are completed. Returns -1 when getting
+ * a completion failed, a completion carries an error status or a completion
+ * of an operation other than SEND was found.
+ */
+static inline int librpma_fio_client_io_complete_all_sends(
+		struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct ibv_wc wc;
+	int ret;
+
+	while (ccd->op_send_posted != ccd->op_send_completed) {
+		/* get a completion */
+		ret = rpma_cq_get_wc(ccd->cq, 1, &wc, NULL);
+		if (ret == RPMA_E_NO_COMPLETION) {
+			/* lack of completion is not an error */
+			continue;
+		} else if (ret != 0) {
+			/*
+			 * An error occurred. It has to be reported to
+			 * the caller instead of pretending all SENDs are
+			 * completed.
+			 */
+			librpma_td_verror(td, ret, "rpma_cq_get_wc");
+			return -1;
+		}
+
+		if (wc.status != IBV_WC_SUCCESS)
+			return -1;
+
+		if (wc.opcode == IBV_WC_SEND)
+			++ccd->op_send_completed;
+		else {
+			log_err(
+				"A completion other than IBV_WC_SEND got during cleaning up the CQ from SENDs\n");
+			return -1;
+		}
+	}
+
+	/*
+	 * All posted SENDs are completed and RECVs for them (responses) are
+	 * completed. This is the initial situation so the counters are reset.
+	 */
+	if (ccd->op_send_posted == ccd->op_send_completed &&
+			ccd->op_send_completed == ccd->op_recv_completed) {
+		ccd->op_send_posted = 0;
+		ccd->op_send_completed = 0;
+		ccd->op_recv_completed = 0;
+	}
+
+	return 0;
+}
+
+/* servers' common */
+
+/*
+ * An optional engine-specific hook called by librpma_fio_server_open_file()
+ * after a connection request is received and before it is accepted.
+ * A non-zero return value makes the open fail.
+ */
+typedef int (*librpma_fio_prepare_connection_t)(
+ struct thread_data *td,
+ struct rpma_conn_req *conn_req);
+
+/* the server's data shared by all librpma-based server engines */
+struct librpma_fio_server_data {
+ /* created by librpma_fio_server_init() */
+ struct rpma_peer *peer;
+
+ /* resources of an incoming connection */
+ struct rpma_conn *conn;
+ struct rpma_cq *cq;
+
+ /* the workspace, its memory registration and its allocation details */
+ char *ws_ptr;
+ struct rpma_mr_local *ws_mr;
+ struct librpma_fio_mem mem;
+
+ /* engine-specific server data */
+ void *server_data;
+
+ /* an optional hook; skipped when NULL */
+ librpma_fio_prepare_connection_t prepare_connection;
+};
+
+int librpma_fio_server_init(struct thread_data *td);
+
+void librpma_fio_server_cleanup(struct thread_data *td);
+
+int librpma_fio_server_open_file(struct thread_data *td,
+ struct fio_file *f, struct rpma_conn_cfg *cfg);
+
+int librpma_fio_server_close_file(struct thread_data *td,
+ struct fio_file *f);
+
+#endif /* LIBRPMA_FIO_H */
--- /dev/null
+/*
+ * librpma_fio_pmem: allocates pmem using libpmem.
+ *
+ * Copyright 2022, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <libpmem.h>
+#include "librpma_fio.h"
+
+#define RPMA_PMEM_USED "libpmem"
+
+/*
+ * Map the whole file f->file_name using libpmem and verify that it resides
+ * on persistent memory and that the mapping is at least ws_offset + size
+ * bytes long.
+ *
+ * On success mem->mem_ptr and mem->size_mmap describe the mapping and 0 is
+ * returned. On error -1 is returned and the mapping (if any) is removed.
+ */
+static int librpma_fio_pmem_map_file(struct fio_file *f, size_t size,
+		struct librpma_fio_mem *mem, size_t ws_offset)
+{
+	size_t mapped_len = 0;
+	int on_pmem = 0;
+
+	/* map the file */
+	mem->mem_ptr = pmem_map_file(f->file_name, 0 /* len */, 0 /* flags */,
+			0 /* mode */, &mapped_len, &on_pmem);
+	if (mem->mem_ptr == NULL) {
+		/* pmem_map_file() sets errno on failure */
+		log_err("fio: pmem_map_file(%s) failed: %s (errno %i)\n",
+			f->file_name, strerror(errno), errno);
+		return -1;
+	}
+
+	if (!on_pmem) {
+		/* pmem is expected */
+		log_err("fio: %s is not located in persistent memory\n",
+			f->file_name);
+	} else if (mapped_len < ws_offset + size) {
+		/* the mapping cannot hold this thread's workspace */
+		log_err(
+			"fio: %s is too small to handle so many threads (%zu < %zu)\n",
+			f->file_name, mapped_len, ws_offset + size);
+	} else {
+		log_info("fio: size of memory mapped from the file %s: %zu\n",
+			f->file_name, mapped_len);
+
+		mem->size_mmap = mapped_len;
+
+		return 0;
+	}
+
+	/* any of the checks has failed - undo the mapping */
+	(void) pmem_unmap(mem->mem_ptr, mapped_len);
+
+	return -1;
+}
+
+/* remove the mapping described by 'mem'; a failure is ignored */
+static inline void librpma_fio_unmap(struct librpma_fio_mem *mem)
+{
+	char *addr = mem->mem_ptr;
+	size_t len = mem->size_mmap;
+
+	(void) pmem_unmap(addr, len);
+}
--- /dev/null
+/*
+ * librpma_fio_pmem2: allocates pmem using libpmem2.
+ *
+ * Copyright 2022, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <libpmem2.h>
+#include "librpma_fio.h"
+
+#define RPMA_PMEM_USED "libpmem2"
+
+/*
+ * Map the file f->file_name using libpmem2 (requiring the cache-line store
+ * granularity) and verify the mapping is at least ws_offset + size bytes
+ * long.
+ *
+ * On success mem->mem_ptr, mem->size_mmap and mem->map describe the mapping
+ * and 0 is returned. On error -1 is returned and no mapping is left behind.
+ * The file descriptor, the source and the config are released in both cases.
+ */
+static int librpma_fio_pmem_map_file(struct fio_file *f, size_t size,
+		struct librpma_fio_mem *mem, size_t ws_offset)
+{
+	struct pmem2_source *source = NULL;
+	struct pmem2_config *config = NULL;
+	struct pmem2_map *mapping = NULL;
+	size_t mapped_len;
+	int result = -1;
+	int fd;
+
+	fd = open(f->file_name, O_RDWR);
+	if (fd < 0) {
+		log_err("fio: cannot open fio file\n");
+		return -1;
+	}
+
+	if (pmem2_source_from_fd(&source, fd) != 0) {
+		log_err("fio: pmem2_source_from_fd() failed\n");
+		goto out_close;
+	}
+
+	if (pmem2_config_new(&config) != 0) {
+		log_err("fio: pmem2_config_new() failed\n");
+		goto out_source_delete;
+	}
+
+	if (pmem2_config_set_required_store_granularity(config,
+			PMEM2_GRANULARITY_CACHE_LINE) != 0) {
+		log_err("fio: pmem2_config_set_required_store_granularity() failed: %s\n", pmem2_errormsg());
+		goto out_config_delete;
+	}
+
+	if (pmem2_map_new(&mapping, config, source) != 0) {
+		log_err("fio: pmem2_map_new(%s) failed: %s\n", f->file_name, pmem2_errormsg());
+		goto out_config_delete;
+	}
+
+	mapped_len = pmem2_map_get_size(mapping);
+
+	/* check size of allocated persistent memory */
+	if (mapped_len < ws_offset + size) {
+		log_err(
+			"fio: %s is too small to handle so many threads (%zu < %zu)\n",
+			f->file_name, mapped_len, ws_offset + size);
+		pmem2_map_delete(&mapping);
+		goto out_config_delete;
+	}
+
+	/* the mapping is fine - hand it over to the caller */
+	mem->mem_ptr = pmem2_map_get_address(mapping);
+	mem->size_mmap = mapped_len;
+	mem->map = mapping;
+	result = 0;
+
+	/* the resources below are released on success and on failure alike */
+out_config_delete:
+	pmem2_config_delete(&config);
+out_source_delete:
+	pmem2_source_delete(&source);
+out_close:
+	close(fd);
+
+	return result;
+}
+
+/* delete the libpmem2 mapping kept in 'mem'; a failure is ignored */
+static inline void librpma_fio_unmap(struct librpma_fio_mem *mem)
+{
+	struct pmem2_map **map_ptr = &mem->map;
+
+	(void) pmem2_map_delete(map_ptr);
+}
--- /dev/null
+/*
+ * librpma_gpspm: IO engine that uses PMDK librpma to write data,
+ * based on General Purpose Server Persistency Method
+ *
+ * Copyright 2020-2022, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include "librpma_fio.h"
+
+#ifdef CONFIG_LIBPMEM2_INSTALLED
+#include <libpmem2.h>
+#else
+#include <libpmem.h>
+#endif
+
+/* Generated by the protocol buffer compiler from: librpma_gpspm_flush.proto */
+#include "librpma_gpspm_flush.pb-c.h"
+
+/*
+ * Layout of a single message buffer: a send message of up to MAX_MSG_SIZE
+ * bytes starting at SEND_OFFSET followed by a receive message of up to
+ * MAX_MSG_SIZE bytes starting at RECV_OFFSET.
+ */
+#define MAX_MSG_SIZE (512)
+#define IO_U_BUF_LEN (2 * MAX_MSG_SIZE)
+#define SEND_OFFSET (0)
+#define RECV_OFFSET (SEND_OFFSET + MAX_MSG_SIZE)
+
+/* an initializer of a flush request having all three value fields zeroed */
+#define GPSPM_FLUSH_REQUEST__LAST \
+ { PROTOBUF_C_MESSAGE_INIT(&gpspm_flush_request__descriptor), 0, 0, 0 }
+
+/*
+ * 'Flush_req_last' is the last flush request
+ * the client has to send to the server to indicate
+ * that the client is done.
+ */
+static const GPSPMFlushRequest Flush_req_last = GPSPM_FLUSH_REQUEST__LAST;
+
+/*
+ * True when either the length or the offset of 'flush_req' (a pointer)
+ * differs from the one of 'Flush_req_last'.
+ * Note: 'flush_req' is not parenthesized so pass a plain identifier.
+ */
+#define IS_NOT_THE_LAST_MESSAGE(flush_req) \
+ (flush_req->length != Flush_req_last.length || \
+ flush_req->offset != Flush_req_last.offset)
+
+/* client side implementation */
+
+/*
+ * Get the offset of the next io_u message buffer in the round-robin fashion.
+ * Note: the macro has a side effect - it increments cd->msg_curr. 'cd' is
+ * not parenthesized so pass a plain identifier.
+ */
+#define IO_U_NEXT_BUF_OFF_CLIENT(cd) \
+ (IO_U_BUF_LEN * ((cd->msg_curr++) % cd->msg_num))
+
+/* GPSPM-specific client data */
+struct client_data {
+ /* memory for send and receive message buffers (msg_num * IO_U_BUF_LEN bytes) */
+ char *io_us_msgs;
+
+ /* resources for messaging buffer */
+ /* # of message buffers */
+ uint32_t msg_num;
+ /* counter used for picking the next message buffer (round-robin) */
+ uint32_t msg_curr;
+ /* memory registration of io_us_msgs */
+ struct rpma_mr_local *msg_mr;
+};
+
+static inline int client_io_flush(struct thread_data *td,
+ struct io_u *first_io_u, struct io_u *last_io_u,
+ unsigned long long int len);
+
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index);
+
+/*
+ * Initialize the GPSPM client: validate the mode, calculate the required
+ * number of WRITEs and flush messages, configure the queues' sizes, run
+ * the common client initialization and install the GPSPM-specific callbacks.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int client_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd;
+	struct client_data *cd;
+	struct rpma_conn_cfg *cfg = NULL;
+	/* # of WRITEs in flight */
+	uint32_t writes;
+	int random_io;
+	int ret;
+
+	/*
+	 * not supported:
+	 * - readwrite = read / trim / randread / randtrim /
+	 *     / rw / randrw / trimwrite
+	 */
+	if (td_read(td) || td_trim(td)) {
+		td_verror(td, EINVAL, "Not supported mode.");
+		return -1;
+	}
+
+	/* allocate client's data */
+	cd = calloc(1, sizeof(*cd));
+	if (!cd) {
+		td_verror(td, errno, "calloc");
+		return -1;
+	}
+
+	/*
+	 * Calculate the required number of WRITEs and FLUSHes.
+	 *
+	 * Note: Each flush is a request (SEND) and response (RECV) pair.
+	 */
+	random_io = td_random(td) != 0;
+	if (!random_io && td->o.sync_io) {
+		/* a single WRITE followed by a single FLUSH */
+		writes = 1;
+		cd->msg_num = 1;
+	} else {
+		/* WRITE * N */
+		writes = td->o.iodepth;
+		if (random_io) {
+			/* FLUSH * N */
+			cd->msg_num = td->o.iodepth;
+		} else {
+			/*
+			 * FLUSH * B where:
+			 * - B == ceil(iodepth / iodepth_batch)
+			 *   which is the number of batches for N writes
+			 */
+			cd->msg_num = LIBRPMA_FIO_CEIL(td->o.iodepth,
+					td->o.iodepth_batch);
+		}
+	}
+
+	/* create a connection configuration object */
+	ret = rpma_conn_cfg_new(&cfg);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+		goto fail_free_cd;
+	}
+
+	/*
+	 * Calculate the required queue sizes where:
+	 * - the send queue (SQ) has to be big enough to accommodate
+	 *   all io_us (WRITEs) and all flush requests (SENDs)
+	 * - the receive queue (RQ) has to be big enough to accommodate
+	 *   all flush responses (RECVs)
+	 * - the completion queue (CQ) has to be big enough to accommodate all
+	 *   success and error completions (sq_size + rq_size)
+	 */
+	ret = rpma_conn_cfg_set_sq_size(cfg, writes + cd->msg_num);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+		goto fail_cfg_delete;
+	}
+
+	ret = rpma_conn_cfg_set_rq_size(cfg, cd->msg_num);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+		goto fail_cfg_delete;
+	}
+
+	ret = rpma_conn_cfg_set_cq_size(cfg, writes + cd->msg_num * 2);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+		goto fail_cfg_delete;
+	}
+
+	if (librpma_fio_client_init(td, cfg))
+		goto fail_cfg_delete;
+
+	ccd = td->io_ops_data;
+
+	/* a hint printed once - by the first thread only */
+	if (ccd->ws->direct_write_to_pmem &&
+			ccd->server_mr_flush_type == RPMA_FLUSH_TYPE_PERSISTENT &&
+			td->thread_number == 1) {
+		/* XXX log_info mixes with the JSON output */
+		log_err(
+			"Note: The server side supports Direct Write to PMem and it is equipped with PMem (direct_write_to_pmem).\n"
+			"You can use librpma_client and librpma_server engines for better performance instead of GPSPM.\n");
+	}
+
+	/* validate the server's RQ capacity */
+	if (cd->msg_num > ccd->ws->max_msg_num) {
+		log_err(
+			"server's RQ size (iodepth) too small to handle the client's workspace requirements (%u < %u)\n",
+			ccd->ws->max_msg_num, cd->msg_num);
+		goto fail_cleanup_common;
+	}
+
+	ret = rpma_conn_cfg_delete(&cfg);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_delete");
+		/* non fatal error - continue */
+	}
+
+	/* install the GPSPM-specific parts */
+	ccd->flush = client_io_flush;
+	ccd->get_io_u_index = client_get_io_u_index;
+	ccd->client_data = cd;
+
+	return 0;
+
+fail_cleanup_common:
+	librpma_fio_client_cleanup(td);
+
+fail_cfg_delete:
+	(void) rpma_conn_cfg_delete(&cfg);
+
+fail_free_cd:
+	free(cd);
+
+	return -1;
+}
+
+/*
+ * Allocate page-aligned memory for all message buffers, register it for
+ * sending and receiving and continue with the common post-initialization.
+ *
+ * Returns a non-zero error code of the failed step or the result of
+ * librpma_fio_client_post_init().
+ */
+static int client_post_init(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd = ccd->client_data;
+	/* a single buffer per each message */
+	unsigned int msgs_len = cd->msg_num * IO_U_BUF_LEN;
+	int ret;
+
+	/* message buffers initialization */
+	ret = posix_memalign((void **)&cd->io_us_msgs, page_size, msgs_len);
+	if (ret != 0) {
+		td_verror(td, ret, "posix_memalign");
+		return ret;
+	}
+
+	/* message buffers registration */
+	ret = rpma_mr_reg(ccd->peer, cd->io_us_msgs, msgs_len,
+			RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+			&cd->msg_mr);
+	if (ret != 0) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		return ret;
+	}
+
+	return librpma_fio_client_post_init(td);
+}
+
+/*
+ * Notify the server that the client is done (by sending the last flush
+ * request) and release all GPSPM-specific and common client's resources.
+ *
+ * It is a no-op when td->io_ops_data is NULL. When the GPSPM-specific data
+ * is missing only the common cleanup is performed.
+ */
+static void client_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd;
+	size_t flush_req_size;
+	size_t io_u_buf_off;
+	size_t send_offset;
+	void *send_ptr;
+	int ret;
+
+	if (ccd == NULL)
+		return;
+
+	cd = ccd->client_data;
+	if (cd == NULL) {
+		librpma_fio_client_cleanup(td);
+		return;
+	}
+
+	/*
+	 * Make sure all SEND completions are collected ergo there are free
+	 * slots in the SQ for the last SEND message.
+	 *
+	 * Note: If any operation will fail we still can send the termination
+	 * notice.
+	 */
+	(void) librpma_fio_client_io_complete_all_sends(td);
+
+	/* prepare the last flush message and pack it to the send buffer */
+	flush_req_size = gpspm_flush_request__get_packed_size(&Flush_req_last);
+	if (flush_req_size > MAX_MSG_SIZE) {
+		log_err(
+			"Packed flush request size is bigger than available send buffer space (%zu > %d\n",
+			flush_req_size, MAX_MSG_SIZE);
+	} else if (cd->io_us_msgs != NULL) {
+		/* without the messaging buffer there is nothing to send from */
+		io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+		send_offset = io_u_buf_off + SEND_OFFSET;
+		send_ptr = cd->io_us_msgs + send_offset;
+		(void) gpspm_flush_request__pack(&Flush_req_last, send_ptr);
+
+		/* send the flush message */
+		ret = rpma_send(ccd->conn, cd->msg_mr, send_offset,
+				flush_req_size, RPMA_F_COMPLETION_ALWAYS,
+				NULL);
+		if (ret) {
+			/*
+			 * Nothing was posted so no completion will show up.
+			 * Counting this SEND would make the wait below spin
+			 * forever.
+			 */
+			librpma_td_verror(td, ret, "rpma_send");
+		} else {
+			++ccd->op_send_posted;
+
+			/* Wait for the SEND to complete */
+			(void) librpma_fio_client_io_complete_all_sends(td);
+		}
+	}
+
+	/* deregister the messaging buffer memory */
+	if ((ret = rpma_mr_dereg(&cd->msg_mr)))
+		librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+	/* release the messaging buffer allocated in client_post_init() */
+	free(cd->io_us_msgs);
+
+	free(cd);
+
+	librpma_fio_client_cleanup(td);
+}
+
+/*
+ * Request flushing of 'len' bytes starting at first_io_u->offset.
+ *
+ * A buffer for the response is posted first, then the flush request (tagged
+ * with last_io_u->index) is packed into the next message buffer and sent.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static inline int client_io_flush(struct thread_data *td,
+		struct io_u *first_io_u, struct io_u *last_io_u,
+		unsigned long long int len)
+{
+	struct librpma_fio_client_data *ccd = td->io_ops_data;
+	struct client_data *cd = ccd->client_data;
+	/* note: picking the buffer advances cd->msg_curr */
+	size_t io_u_buf_off = IO_U_NEXT_BUF_OFF_CLIENT(cd);
+	size_t send_offset = io_u_buf_off + SEND_OFFSET;
+	size_t recv_offset = io_u_buf_off + RECV_OFFSET;
+	void *send_ptr = cd->io_us_msgs + send_offset;
+	void *recv_ptr = cd->io_us_msgs + recv_offset;
+	GPSPMFlushRequest flush_req = GPSPM_FLUSH_REQUEST__INIT;
+	size_t flush_req_size = 0;
+	int ret;
+
+	/* prepare a response buffer */
+	if ((ret = rpma_recv(ccd->conn, cd->msg_mr, recv_offset, MAX_MSG_SIZE,
+			recv_ptr))) {
+		librpma_td_verror(td, ret, "rpma_recv");
+		return -1;
+	}
+
+	/* prepare a flush message and pack it to a send buffer */
+	flush_req.offset = first_io_u->offset;
+	flush_req.length = len;
+	flush_req.op_context = last_io_u->index;
+	flush_req_size = gpspm_flush_request__get_packed_size(&flush_req);
+	if (flush_req_size > MAX_MSG_SIZE) {
+		/* flush_req_size is a size_t so it is printed using %zu */
+		log_err(
+			"Packed flush request size is bigger than available send buffer space (%zu > %d\n",
+			flush_req_size, MAX_MSG_SIZE);
+		return -1;
+	}
+	(void) gpspm_flush_request__pack(&flush_req, send_ptr);
+
+	/* send the flush message */
+	if ((ret = rpma_send(ccd->conn, cd->msg_mr, send_offset, flush_req_size,
+			RPMA_F_COMPLETION_ALWAYS, NULL))) {
+		librpma_td_verror(td, ret, "rpma_send");
+		return -1;
+	}
+
+	++ccd->op_send_posted;
+
+	return 0;
+}
+
+/*
+ * Get the index of the io_u a received flush response belongs to.
+ *
+ * Returns:
+ * - ( 1) - the index is stored under 'io_u_index'
+ * - ( 0) - the completion is not a RECV so it is skipped
+ * - (-1) - the response cannot be unpacked
+ */
+static int client_get_io_u_index(struct ibv_wc *wc, unsigned int *io_u_index)
+{
+	GPSPMFlushResponse *flush_resp;
+
+	if (wc->opcode != IBV_WC_RECV)
+		return 0;
+
+	/*
+	 * Unpack a response from the received buffer. The buffer's address
+	 * travels in wr_id (an integer) so it is converted via uintptr_t.
+	 */
+	flush_resp = gpspm_flush_response__unpack(NULL,
+			wc->byte_len, (void *)(uintptr_t)wc->wr_id);
+	if (flush_resp == NULL) {
+		log_err("Cannot unpack the flush response buffer\n");
+		return -1;
+	}
+
+	/*
+	 * Convert by value instead of copying the leading bytes so the result
+	 * does not depend on the byte order nor on the width of op_context.
+	 */
+	*io_u_index = (unsigned int)flush_resp->op_context;
+
+	gpspm_flush_response__free_unpacked(flush_resp, NULL);
+
+	return 1;
+}
+
+/*
+ * The GPSPM client engine definition. It combines the GPSPM-specific
+ * init / post_init / cleanup with the common librpma_fio_client_* callbacks.
+ * Opening and closing a file are no-ops (librpma_fio_file_nop).
+ */
+FIO_STATIC struct ioengine_ops ioengine_client = {
+ .name = "librpma_gpspm_client",
+ .version = FIO_IOOPS_VERSION,
+ .init = client_init,
+ .post_init = client_post_init,
+ .get_file_size = librpma_fio_client_get_file_size,
+ .open_file = librpma_fio_file_nop,
+ .queue = librpma_fio_client_queue,
+ .commit = librpma_fio_client_commit,
+ .getevents = librpma_fio_client_getevents,
+ .event = librpma_fio_client_event,
+ .errdetails = librpma_fio_client_errdetails,
+ .close_file = librpma_fio_file_nop,
+ .cleanup = client_cleanup,
+ .flags = FIO_DISKLESSIO | FIO_ASYNCIO_SETS_ISSUE_TIME,
+ .options = librpma_fio_options,
+ .option_struct_size = sizeof(struct librpma_fio_options_values),
+};
+
+/* server side implementation */
+
+/* the offset of the i-th message buffer; 'i' may be any expression */
+#define IO_U_BUFF_OFF_SERVER(i) ((i) * IO_U_BUF_LEN)
+
+/* a function making 'size' bytes starting at 'ptr' persistent */
+typedef void (*librpma_fio_persist_fn)(const void *ptr, size_t size);
+
+/* GPSPM-specific server data */
+struct server_data {
+ /* aligned td->orig_buffer */
+ char *orig_buffer_aligned;
+
+ /* resources for messaging buffer from DRAM allocated by fio */
+ struct rpma_mr_local *msg_mr;
+
+ uint32_t msg_sqe_available; /* # of free SQ slots */
+
+ /* in-memory queues */
+ /* an array of td->o.iodepth work completions */
+ struct ibv_wc *msgs_queued;
+ uint32_t msg_queued_nr;
+
+ /* pmem_persist() or a function provided by libpmem2 */
+ librpma_fio_persist_fn persist;
+};
+
+/*
+ * Initialize the GPSPM server: run the common server initialization,
+ * allocate the GPSPM-specific data along with the queue of messages,
+ * pick the persist function and force the io_u buffers' size and alignment.
+ *
+ * Returns 0 on success, the common initialization's error or -1 otherwise.
+ */
+static int server_init(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd;
+	struct server_data *sd;
+	int ret;
+
+	ret = librpma_fio_server_init(td);
+	if (ret != 0)
+		return ret;
+
+	csd = td->io_ops_data;
+
+	/* allocate server's data */
+	sd = calloc(1, sizeof(*sd));
+	if (!sd) {
+		td_verror(td, errno, "calloc");
+		librpma_fio_server_cleanup(td);
+		return -1;
+	}
+
+	/* allocate in-memory queue - a single slot per each io_u */
+	sd->msgs_queued = calloc(td->o.iodepth, sizeof(*sd->msgs_queued));
+	if (!sd->msgs_queued) {
+		td_verror(td, errno, "calloc");
+		free(sd);
+		librpma_fio_server_cleanup(td);
+		return -1;
+	}
+
+#ifdef CONFIG_LIBPMEM2_INSTALLED
+	/*
+	 * get libpmem2 persist function from pmem2_map
+	 * NOTE(review): csd->mem.map is read at the init time - confirm it is
+	 * already set at this point.
+	 */
+	sd->persist = pmem2_get_persist_fn(csd->mem.map);
+#else
+	sd->persist = pmem_persist;
+#endif
+
+	/*
+	 * Assure a single io_u buffer can store both SEND and RECV messages and
+	 * an io_us buffer allocation is page-size-aligned which is required
+	 * to register for RDMA. User-provided values are intentionally ignored.
+	 */
+	td->o.max_bs[DDIR_READ] = IO_U_BUF_LEN;
+	td->o.mem_align = page_size;
+
+	csd->server_data = sd;
+
+	return 0;
+}
+
+/*
+ * Engine post_init callback: compute the aligned start of the io_us buffer
+ * and register it with librpma for SEND and RECV messaging.
+ *
+ * Returns 0 on success, -1 when the io_u buffer is smaller than IO_U_BUF_LEN
+ * or when the memory registration fails.
+ */
+static int server_post_init(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	size_t io_us_size;
+	size_t io_u_buflen;
+	int ret;
+
+	/*
+	 * td->orig_buffer is not aligned. The engine requires aligned io_us
+	 * so FIO aligns up the address using the formula below.
+	 */
+	sd->orig_buffer_aligned = PTR_ALIGN(td->orig_buffer, page_mask) +
+			td->o.mem_align;
+
+	/*
+	 * XXX
+	 * Each io_u message buffer contains recv and send messages.
+	 * Aligning each of those buffers may potentially give
+	 * some performance benefits.
+	 */
+	io_u_buflen = td_max_bs(td);
+
+	/* check whether io_u buffer is big enough */
+	if (io_u_buflen < IO_U_BUF_LEN) {
+		/*
+		 * io_u_buflen is a size_t; convert it explicitly so that the
+		 * argument matches PRIu64 also where size_t is 32 bits wide.
+		 */
+		log_err(
+			"blocksize too small to accommodate assumed maximal request/response pair size (%" PRIu64 " < %d)\n",
+			(uint64_t)io_u_buflen, IO_U_BUF_LEN);
+		return -1;
+	}
+
+	/*
+	 * td->orig_buffer_size beside the space really consumed by io_us
+	 * has paddings which can be omitted for the memory registration.
+	 */
+	io_us_size = (unsigned long long)io_u_buflen *
+			(unsigned long long)td->o.iodepth;
+
+	ret = rpma_mr_reg(csd->peer, sd->orig_buffer_aligned, io_us_size,
+			RPMA_MR_USAGE_SEND | RPMA_MR_USAGE_RECV,
+			&sd->msg_mr);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_mr_reg");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Engine cleanup callback: deregister the messaging buffer, free the
+ * GPSPM-specific state and finally run librpma_fio_server_cleanup().
+ * Does nothing when td->io_ops_data is NULL.
+ */
+static void server_cleanup(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd;
+	int ret;
+
+	if (!csd)
+		return;
+
+	sd = csd->server_data;
+	if (sd) {
+		/* a deregistration failure is reported but does not stop the cleanup */
+		ret = rpma_mr_dereg(&sd->msg_mr);
+		if (ret)
+			librpma_td_verror(td, ret, "rpma_mr_dereg");
+
+		free(sd->msgs_queued);
+		free(sd);
+	}
+
+	librpma_fio_server_cleanup(td);
+}
+
+/*
+ * Post one RECV per io_u on the connection request, so that a buffer is
+ * ready for every flush request, and mark all SQ slots as available.
+ * The io_u index is used as the operation context of each RECV.
+ *
+ * Returns 0 on success or the rpma_conn_req_recv() error code.
+ */
+static int prepare_connection(struct thread_data *td,
+		struct rpma_conn_req *conn_req)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	int idx = 0;
+	int ret;
+
+	sd->msg_sqe_available = td->o.iodepth;
+
+	while (idx < td->o.iodepth) {
+		size_t recv_off = IO_U_BUFF_OFF_SERVER(idx) + RECV_OFFSET;
+
+		ret = rpma_conn_req_recv(conn_req, sd->msg_mr, recv_off,
+				MAX_MSG_SIZE, (const void *)(uintptr_t)idx);
+		if (ret) {
+			librpma_td_verror(td, ret, "rpma_conn_req_recv");
+			return ret;
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Engine open_file callback: build a connection configuration sized for
+ * td->o.iodepth messages and hand it to librpma_fio_server_open_file().
+ *
+ * Returns -1 when the configuration object cannot be created, the librpma
+ * error code when a queue size cannot be set, otherwise the result of
+ * librpma_fio_server_open_file(). The configuration object is always
+ * deleted before returning.
+ */
+static int server_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	uint16_t max_msg_num = td->o.iodepth;
+	struct rpma_conn_cfg *cfg = NULL;
+	int ret;
+
+	csd->prepare_connection = prepare_connection;
+
+	/* create a connection configuration object */
+	ret = rpma_conn_cfg_new(&cfg);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_new");
+		return -1;
+	}
+
+	/*
+	 * Queue sizes required by the server:
+	 * - the send queue (SQ) holds the flush responses (SENDs),
+	 * - the receive queue (RQ) holds the buffers posted for incoming
+	 *   flush requests (RECVs),
+	 * - the completion queue (CQ) has to fit the completions of both
+	 *   (sq_size + rq_size).
+	 */
+	ret = rpma_conn_cfg_set_sq_size(cfg, max_msg_num);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_sq_size");
+		goto out_cfg_delete;
+	}
+
+	ret = rpma_conn_cfg_set_rq_size(cfg, max_msg_num);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_rq_size");
+		goto out_cfg_delete;
+	}
+
+	ret = rpma_conn_cfg_set_cq_size(cfg, max_msg_num * 2);
+	if (ret) {
+		librpma_td_verror(td, ret, "rpma_conn_cfg_set_cq_size");
+		goto out_cfg_delete;
+	}
+
+	ret = librpma_fio_server_open_file(td, f, cfg);
+
+out_cfg_delete:
+	(void) rpma_conn_cfg_delete(&cfg);
+
+	return ret;
+}
+
+/*
+ * Process one queued RECV completion: unpack the flush request, persist the
+ * requested range, re-post the RECV for the same message slot and send the
+ * flush response carrying the request's op_context.
+ *
+ * When the request is the client's last message, only td->done is set and
+ * no response is sent.
+ *
+ * Returns 0 on success. On failure sets td->terminate and returns -1.
+ */
+static int server_qe_process(struct thread_data *td, struct ibv_wc *wc)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	GPSPMFlushRequest *flush_req;
+	GPSPMFlushResponse flush_resp = GPSPM_FLUSH_RESPONSE__INIT;
+	size_t flush_resp_size = 0;
+	size_t send_buff_offset;
+	size_t recv_buff_offset;
+	size_t io_u_buff_offset;
+	void *send_buff_ptr;
+	void *recv_buff_ptr;
+	void *op_ptr;
+	int msg_index;
+	int ret;
+
+	/* calculate SEND/RECV pair parameters */
+	msg_index = (int)(uintptr_t)wc->wr_id;
+	io_u_buff_offset = IO_U_BUFF_OFF_SERVER(msg_index);
+	send_buff_offset = io_u_buff_offset + SEND_OFFSET;
+	recv_buff_offset = io_u_buff_offset + RECV_OFFSET;
+	send_buff_ptr = sd->orig_buffer_aligned + send_buff_offset;
+	recv_buff_ptr = sd->orig_buffer_aligned + recv_buff_offset;
+
+	/* unpack a flush request from the received buffer */
+	flush_req = gpspm_flush_request__unpack(NULL, wc->byte_len,
+			recv_buff_ptr);
+	if (flush_req == NULL) {
+		log_err("cannot unpack the flush request buffer\n");
+		goto err_terminate;
+	}
+
+	if (IS_NOT_THE_LAST_MESSAGE(flush_req)) {
+		/*
+		 * NOTE(review): offset and length come from the peer and are
+		 * used as received; no bounds check against the workspace is
+		 * visible here - confirm whether one is needed.
+		 */
+		op_ptr = csd->ws_ptr + flush_req->offset;
+		sd->persist(op_ptr, flush_req->length);
+	} else {
+		/*
+		 * This is the last message - the client is done.
+		 */
+		gpspm_flush_request__free_unpacked(flush_req, NULL);
+		td->done = true;
+		return 0;
+	}
+
+	/* initiate the next receive operation */
+	if ((ret = rpma_recv(csd->conn, sd->msg_mr, recv_buff_offset,
+			MAX_MSG_SIZE,
+			(const void *)(uintptr_t)msg_index))) {
+		librpma_td_verror(td, ret, "rpma_recv");
+		goto err_free_unpacked;
+	}
+
+	/* prepare a flush response and pack it to a send buffer */
+	flush_resp.op_context = flush_req->op_context;
+	flush_resp_size = gpspm_flush_response__get_packed_size(&flush_resp);
+	if (flush_resp_size > MAX_MSG_SIZE) {
+		/*
+		 * flush_resp_size is a size_t; convert it explicitly so that
+		 * the argument matches PRIu64 on every platform.
+		 */
+		log_err(
+			"Size of the packed flush response is bigger than the available space of the send buffer (%"
+			PRIu64 " > %i)\n", (uint64_t)flush_resp_size,
+			MAX_MSG_SIZE);
+		goto err_free_unpacked;
+	}
+
+	(void) gpspm_flush_response__pack(&flush_resp, send_buff_ptr);
+
+	/* send the flush response */
+	if ((ret = rpma_send(csd->conn, sd->msg_mr, send_buff_offset,
+			flush_resp_size, RPMA_F_COMPLETION_ALWAYS, NULL))) {
+		librpma_td_verror(td, ret, "rpma_send");
+		goto err_free_unpacked;
+	}
+	/* the SEND occupies an SQ slot until its completion is collected */
+	--sd->msg_sqe_available;
+
+	gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+	return 0;
+
+err_free_unpacked:
+	gpspm_flush_request__free_unpacked(flush_req, NULL);
+
+err_terminate:
+	td->terminate = true;
+
+	return -1;
+}
+
+/*
+ * Process as many queued RECV completions as there are free SQ slots, then
+ * move the unprocessed entries to the front of the queue.
+ *
+ * Returns 0 on success or the first non-zero server_qe_process() result; in
+ * the latter case the queue is left as it was.
+ */
+static inline int server_queue_process(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	/* min(# of queue entries, # of SQ entries available) */
+	uint32_t batch = min(sd->msg_queued_nr, sd->msg_sqe_available);
+	uint32_t left;
+	uint32_t n;
+	int ret;
+
+	if (!batch)
+		return 0;
+
+	/* process queued completions */
+	for (n = 0; n < batch; n++) {
+		ret = server_qe_process(td, &sd->msgs_queued[n]);
+		if (ret)
+			return ret;
+	}
+
+	/* progress the queue: shift the remaining entries to the front */
+	left = sd->msg_queued_nr - batch;
+	memmove(sd->msgs_queued, sd->msgs_queued + batch,
+		left * sizeof(*sd->msgs_queued));
+
+	sd->msg_queued_nr = left;
+
+	return 0;
+}
+
+/*
+ * Collect at most one completion from the CQ.
+ *
+ * A RECV completion is appended to the in-memory queue, a SEND completion
+ * releases one SQ slot. Lack of a completion is not an error. Unless
+ * busy_wait_polling is set, the function waits on the CQ once before
+ * polling again.
+ *
+ * The completion is fetched into a local variable and copied to the queue
+ * only when it is a RECV. Polling directly into
+ * msgs_queued[msg_queued_nr] would write one element past the end of the
+ * array whenever the queue already holds td->o.iodepth entries and a
+ * completion (e.g. of a SEND) arrives.
+ *
+ * Returns 0 on success. On failure sets td->terminate and returns -1.
+ */
+static int server_cmpl_process(struct thread_data *td)
+{
+	struct librpma_fio_server_data *csd = td->io_ops_data;
+	struct server_data *sd = csd->server_data;
+	struct librpma_fio_options_values *o = td->eo;
+	struct ibv_wc wc;
+	int ret;
+
+	ret = rpma_cq_get_wc(csd->cq, 1, &wc, NULL);
+	if (ret == RPMA_E_NO_COMPLETION) {
+		if (o->busy_wait_polling)
+			return 0; /* lack of completion is not an error */
+
+		ret = rpma_cq_wait(csd->cq);
+		if (ret == RPMA_E_NO_COMPLETION)
+			return 0; /* lack of completion is not an error */
+		if (ret) {
+			librpma_td_verror(td, ret, "rpma_cq_wait");
+			goto err_terminate;
+		}
+
+		ret = rpma_cq_get_wc(csd->cq, 1, &wc, NULL);
+		if (ret == RPMA_E_NO_COMPLETION)
+			return 0; /* lack of completion is not an error */
+		if (ret) {
+			librpma_td_verror(td, ret, "rpma_cq_get_wc");
+			goto err_terminate;
+		}
+	} else if (ret) {
+		librpma_td_verror(td, ret, "rpma_cq_get_wc");
+		goto err_terminate;
+	}
+
+	/* validate the completion */
+	if (wc.status != IBV_WC_SUCCESS)
+		goto err_terminate;
+
+	if (wc.opcode == IBV_WC_RECV) {
+		/* the queue has exactly td->o.iodepth slots */
+		if (sd->msg_queued_nr >= td->o.iodepth) {
+			log_err("no free slot to queue a RECV completion\n");
+			goto err_terminate;
+		}
+		sd->msgs_queued[sd->msg_queued_nr] = wc;
+		++sd->msg_queued_nr;
+	} else if (wc.opcode == IBV_WC_SEND) {
+		++sd->msg_sqe_available;
+	}
+
+	return 0;
+
+err_terminate:
+	td->terminate = true;
+
+	return -1;
+}
+
+/*
+ * Engine queue callback: keep collecting completions and processing queued
+ * requests until td->done is set.
+ *
+ * Returns FIO_Q_COMPLETED once td->done is set, or FIO_Q_BUSY as soon as
+ * either processing step fails. The io_u argument is not used.
+ */
+static enum fio_q_status server_queue(struct thread_data *td, struct io_u *io_u)
+{
+	for (;;) {
+		if (server_cmpl_process(td))
+			return FIO_Q_BUSY;
+
+		if (server_queue_process(td))
+			return FIO_Q_BUSY;
+
+		if (td->done)
+			break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+/*
+ * Descriptor of the "librpma_gpspm_server" ioengine. The server is a
+ * synchronous engine (FIO_SYNCIO) whose queue callback is server_queue().
+ */
+FIO_STATIC struct ioengine_ops ioengine_server = {
+	/* identification */
+	.name			= "librpma_gpspm_server",
+	.version		= FIO_IOOPS_VERSION,
+	.flags			= FIO_SYNCIO,
+
+	/* job options */
+	.options		= librpma_fio_options,
+	.option_struct_size	= sizeof(struct librpma_fio_options_values),
+
+	/* engine life cycle */
+	.init			= server_init,
+	.post_init		= server_post_init,
+	.cleanup		= server_cleanup,
+
+	/* file handling */
+	.open_file		= server_open_file,
+	.close_file		= librpma_fio_server_close_file,
+	.invalidate		= librpma_fio_file_nop,
+
+	/* I/O path */
+	.queue			= server_queue,
+};
+
+/* register both engines */
+
+/* register the client and the server engine (function is marked fio_init) */
+static void fio_init fio_librpma_gpspm_register(void)
+{
+ register_ioengine(&ioengine_client);
+ register_ioengine(&ioengine_server);
+}
+
+/* unregister the client and the server engine (function is marked fio_exit) */
+static void fio_exit fio_librpma_gpspm_unregister(void)
+{
+ unregister_ioengine(&ioengine_client);
+ unregister_ioengine(&ioengine_server);
+}
--- /dev/null
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+/* Do not generate deprecated warnings for self */
+#ifndef PROTOBUF_C__NO_DEPRECATED
+#define PROTOBUF_C__NO_DEPRECATED
+#endif
+
+#include "librpma_gpspm_flush.pb-c.h"
+void gpspm_flush_request__init
+ (GPSPMFlushRequest *message)
+{
+ static const GPSPMFlushRequest init_value = GPSPM_FLUSH_REQUEST__INIT;
+ *message = init_value;
+}
+size_t gpspm_flush_request__get_packed_size
+ (const GPSPMFlushRequest *message)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_request__pack
+ (const GPSPMFlushRequest *message,
+ uint8_t *out)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_request__pack_to_buffer
+ (const GPSPMFlushRequest *message,
+ ProtobufCBuffer *buffer)
+{
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushRequest *
+ gpspm_flush_request__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data)
+{
+ return (GPSPMFlushRequest *)
+ protobuf_c_message_unpack (&gpspm_flush_request__descriptor,
+ allocator, len, data);
+}
+void gpspm_flush_request__free_unpacked
+ (GPSPMFlushRequest *message,
+ ProtobufCAllocator *allocator)
+{
+ if(!message)
+ return;
+ assert(message->base.descriptor == &gpspm_flush_request__descriptor);
+ protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+void gpspm_flush_response__init
+ (GPSPMFlushResponse *message)
+{
+ static const GPSPMFlushResponse init_value = GPSPM_FLUSH_RESPONSE__INIT;
+ *message = init_value;
+}
+size_t gpspm_flush_response__get_packed_size
+ (const GPSPMFlushResponse *message)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_get_packed_size ((const ProtobufCMessage*)(message));
+}
+size_t gpspm_flush_response__pack
+ (const GPSPMFlushResponse *message,
+ uint8_t *out)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_pack ((const ProtobufCMessage*)message, out);
+}
+size_t gpspm_flush_response__pack_to_buffer
+ (const GPSPMFlushResponse *message,
+ ProtobufCBuffer *buffer)
+{
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ return protobuf_c_message_pack_to_buffer ((const ProtobufCMessage*)message, buffer);
+}
+GPSPMFlushResponse *
+ gpspm_flush_response__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data)
+{
+ return (GPSPMFlushResponse *)
+ protobuf_c_message_unpack (&gpspm_flush_response__descriptor,
+ allocator, len, data);
+}
+void gpspm_flush_response__free_unpacked
+ (GPSPMFlushResponse *message,
+ ProtobufCAllocator *allocator)
+{
+ if(!message)
+ return;
+ assert(message->base.descriptor == &gpspm_flush_response__descriptor);
+ protobuf_c_message_free_unpacked ((ProtobufCMessage*)message, allocator);
+}
+static const ProtobufCFieldDescriptor gpspm_flush_request__field_descriptors[3] =
+{
+ {
+ "offset",
+ 1,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, offset),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+ {
+ "length",
+ 2,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, length),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+ {
+ "op_context",
+ 3,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushRequest, op_context),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+};
+static const unsigned gpspm_flush_request__field_indices_by_name[] = {
+ 1, /* field[1] = length */
+ 0, /* field[0] = offset */
+ 2, /* field[2] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_request__number_ranges[1 + 1] =
+{
+ { 1, 0 },
+ { 0, 3 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_request__descriptor =
+{
+ PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+ "GPSPM_flush_request",
+ "GPSPMFlushRequest",
+ "GPSPMFlushRequest",
+ "",
+ sizeof(GPSPMFlushRequest),
+ 3,
+ gpspm_flush_request__field_descriptors,
+ gpspm_flush_request__field_indices_by_name,
+ 1, gpspm_flush_request__number_ranges,
+ (ProtobufCMessageInit) gpspm_flush_request__init,
+ NULL,NULL,NULL /* reserved[123] */
+};
+static const ProtobufCFieldDescriptor gpspm_flush_response__field_descriptors[1] =
+{
+ {
+ "op_context",
+ 1,
+ PROTOBUF_C_LABEL_REQUIRED,
+ PROTOBUF_C_TYPE_FIXED64,
+ 0, /* quantifier_offset */
+ offsetof(GPSPMFlushResponse, op_context),
+ NULL,
+ NULL,
+ 0, /* flags */
+ 0,NULL,NULL /* reserved1,reserved2, etc */
+ },
+};
+static const unsigned gpspm_flush_response__field_indices_by_name[] = {
+ 0, /* field[0] = op_context */
+};
+static const ProtobufCIntRange gpspm_flush_response__number_ranges[1 + 1] =
+{
+ { 1, 0 },
+ { 0, 1 }
+};
+const ProtobufCMessageDescriptor gpspm_flush_response__descriptor =
+{
+ PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
+ "GPSPM_flush_response",
+ "GPSPMFlushResponse",
+ "GPSPMFlushResponse",
+ "",
+ sizeof(GPSPMFlushResponse),
+ 1,
+ gpspm_flush_response__field_descriptors,
+ gpspm_flush_response__field_indices_by_name,
+ 1, gpspm_flush_response__number_ranges,
+ (ProtobufCMessageInit) gpspm_flush_response__init,
+ NULL,NULL,NULL /* reserved[123] */
+};
--- /dev/null
+/*
+ * Copyright 2020, Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* Generated by the protocol buffer compiler. DO NOT EDIT! */
+/* Generated from: librpma_gpspm_flush.proto */
+
+#ifndef PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+#define PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED
+
+#include <protobuf-c/protobuf-c.h>
+
+PROTOBUF_C__BEGIN_DECLS
+
+#if PROTOBUF_C_VERSION_NUMBER < 1000000
+# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
+#elif 1003003 < PROTOBUF_C_MIN_COMPILER_VERSION
+# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
+#endif
+
+
+typedef struct _GPSPMFlushRequest GPSPMFlushRequest;
+typedef struct _GPSPMFlushResponse GPSPMFlushResponse;
+
+
+/* --- enums --- */
+
+
+/* --- messages --- */
+
+struct _GPSPMFlushRequest
+{
+ ProtobufCMessage base;
+ uint64_t offset;
+ uint64_t length;
+ uint64_t op_context;
+};
+#define GPSPM_FLUSH_REQUEST__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_request__descriptor) \
+ , 0, 0, 0 }
+
+
+struct _GPSPMFlushResponse
+{
+ ProtobufCMessage base;
+ uint64_t op_context;
+};
+#define GPSPM_FLUSH_RESPONSE__INIT \
+ { PROTOBUF_C_MESSAGE_INIT (&gpspm_flush_response__descriptor) \
+ , 0 }
+
+
+/* GPSPMFlushRequest methods */
+void gpspm_flush_request__init
+ (GPSPMFlushRequest *message);
+size_t gpspm_flush_request__get_packed_size
+ (const GPSPMFlushRequest *message);
+size_t gpspm_flush_request__pack
+ (const GPSPMFlushRequest *message,
+ uint8_t *out);
+size_t gpspm_flush_request__pack_to_buffer
+ (const GPSPMFlushRequest *message,
+ ProtobufCBuffer *buffer);
+GPSPMFlushRequest *
+ gpspm_flush_request__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data);
+void gpspm_flush_request__free_unpacked
+ (GPSPMFlushRequest *message,
+ ProtobufCAllocator *allocator);
+/* GPSPMFlushResponse methods */
+void gpspm_flush_response__init
+ (GPSPMFlushResponse *message);
+size_t gpspm_flush_response__get_packed_size
+ (const GPSPMFlushResponse *message);
+size_t gpspm_flush_response__pack
+ (const GPSPMFlushResponse *message,
+ uint8_t *out);
+size_t gpspm_flush_response__pack_to_buffer
+ (const GPSPMFlushResponse *message,
+ ProtobufCBuffer *buffer);
+GPSPMFlushResponse *
+ gpspm_flush_response__unpack
+ (ProtobufCAllocator *allocator,
+ size_t len,
+ const uint8_t *data);
+void gpspm_flush_response__free_unpacked
+ (GPSPMFlushResponse *message,
+ ProtobufCAllocator *allocator);
+/* --- per-message closures --- */
+
+typedef void (*GPSPMFlushRequest_Closure)
+ (const GPSPMFlushRequest *message,
+ void *closure_data);
+typedef void (*GPSPMFlushResponse_Closure)
+ (const GPSPMFlushResponse *message,
+ void *closure_data);
+
+/* --- services --- */
+
+
+/* --- descriptors --- */
+
+extern const ProtobufCMessageDescriptor gpspm_flush_request__descriptor;
+extern const ProtobufCMessageDescriptor gpspm_flush_response__descriptor;
+
+PROTOBUF_C__END_DECLS
+
+
+#endif /* PROTOBUF_C_GPSPM_5fflush_2eproto__INCLUDED */
--- /dev/null
+syntax = "proto2";
+
+message GPSPM_flush_request {
+ /* an offset of a region to be flushed within its memory registration */
+ required fixed64 offset = 1;
+ /* a length of a region to be flushed */
+ required fixed64 length = 2;
+ /* a user-defined operation context */
+ required fixed64 op_context = 3;
+}
+
+message GPSPM_flush_response {
+ /* the operation context of a completed request */
+ required fixed64 op_context = 1;
+}
#include "fio.h"
#include "err.h"
#include "zbd_types.h"
+#include "zbd.h"
struct libzbc_data {
struct zbc_device *zdev;
enum zbc_dev_model model;
uint64_t nr_sectors;
+ uint32_t max_open_seq_req;
};
static int libzbc_get_dev_info(struct libzbc_data *ld, struct fio_file *f)
zbc_get_device_info(ld->zdev, zinfo);
ld->model = zinfo->zbd_model;
ld->nr_sectors = zinfo->zbd_sectors;
+ ld->max_open_seq_req = zinfo->zbd_max_nr_open_seq_req;
dprint(FD_ZBD, "%s: vendor_id:%s, type: %s, model: %s\n",
f->file_name, zinfo->zbd_vendor_id,
return -EINVAL;
}
- if (td_write(td)) {
+ if (td_write(td) || td_trim(td)) {
if (!read_only)
flags |= O_RDWR;
} else if (td_read(td)) {
- if (f->filetype == FIO_TYPE_CHAR && !read_only)
- flags |= O_RDWR;
- else
flags |= O_RDONLY;
- } else if (td_trim(td)) {
- td_verror(td, EINVAL, "libzbc does not support trim");
- log_err("%s: libzbc does not support trim\n", f->file_name);
- return -EINVAL;
- }
-
- if (td->o.oatomic) {
- td_verror(td, EINVAL, "libzbc does not support O_ATOMIC");
- log_err("%s: libzbc does not support O_ATOMIC\n", f->file_name);
- return -EINVAL;
}
ld = calloc(1, sizeof(*ld));
return -ENOMEM;
ret = zbc_open(f->file_name,
- flags | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA, &ld->zdev);
+ flags | ZBC_O_DRV_SCSI | ZBC_O_DRV_ATA,
+ &ld->zdev);
if (ret) {
log_err("%s: zbc_open() failed, err=%d\n",
f->file_name, ret);
struct libzbc_data *ld;
int ret;
- if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
- *model = ZBD_IGNORE;
- return 0;
- }
+ if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+ return -EINVAL;
ret = libzbc_open_dev(td, f, &ld);
if (ret)
default:
/* Treat all these conditions as offline (don't use!) */
zbdz->cond = ZBD_ZONE_COND_OFFLINE;
- break;
+ zbdz->wp = zbdz->start;
}
}
return -ret;
}
+/*
+ * Issue zbc_finish_zone() for every zone of the range starting at byte
+ * offset and spanning length bytes; the range is walked in steps of
+ * td->o.zone_size.
+ *
+ * Returns 0 on success. On the first failure the error is recorded with
+ * td_verror(), the sense data is logged when available, and the negated
+ * zbc_finish_zone() result is returned.
+ */
+static int libzbc_finish_zone(struct thread_data *td, struct fio_file *f,
+			      uint64_t offset, uint64_t length)
+{
+	struct libzbc_data *ld = td->io_ops_data;
+	uint64_t sector = offset >> 9;
+	unsigned int nr_zones;
+	struct zbc_errno err;
+	int i, ret;
+
+	assert(ld);
+	assert(ld->zdev);
+
+	/* number of zones covered by the range, rounded up */
+	nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;
+	assert(nr_zones > 0);
+
+	for (i = 0; i < nr_zones; i++) {
+		ret = zbc_finish_zone(ld->zdev, sector, 0);
+		if (ret) {
+			zbc_errno(ld->zdev, &err);
+			td_verror(td, errno, "zbc_finish_zone failed");
+			if (err.sk)
+				log_err("%s: finish zone failed %s:%s\n",
+					f->file_name,
+					zbc_sk_str(err.sk),
+					zbc_asc_ascq_str(err.asc_ascq));
+			return -ret;
+		}
+
+		/* advance to the first sector of the next zone */
+		sector += td->o.zone_size >> 9;
+	}
+
+	return 0;
+}
+
+/*
+ * Report the device limit on open sequential zones through max_open_zones.
+ * ZBC_NO_LIMIT is reported as 0.
+ *
+ * Returns 0 on success or the libzbc_open_dev() error code.
+ */
+static int libzbc_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				     unsigned int *max_open_zones)
+{
+	struct libzbc_data *ld;
+	int ret = libzbc_open_dev(td, f, &ld);
+
+	if (ret)
+		return ret;
+
+	*max_open_zones = (ld->max_open_seq_req == ZBC_NO_LIMIT) ?
+				0 : ld->max_open_seq_req;
+
+	return 0;
+}
+
ssize_t libzbc_rw(struct thread_data *td, struct io_u *io_u)
{
struct libzbc_data *ld = td->io_ops_data;
ret = zbc_flush(ld->zdev);
if (ret)
log_err("zbc_flush error %zd\n", ret);
- } else if (io_u->ddir != DDIR_TRIM) {
+ } else if (io_u->ddir == DDIR_TRIM) {
+ ret = zbd_do_io_u_trim(td, io_u);
+ if (!ret)
+ ret = EINVAL;
+ } else {
log_err("Unsupported operation %u\n", io_u->ddir);
ret = -EINVAL;
}
.get_zoned_model = libzbc_get_zoned_model,
.report_zones = libzbc_report_zones,
.reset_wp = libzbc_reset_wp,
+ .get_max_open_zones = libzbc_get_max_open_zones,
+ .finish_zone = libzbc_finish_zone,
.queue = libzbc_queue,
.flags = FIO_SYNCIO | FIO_NOEXTEND | FIO_RAWIO,
};
},
};
-/* Alocates nbd_data. */
+/* Allocates nbd_data. */
static int nbd_setup(struct thread_data *td)
{
struct nbd_data *nbd_data;
#include <sys/socket.h>
#include <sys/un.h>
+#ifdef CONFIG_VSOCK
+#include <linux/vm_sockets.h>
+#else
+struct sockaddr_vm {
+};
+#ifndef AF_VSOCK
+#define AF_VSOCK -1
+#endif
+#endif
+
#include "../fio.h"
#include "../verify.h"
#include "../optgroup.h"
struct sockaddr_in addr;
struct sockaddr_in6 addr6;
struct sockaddr_un addr_un;
+ struct sockaddr_vm addr_vm;
uint64_t udp_send_seq;
uint64_t udp_recv_seq;
};
FIO_TYPE_UNIX = 3,
FIO_TYPE_TCP_V6 = 4,
FIO_TYPE_UDP_V6 = 5,
+ FIO_TYPE_VSOCK_STREAM = 6,
};
static int str_hostname_cb(void *data, const char *input);
.oval = FIO_TYPE_UNIX,
.help = "UNIX domain socket",
},
+ { .ival = "vsock",
+ .oval = FIO_TYPE_VSOCK_STREAM,
+ .help = "Virtual socket",
+ },
},
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_NETIO,
return o->proto == FIO_TYPE_UDP_V6 || o->proto == FIO_TYPE_TCP_V6;
}
+/* return non-zero when the selected protocol is FIO_TYPE_VSOCK_STREAM */
+static inline int is_vsock(struct netio_options *o)
+{
+ return o->proto == FIO_TYPE_VSOCK_STREAM;
+}
+
static int set_window_size(struct thread_data *td, int fd)
{
#ifdef CONFIG_NET_WINDOWSIZE
} else if (o->proto == FIO_TYPE_UNIX) {
domain = AF_UNIX;
type = SOCK_STREAM;
+ } else if (is_vsock(o)) {
+ domain = AF_VSOCK;
+ type = SOCK_STREAM;
} else {
log_err("fio: bad network type %d\n", o->proto);
f->fd = -1;
close(f->fd);
return 1;
}
+ } else if (is_vsock(o)) {
+ socklen_t len = sizeof(nd->addr_vm);
+ if (connect(f->fd, (struct sockaddr *) &nd->addr_vm, len) < 0) {
+ td_verror(td, errno, "connect");
+ close(f->fd);
+ return 1;
+ }
} else {
struct sockaddr_un *addr = &nd->addr_un;
socklen_t len;
if (o->proto == FIO_TYPE_TCP) {
socklen = sizeof(nd->addr);
f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr, &socklen);
+ } else if (is_vsock(o)) {
+ socklen = sizeof(nd->addr_vm);
+ f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr_vm, &socklen);
} else {
socklen = sizeof(nd->addr6);
f->fd = accept(nd->listenfd, (struct sockaddr *) &nd->addr6, &socklen);
if (is_ipv6(o)) {
to = (struct sockaddr *) &nd->addr6;
len = sizeof(nd->addr6);
+ } else if (is_vsock(o)) {
+ to = NULL;
+ len = 0;
} else {
to = (struct sockaddr *) &nd->addr;
len = sizeof(nd->addr);
if (is_ipv6(o)) {
len = sizeof(nd->addr6);
to = (struct sockaddr *) &nd->addr6;
+ } else if (is_vsock(o)) {
+ len = sizeof(nd->addr_vm);
+ to = (struct sockaddr *) &nd->addr_vm;
} else {
len = sizeof(nd->addr);
to = (struct sockaddr *) &nd->addr;
memset(&hints, 0, sizeof(hints));
- if (is_tcp(o))
+ if (is_tcp(o) || is_vsock(o))
hints.ai_socktype = SOCK_STREAM;
else
hints.ai_socktype = SOCK_DGRAM;
if (is_ipv6(o))
hints.ai_family = AF_INET6;
+#ifdef CONFIG_VSOCK
+ else if (is_vsock(o))
+ hints.ai_family = AF_VSOCK;
+#endif
else
hints.ai_family = AF_INET;
return 0;
}
+/*
+ * Fill in nd->addr_vm with the vsock address to connect to.
+ *
+ * host is the peer's context ID (CID) as a decimal string, port the vsock
+ * port. The CID has to be a plain decimal number that fits in 32 bits;
+ * anything else (empty string, sign, trailing characters, overflow) is
+ * rejected instead of being silently turned into some other CID.
+ *
+ * Returns 0 on success and 1 on failure (host missing, CID malformed, or
+ * fio built without CONFIG_VSOCK).
+ */
+static int fio_netio_setup_connect_vsock(struct thread_data *td,
+					 const char *host, unsigned short port)
+{
+#ifdef CONFIG_VSOCK
+	struct netio_data *nd = td->io_ops_data;
+	struct sockaddr_vm *addr = &nd->addr_vm;
+	unsigned long long cid;
+	char *end;
+
+	if (!host) {
+		log_err("fio: connect with no host to connect to.\n");
+		if (td_read(td))
+			log_err("fio: did you forget to set 'listen'?\n");
+
+		td_verror(td, EINVAL, "no hostname= set");
+		return 1;
+	}
+
+	/*
+	 * strtoull() skips white space and accepts a sign, so require the
+	 * string to start with a digit before converting it.
+	 */
+	errno = 0;
+	cid = strtoull(host, &end, 10);
+	if (host[0] < '0' || host[0] > '9' || *end != '\0' || errno ||
+	    cid > UINT32_MAX) {
+		log_err("fio: invalid CID %s\n", host);
+		return 1;
+	}
+
+	addr->svm_family = AF_VSOCK;
+	addr->svm_port = port;
+	addr->svm_cid = cid;
+
+	return 0;
+#else
+	td_verror(td, EINVAL, "vsock not supported");
+	return 1;
+#endif
+}
+
static int fio_netio_setup_connect(struct thread_data *td)
{
struct netio_options *o = td->eo;
if (is_udp(o) || is_tcp(o))
return fio_netio_setup_connect_inet(td, td->o.filename,o->port);
+ else if (is_vsock(o))
+ return fio_netio_setup_connect_vsock(td, td->o.filename, o->port);
else
return fio_netio_setup_connect_unix(td, td->o.filename);
}
return 0;
}
+/*
+ * Create a vsock socket of the given type, enable SO_REUSEADDR on it and
+ * bind it to port on VMADDR_CID_ANY. The socket is stored in nd->listenfd.
+ *
+ * Returns 0 on success and 1 on failure; -1 when fio was built without
+ * CONFIG_VSOCK.
+ */
+static int fio_netio_setup_listen_vsock(struct thread_data *td, short port, int type)
+{
+#ifdef CONFIG_VSOCK
+	struct netio_data *nd = td->io_ops_data;
+	struct sockaddr_vm *addr = &nd->addr_vm;
+	int fd, opt;
+	socklen_t len;
+
+	fd = socket(AF_VSOCK, type, 0);
+	if (fd < 0) {
+		td_verror(td, errno, "socket");
+		return 1;
+	}
+
+	opt = 1;
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (void *) &opt, sizeof(opt)) < 0) {
+		td_verror(td, errno, "setsockopt");
+		close(fd);
+		return 1;
+	}
+
+	len = sizeof(*addr);
+
+	addr->svm_family = AF_VSOCK;
+	addr->svm_cid = VMADDR_CID_ANY;
+	/*
+	 * port is a (signed) short: a port above 32767 arrives here as a
+	 * negative value and would be sign-extended into the wider svm_port.
+	 * Go through unsigned short to recover the intended 16-bit port.
+	 */
+	addr->svm_port = (unsigned short) port;
+
+	if (bind(fd, (struct sockaddr *) addr, len) < 0) {
+		td_verror(td, errno, "bind");
+		close(fd);
+		return 1;
+	}
+
+	nd->listenfd = fd;
+	return 0;
+#else
+	td_verror(td, EINVAL, "vsock not supported");
+	return -1;
+#endif
+}
+
static int fio_netio_setup_listen(struct thread_data *td)
{
struct netio_data *nd = td->io_ops_data;
if (is_udp(o) || is_tcp(o))
ret = fio_netio_setup_listen_inet(td, o->port);
+ else if (is_vsock(o))
+ ret = fio_netio_setup_listen_vsock(td, o->port, SOCK_STREAM);
else
ret = fio_netio_setup_listen_unix(td, td->o.filename);
if (o->proto == FIO_TYPE_UNIX && o->port) {
log_err("fio: network IO port not valid with unix socket\n");
return 1;
+ } else if (is_vsock(o) && !o->port) {
+ log_err("fio: network IO requires port for vsock\n");
+ return 1;
} else if (o->proto != FIO_TYPE_UNIX && !o->port) {
log_err("fio: network IO requires port for tcp or udp\n");
return 1;
o->port += td->subjob_number;
- if (!is_tcp(o)) {
+ if (!is_tcp(o) && !is_vsock(o)) {
if (o->listen) {
log_err("fio: listen only valid for TCP proto IO\n");
return 1;
}
if (!td->io_ops_data) {
- nd = malloc(sizeof(*nd));
-
- memset(nd, 0, sizeof(*nd));
+ nd = calloc(1, sizeof(*nd));
nd->listenfd = -1;
nd->pipes[0] = nd->pipes[1] = -1;
td->io_ops_data = nd;
--- /dev/null
+#include <stdlib.h>
+#include <poll.h>
+#include <nfsc/libnfs.h>
+#include <nfsc/libnfs-raw.h>
+#include <nfsc/libnfs-raw-mount.h>
+
+#include "../fio.h"
+#include "../optgroup.h"
+
+/*
+ * Kinds of NFS workload. None of the engine code shown alongside this enum
+ * refers to these values.
+ * NOTE(review): going by the names only, these look like plain read/write
+ * versus metadata-style (stat/mkdir/rmdir, stat/touch/rm) workloads -
+ * confirm before relying on that.
+ */
+enum nfs_op_type {
+ NFS_READ_WRITE = 0,
+ NFS_STAT_MKDIR_RMDIR,
+ NFS_STAT_TOUCH_RM,
+};
+
+/*
+ * Per-job engine state, reached through td->eo. Besides the user supplied
+ * nfs_url option it carries the libnfs context created by do_mount() and the
+ * bookkeeping for the ring of completed-but-not-yet-reaped io_us.
+ */
+struct fio_libnfs_options {
+ struct nfs_context *context;
+ char *nfs_url;
+ /* nfs_callback needs this info, but doesn't have fio td structure to
+ * pull it from
+ */
+ unsigned int queue_depth;
+
+ /* the following implement a circular queue of outstanding IOs */
+
+ /* IOs issued to libnfs, that have not returned yet */
+ int outstanding_events;
+ /* event last returned via fio_libnfs_event */
+ int prev_requested_event_index;
+ int next_buffered_event; /* round robin-pointer within events[] */
+ int buffered_event_count; /* IOs completed by libnfs, waiting for FIO */
+ int free_event_buffer_index; /* next free buffer */
+ /* ring of completed io_us; do_mount() allocates one slot per iodepth */
+ struct io_u**events;
+};
+
+/*
+ * Per-file state stored in f->engine_data by fio_libnfs_open(): the libnfs
+ * file handle filled in by nfs_open() and a pointer back to the job's
+ * engine options.
+ */
+struct nfs_data {
+ struct nfsfh *nfsfh;
+ struct fio_libnfs_options *options;
+};
+
+/*
+ * Engine specific options. The only one is nfs_url, stored as a string in
+ * struct fio_libnfs_options; fio_libnfs_open() refuses to run without it.
+ * The entry with a NULL name ends the table.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "nfs_url",
+ .lname = "nfs_url",
+ .type = FIO_OPT_STR_STORE,
+ .help = "URL in libnfs format, eg nfs://<server|ipv4|"
+ "ipv6>/path[?arg=val[&arg=val]*]",
+ .off1 = offsetof(struct fio_libnfs_options, nfs_url),
+ .category = FIO_OPT_C_ENGINE,
+ .group = __FIO_OPT_G_NFS,
+ },
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Hand the next completed io_u to fio.
+ *
+ * Pops the entry at next_buffered_event from the completion ring, advances
+ * the ring position and decrements the buffered count. 'event' has to be 0
+ * or directly follow the index of the previous call; this is asserted.
+ */
+static struct io_u *fio_libnfs_event(struct thread_data *td, int event)
+{
+	struct fio_libnfs_options *opts = td->eo;
+	const int slot = opts->next_buffered_event;
+	struct io_u *done = opts->events[slot];
+
+	/* the slot we are about to consume must hold a completion */
+	assert(done);
+	opts->events[slot] = NULL;
+	opts->next_buffered_event = (slot + 1) % td->o.iodepth;
+
+	/* there must be something buffered for us to return */
+	assert(opts->buffered_event_count);
+	opts->buffered_event_count--;
+
+	/* events have to be requested in sequence, restarting from 0 */
+	assert(event == 0 || opts->prev_requested_event_index + 1 == event);
+	opts->prev_requested_event_index =
+		opts->buffered_event_count ? event : -1;
+
+	return done;
+}
+
+/*
+ * fio core logic seems to stop calling this event-loop if we ever return with
+ * 0 events
+ *
+ * SHOULD_WAIT() is true when the queue is full (outstanding_events equals
+ * iodepth), or when 'flush' is set and anything is still outstanding. All
+ * macro arguments are parenthesized so that any expression can be passed.
+ */
+#define SHOULD_WAIT(td, o, flush)			\
+	((o)->outstanding_events == (td)->o.iodepth ||	\
+	 ((flush) && (o)->outstanding_events))
+
+/*
+ * Drive libnfs until completions are available.
+ *
+ * Returns straight away if completions are already buffered. Otherwise it
+ * polls the libnfs socket and services it, blocking in poll() only while
+ * SHOULD_WAIT() holds. Returns the number of buffered completions.
+ */
+static int nfs_event_loop(struct thread_data *td, bool flush)
+{
+	struct fio_libnfs_options *opts = td->eo;
+	struct pollfd pfd;
+
+	/* completions are already waiting for fio, skip the poll() */
+	if (opts->buffered_event_count)
+		return opts->buffered_event_count;
+
+	do {
+		int wait_ms = 0;
+
+		if (SHOULD_WAIT(td, opts, flush))
+			wait_ms = -1;
+
+		pfd.fd = nfs_get_fd(opts->context);
+		pfd.events = nfs_which_events(opts->context);
+
+		if (poll(&pfd, 1, wait_ms) < 0) {
+			/* interrupted polls fall through to the loop test */
+			if (errno != EINTR && errno != EAGAIN) {
+				log_err("nfs: failed to poll events: %s\n", strerror(errno));
+				break;
+			}
+		} else if (nfs_service(opts->context, pfd.revents) < 0) {
+			log_err("nfs: socket is in an unrecoverable error state.\n");
+			break;
+		}
+	} while (SHOULD_WAIT(td, opts, flush));
+
+	return opts->buffered_event_count;
+}
+
+/*
+ * getevents hook: runs the event loop without flushing and returns the
+ * number of buffered completions. 'min', 'max' and 't' are not used.
+ */
+static int fio_libnfs_getevents(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
+{
+ return nfs_event_loop(td, false);
+}
+
+/*
+ * Completion callback passed to the libnfs async read/write calls.
+ *
+ * 'private_data' is the io_u the operation was issued for. A negative 'res'
+ * is stored as a positive io_u->error; for reads, 'res' bytes are copied
+ * from 'data' into the io_u buffer. The io_u is then appended to the
+ * completion ring and the outstanding/buffered counters are updated.
+ */
+static void nfs_callback(int res, struct nfs_context *nfs, void *data,
+			 void *private_data)
+{
+	struct io_u *io_u = private_data;
+	struct nfs_data *file_state = io_u->file->engine_data;
+	struct fio_libnfs_options *opts = file_state->options;
+	/* bytes accounted as transferred; stays 0 for a failed operation */
+	int transferred = 0;
+	int slot;
+
+	if (res < 0) {
+		log_err("Failed NFS operation(code:%d): %s\n", res,
+			nfs_get_error(opts->context));
+		io_u->error = -res;
+	} else {
+		transferred = res;
+		if (io_u->ddir == DDIR_READ) {
+			memcpy(io_u->buf, data, res);
+			if (!res)
+				log_err("Got NFS EOF, this is probably not expected\n");
+		}
+	}
+
+	/* fio uses resid to track remaining data */
+	io_u->resid = io_u->xfer_buflen - transferred;
+
+	slot = opts->free_event_buffer_index;
+	assert(!opts->events[slot]);
+	opts->events[slot] = io_u;
+	opts->free_event_buffer_index = (slot + 1) % opts->queue_depth;
+	opts->outstanding_events--;
+	opts->buffered_event_count++;
+}
+
+/*
+ * Submit an asynchronous write of io_u->buflen bytes at io_u->offset.
+ * Completion is delivered through nfs_callback(); the value returned by
+ * nfs_pwrite_async() is passed back unchanged.
+ */
+static int queue_write(struct fio_libnfs_options *o, struct io_u *io_u)
+{
+	struct nfs_data *file_state = io_u->engine_data;
+	struct nfsfh *fh = file_state->nfsfh;
+
+	return nfs_pwrite_async(o->context, fh, io_u->offset, io_u->buflen,
+				io_u->buf, nfs_callback, io_u);
+}
+
+/*
+ * Submit an asynchronous read of io_u->buflen bytes at io_u->offset.
+ * Completion is delivered through nfs_callback(); the value returned by
+ * nfs_pread_async() is passed back unchanged.
+ */
+static int queue_read(struct fio_libnfs_options *o, struct io_u *io_u)
+{
+	struct nfs_data *file_state = io_u->engine_data;
+	struct nfsfh *fh = file_state->nfsfh;
+
+	return nfs_pread_async(o->context, fh, io_u->offset, io_u->buflen,
+			       nfs_callback, io_u);
+}
+
+/*
+ * queue hook: hand one io_u to libnfs.
+ *
+ * Reads and writes are submitted asynchronously and reported as
+ * FIO_Q_QUEUED. Trims and unknown directions are rejected. When submission
+ * fails the io_u is returned as FIO_Q_COMPLETED with io_u->error set (as
+ * nfs_callback() does for failed operations) and td->error raised.
+ */
+static enum fio_q_status fio_libnfs_queue(struct thread_data *td,
+					  struct io_u *io_u)
+{
+	struct nfs_data *nfs_data = io_u->file->engine_data;
+	struct fio_libnfs_options *o = nfs_data->options;
+	struct nfs_context *nfs = o->context;
+	int err;
+
+	/* queue_read()/queue_write() pick the file handle up from here */
+	io_u->engine_data = nfs_data;
+	switch (io_u->ddir) {
+	case DDIR_WRITE:
+		err = queue_write(o, io_u);
+		break;
+	case DDIR_READ:
+		err = queue_read(o, io_u);
+		break;
+	case DDIR_TRIM:
+		log_err("nfs: trim is not supported\n");
+		err = -1;
+		break;
+	default:
+		log_err("nfs: unhandled io %d\n", io_u->ddir);
+		err = -1;
+		break;
+	}
+	if (err) {
+		log_err("nfs: Failed to queue nfs op: %s\n", nfs_get_error(nfs));
+		io_u->error = EIO;
+		td->error = 1;
+		return FIO_Q_COMPLETED;
+	}
+	o->outstanding_events++;
+	return FIO_Q_QUEUED;
+}
+
+/*
+ * Do a mount if one has not been done before
+ *
+ * Creates the libnfs context, allocates the completion ring (one slot per
+ * iodepth) and mounts the export named by 'url'. Returns 0 on success or
+ * when a context already exists, and non-zero on failure. On failure the
+ * context is left in place so the caller can still ask it for the error
+ * text; fio_libnfs_cleanup() releases it.
+ */
+static int do_mount(struct thread_data *td, const char *url)
+{
+	struct fio_libnfs_options *options = td->eo;
+	struct nfs_url *nfs_url;
+	size_t path_len;
+	char *mnt_dir;
+	int ret;
+
+	if (options->context)
+		return 0;
+
+	options->context = nfs_init_context();
+	if (!options->context) {
+		log_err("nfs: failed to init nfs context\n");
+		return -1;
+	}
+
+	options->events = calloc(td->o.iodepth, sizeof(*options->events));
+	if (!options->events) {
+		log_err("nfs: failed to allocate event buffer\n");
+		return -1;
+	}
+
+	options->prev_requested_event_index = -1;
+	options->queue_depth = td->o.iodepth;
+
+	nfs_url = nfs_parse_url_full(options->context, url);
+	if (!nfs_url) {
+		log_err("nfs: failed to parse url %s\n", url);
+		return -1;
+	}
+
+	/* the directory to mount is the url's path followed by its file part */
+	path_len = strlen(nfs_url->path);
+	mnt_dir = malloc(path_len + strlen(nfs_url->file) + 1);
+	if (!mnt_dir) {
+		log_err("nfs: failed to allocate mount path\n");
+		nfs_destroy_url(nfs_url);
+		return -1;
+	}
+	strcpy(mnt_dir, nfs_url->path);
+	strcpy(mnt_dir + path_len, nfs_url->file);
+
+	ret = nfs_mount(options->context, nfs_url->server, mnt_dir);
+	free(mnt_dir);
+	nfs_destroy_url(nfs_url);
+	return ret;
+}
+
+/*
+ * setup hook: overrides the job's use_thread setting to 0. Always
+ * returns 0.
+ */
+static int fio_libnfs_setup(struct thread_data *td)
+{
+ /* Using threads with libnfs causes fio to hang on exit, lower
+ * performance
+ */
+ td->o.use_thread = 0;
+ return 0;
+}
+
+/*
+ * cleanup hook: unmount and destroy the libnfs context and free the
+ * completion ring.
+ *
+ * The context only exists once do_mount() has run, which happens from
+ * fio_libnfs_open(); if no file was ever opened there is nothing to
+ * unmount, so the libnfs calls are skipped. Pointers are cleared so a
+ * repeated call is harmless.
+ */
+static void fio_libnfs_cleanup(struct thread_data *td)
+{
+	struct fio_libnfs_options *o = td->eo;
+
+	if (o->context) {
+		nfs_umount(o->context);
+		nfs_destroy_context(o->context);
+		o->context = NULL;
+	}
+
+	free(o->events);
+	o->events = NULL;
+}
+
+/*
+ * open_file hook: mount the export if that has not happened yet, then open
+ * 'f' through libnfs.
+ *
+ * On success the per-file state is stored in f->engine_data and 0 is
+ * returned. On failure a non-zero value is returned, nothing is left
+ * allocated for the file and f->engine_data is NULL.
+ */
+static int fio_libnfs_open(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_libnfs_options *options = td->eo;
+	struct nfs_data *nfs_data;
+	int flags = O_RDWR;
+	int ret;
+
+	if (!options->nfs_url) {
+		log_err("nfs: nfs_url is a required parameter\n");
+		return -1;
+	}
+
+	ret = do_mount(td, options->nfs_url);
+	if (ret) {
+		/* do_mount() can fail before a context exists */
+		log_err("nfs: Failed to mount %s with code %d: %s\n",
+			options->nfs_url, ret,
+			options->context ? nfs_get_error(options->context) :
+					   "no nfs context");
+		return ret;
+	}
+
+	nfs_data = calloc(1, sizeof(*nfs_data));
+	if (!nfs_data) {
+		log_err("nfs: failed to allocate file state\n");
+		return -1;
+	}
+	nfs_data->options = options;
+
+	/* write-only jobs may have to create the file first */
+	if (td->o.td_ddir == TD_DDIR_WRITE)
+		flags |= O_CREAT;
+
+	ret = nfs_open(options->context, f->file_name, flags, &nfs_data->nfsfh);
+	if (ret) {
+		log_err("Failed to open %s: %s\n", f->file_name,
+			nfs_get_error(options->context));
+		free(nfs_data);
+		f->engine_data = NULL;
+		return ret;
+	}
+
+	f->engine_data = nfs_data;
+	return 0;
+}
+
+/*
+ * close_file hook: close the libnfs handle (if one was opened) and free the
+ * per-file state. Returns the result of nfs_close(), or 0 when there was
+ * nothing to close.
+ */
+static int fio_libnfs_close(struct thread_data *td, struct fio_file *f)
+{
+	struct nfs_data *nfs_data = f->engine_data;
+	int ret = 0;
+
+	/* nothing was attached to this file, e.g. the open never succeeded */
+	if (!nfs_data)
+		return 0;
+
+	if (nfs_data->nfsfh)
+		ret = nfs_close(nfs_data->options->context, nfs_data->nfsfh);
+
+	free(nfs_data);
+	f->engine_data = NULL;
+	return ret;
+}
+
+/*
+ * Engine descriptor for the "nfs" ioengine: wires the hooks defined in this
+ * file into fio and attaches the engine specific option table.
+ */
+static struct ioengine_ops ioengine = {
+ .name = "nfs",
+ .version = FIO_IOOPS_VERSION,
+ .setup = fio_libnfs_setup,
+ .queue = fio_libnfs_queue,
+ .getevents = fio_libnfs_getevents,
+ .event = fio_libnfs_event,
+ .cleanup = fio_libnfs_cleanup,
+ .open_file = fio_libnfs_open,
+ .close_file = fio_libnfs_close,
+ .flags = FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+ .options = options,
+ .option_struct_size = sizeof(struct fio_libnfs_options),
+};
+
+/* Registers the "nfs" engine descriptor with fio. */
+static void fio_init fio_nfs_register(void)
+{
+ register_ioengine(&ioengine);
+}
+
+/* Removes the "nfs" engine descriptor from fio again. */
+static void fio_exit fio_nfs_unregister(void)
+{
+ unregister_ioengine(&ioengine);
+}
*
* It also can act as external C++ engine - compiled with:
*
- * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c -DFIO_EXTERNAL_ENGINE
+ * g++ -O2 -g -shared -rdynamic -fPIC -o cpp_null null.c \
+ * -include ../config-host.h -DFIO_EXTERNAL_ENGINE
*
* to test it execute:
*
return ret;
}
+/*
+ * Stamp every io_u queued so far with one common issue time and report it
+ * to fio as queued. Does nothing unless fio_fill_issue_time() says issue
+ * times are wanted for this job.
+ */
+static void null_queued(struct thread_data *td, struct null_data *nd)
+{
+	if (fio_fill_issue_time(td)) {
+		struct timespec stamp;
+		int idx = 0;
+
+		/* a single timestamp is shared by the whole batch */
+		fio_gettime(&stamp, NULL);
+
+		while (idx < nd->queued) {
+			struct io_u *cur = nd->io_us[idx++];
+
+			memcpy(&cur->issue_time, &stamp, sizeof(stamp));
+			io_u_queued(td, cur);
+		}
+	}
+}
+
static int null_commit(struct thread_data *td, struct null_data *nd)
{
if (!nd->events) {
+ null_queued(td, nd);
+
#ifndef FIO_EXTERNAL_ENGINE
io_u_mark_submit(td, nd->queued);
#endif
static struct null_data *null_init(struct thread_data *td)
{
- struct null_data *nd = (struct null_data *) malloc(sizeof(*nd));
+ struct null_data *nd;
+ nd = malloc(sizeof(*nd));
memset(nd, 0, sizeof(*nd));
if (td->o.iodepth != 1) {
- nd->io_us = (struct io_u **) malloc(td->o.iodepth * sizeof(struct io_u *));
- memset(nd->io_us, 0, td->o.iodepth * sizeof(struct io_u *));
+ nd->io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
+ td->io_ops->flags |= FIO_ASYNCIO_SETS_ISSUE_TIME;
} else
td->io_ops->flags |= FIO_SYNCIO;
+ td_set_ioengine_flags(td);
return nd;
}
return null_commit(td, impl_);
}
- int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+ fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u)
{
return null_queue(td, impl_, io_u);
}
return NullData::get(td)->fio_null_commit(td);
}
-static int fio_null_queue(struct thread_data *td, struct io_u *io_u)
+static fio_q_status fio_null_queue(struct thread_data *td, struct io_u *io_u)
{
return NullData::get(td)->fio_null_queue(td, io_u);
}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nvme structure declarations and helper functions for the
+ * io_uring_cmd engine.
+ */
+
+#include "nvme.h"
+#include "../crc/crc-t10dif.h"
+#include "../crc/crc64.h"
+
+/*
+ * Convert a byte offset into a starting LBA: divide by the extended LBA
+ * size when one is set, otherwise shift by lba_shift.
+ */
+static inline __u64 get_slba(struct nvme_data *data, __u64 offset)
+{
+	return data->lba_ext ? offset / data->lba_ext :
+			       offset >> data->lba_shift;
+}
+
+/*
+ * Convert a byte length into a 0-based block count (blocks - 1): divide by
+ * the extended LBA size when one is set, otherwise shift by lba_shift.
+ */
+static inline __u32 get_nlb(struct nvme_data *data, __u64 len)
+{
+	__u64 blocks = data->lba_ext ? len / data->lba_ext :
+				       len >> data->lba_shift;
+
+	return blocks - 1;
+}
+
+/*
+ * Prepare 16-bit-guard protection information (PI) for an I/O.
+ *
+ * For every direction this records in pi_data->interval the byte offset of
+ * the PI tuple (within the extended block when lba_ext is set, otherwise
+ * within the per-block metadata). For writes it then walks all logical
+ * blocks of the io_u and fills in the guard, application tag and reference
+ * tag that the PRCHK flags in opts->io_flags ask for.
+ */
+static void fio_nvme_generate_pi_16b_guard(struct nvme_data *data,
+ struct io_u *io_u,
+ struct nvme_cmd_ext_io_opts *opts)
+{
+ struct nvme_pi_data *pi_data = io_u->engine_data;
+ struct nvme_16b_guard_pif *pi;
+ unsigned char *buf = io_u->xfer_buf;
+ unsigned char *md_buf = io_u->mmap_data;
+ __u64 slba = get_slba(data, io_u->offset);
+ __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
+ __u32 lba_num = 0;
+ __u16 guard = 0;
+
+ /*
+ * pi_loc set: the tuple sits at the start of the metadata; otherwise it
+ * occupies the last sizeof(struct nvme_16b_guard_pif) bytes of it.
+ */
+ if (data->pi_loc) {
+ if (data->lba_ext)
+ pi_data->interval = data->lba_ext - data->ms;
+ else
+ pi_data->interval = 0;
+ } else {
+ if (data->lba_ext)
+ pi_data->interval = data->lba_ext - sizeof(struct nvme_16b_guard_pif);
+ else
+ pi_data->interval = data->ms - sizeof(struct nvme_16b_guard_pif);
+ }
+
+ /* only writes generate PI; other directions just needed the interval */
+ if (io_u->ddir != DDIR_WRITE)
+ return;
+
+ while (lba_num < nlb) {
+ if (data->lba_ext)
+ pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
+ else
+ pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
+ /* the CRC covers everything in the block that precedes the tuple */
+ if (data->lba_ext) {
+ guard = fio_crc_t10dif(0, buf, pi_data->interval);
+ } else {
+ guard = fio_crc_t10dif(0, buf, data->lba_size);
+ guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
+ }
+ pi->guard = cpu_to_be16(guard);
+ }
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
+ pi->apptag = cpu_to_be16(pi_data->apptag);
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
+ switch (data->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ /* reference tag is the low 32 bits of this block's LBA */
+ pi->srtag = cpu_to_be32((__u32)slba + lba_num);
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ break;
+ }
+ }
+ /* step to the next logical block (and its metadata, if separate) */
+ if (data->lba_ext) {
+ buf += data->lba_ext;
+ } else {
+ buf += data->lba_size;
+ md_buf += data->ms;
+ }
+ lba_num++;
+ }
+}
+
+/*
+ * Check the 16-bit-guard protection information of every logical block in
+ * an io_u.
+ *
+ * Blocks whose tags carry the "disabled" escape values are skipped. For the
+ * others, the guard, application tag and reference tag are compared as
+ * selected by the PRCHK flags in pi_data->io_flags. Returns 0 when all
+ * checks pass and -EIO on the first mismatch; the log message names the
+ * LBA of the block that failed.
+ */
+static int fio_nvme_verify_pi_16b_guard(struct nvme_data *data,
+					struct io_u *io_u)
+{
+	struct nvme_pi_data *pi_data = io_u->engine_data;
+	struct nvme_16b_guard_pif *pi;
+	struct fio_file *f = io_u->file;
+	unsigned char *buf = io_u->xfer_buf;
+	unsigned char *md_buf = io_u->mmap_data;
+	__u64 slba = get_slba(data, io_u->offset);
+	__u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
+	__u32 lba_num = 0;
+	__u16 unmask_app, unmask_app_exp, guard = 0;
+
+	while (lba_num < nlb) {
+		if (data->lba_ext)
+			pi = (struct nvme_16b_guard_pif *)(buf + pi_data->interval);
+		else
+			pi = (struct nvme_16b_guard_pif *)(md_buf + pi_data->interval);
+
+		/* escape values mean this block carries no checkable PI */
+		if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
+			if (pi->apptag == NVME_PI_APP_DISABLE &&
+			    pi->srtag == NVME_PI_REF_DISABLE)
+				goto next;
+		} else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
+			   data->pi_type == NVME_NS_DPS_PI_TYPE2) {
+			if (pi->apptag == NVME_PI_APP_DISABLE)
+				goto next;
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
+			if (data->lba_ext) {
+				guard = fio_crc_t10dif(0, buf, pi_data->interval);
+			} else {
+				guard = fio_crc_t10dif(0, buf, data->lba_size);
+				guard = fio_crc_t10dif(guard, md_buf, pi_data->interval);
+			}
+			if (be16_to_cpu(pi->guard) != guard) {
+				log_err("%s: Guard compare error: LBA: %llu Expected=%x, Actual=%x\n",
+					f->file_name,
+					(unsigned long long)(slba + lba_num),
+					guard, be16_to_cpu(pi->guard));
+				return -EIO;
+			}
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
+			unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
+			unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
+			if (unmask_app != unmask_app_exp) {
+				log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
+					f->file_name,
+					(unsigned long long)(slba + lba_num),
+					unmask_app_exp, unmask_app);
+				return -EIO;
+			}
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
+			switch (data->pi_type) {
+			case NVME_NS_DPS_PI_TYPE1:
+			case NVME_NS_DPS_PI_TYPE2:
+				if (be32_to_cpu(pi->srtag) !=
+				    ((__u32)slba + lba_num)) {
+					log_err("%s: REFTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
+						f->file_name,
+						(unsigned long long)(slba + lba_num),
+						(__u32)slba + lba_num,
+						be32_to_cpu(pi->srtag));
+					return -EIO;
+				}
+				break;
+			case NVME_NS_DPS_PI_TYPE3:
+				break;
+			}
+		}
+next:
+		/* step to the next logical block (and its metadata, if separate) */
+		if (data->lba_ext) {
+			buf += data->lba_ext;
+		} else {
+			buf += data->lba_size;
+			md_buf += data->ms;
+		}
+		lba_num++;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare 64-bit-guard protection information (PI) for an I/O.
+ *
+ * For every direction this records in pi_data->interval the byte offset of
+ * the PI tuple (within the extended block when lba_ext is set, otherwise
+ * within the per-block metadata). For writes it then walks all logical
+ * blocks of the io_u and fills in the guard, application tag and reference
+ * tag that the PRCHK flags in opts->io_flags ask for.
+ */
+static void fio_nvme_generate_pi_64b_guard(struct nvme_data *data,
+ struct io_u *io_u,
+ struct nvme_cmd_ext_io_opts *opts)
+{
+ struct nvme_pi_data *pi_data = io_u->engine_data;
+ struct nvme_64b_guard_pif *pi;
+ unsigned char *buf = io_u->xfer_buf;
+ unsigned char *md_buf = io_u->mmap_data;
+ uint64_t guard = 0;
+ __u64 slba = get_slba(data, io_u->offset);
+ __u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
+ __u32 lba_num = 0;
+
+ /*
+ * pi_loc set: the tuple sits at the start of the metadata; otherwise it
+ * occupies the last sizeof(struct nvme_64b_guard_pif) bytes of it.
+ */
+ if (data->pi_loc) {
+ if (data->lba_ext)
+ pi_data->interval = data->lba_ext - data->ms;
+ else
+ pi_data->interval = 0;
+ } else {
+ if (data->lba_ext)
+ pi_data->interval = data->lba_ext - sizeof(struct nvme_64b_guard_pif);
+ else
+ pi_data->interval = data->ms - sizeof(struct nvme_64b_guard_pif);
+ }
+
+ /* only writes generate PI; other directions just needed the interval */
+ if (io_u->ddir != DDIR_WRITE)
+ return;
+
+ while (lba_num < nlb) {
+ if (data->lba_ext)
+ pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
+ else
+ pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
+ /* the CRC covers everything in the block that precedes the tuple */
+ if (data->lba_ext) {
+ guard = fio_crc64_nvme(0, buf, pi_data->interval);
+ } else {
+ guard = fio_crc64_nvme(0, buf, data->lba_size);
+ guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
+ }
+ pi->guard = cpu_to_be64(guard);
+ }
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP)
+ pi->apptag = cpu_to_be16(pi_data->apptag);
+
+ if (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
+ switch (data->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ /* reference tag is stored as a 48-bit big-endian value */
+ put_unaligned_be48(slba + lba_num, pi->srtag);
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ break;
+ }
+ }
+ /* step to the next logical block (and its metadata, if separate) */
+ if (data->lba_ext) {
+ buf += data->lba_ext;
+ } else {
+ buf += data->lba_size;
+ md_buf += data->ms;
+ }
+ lba_num++;
+ }
+}
+
+/*
+ * Check the 64-bit-guard protection information of every logical block in
+ * an io_u.
+ *
+ * Blocks whose tags carry the "disabled" escape values are skipped. For the
+ * others, the guard, application tag and 48-bit reference tag are compared
+ * as selected by the PRCHK flags in pi_data->io_flags. Returns 0 when all
+ * checks pass and -EIO on the first mismatch; the log message names the
+ * LBA of the block that failed.
+ */
+static int fio_nvme_verify_pi_64b_guard(struct nvme_data *data,
+					struct io_u *io_u)
+{
+	struct nvme_pi_data *pi_data = io_u->engine_data;
+	struct nvme_64b_guard_pif *pi;
+	struct fio_file *f = io_u->file;
+	unsigned char *buf = io_u->xfer_buf;
+	unsigned char *md_buf = io_u->mmap_data;
+	__u64 slba = get_slba(data, io_u->offset);
+	__u64 ref, ref_exp, guard = 0;
+	__u32 nlb = get_nlb(data, io_u->xfer_buflen) + 1;
+	__u32 lba_num = 0;
+	__u16 unmask_app, unmask_app_exp;
+
+	while (lba_num < nlb) {
+		if (data->lba_ext)
+			pi = (struct nvme_64b_guard_pif *)(buf + pi_data->interval);
+		else
+			pi = (struct nvme_64b_guard_pif *)(md_buf + pi_data->interval);
+
+		/* escape values mean this block carries no checkable PI */
+		if (data->pi_type == NVME_NS_DPS_PI_TYPE3) {
+			if (pi->apptag == NVME_PI_APP_DISABLE &&
+			    fio_nvme_pi_ref_escape(pi->srtag))
+				goto next;
+		} else if (data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
+			   data->pi_type == NVME_NS_DPS_PI_TYPE2) {
+			if (pi->apptag == NVME_PI_APP_DISABLE)
+				goto next;
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_GUARD) {
+			if (data->lba_ext) {
+				guard = fio_crc64_nvme(0, buf, pi_data->interval);
+			} else {
+				guard = fio_crc64_nvme(0, buf, data->lba_size);
+				guard = fio_crc64_nvme(guard, md_buf, pi_data->interval);
+			}
+			if (be64_to_cpu((uint64_t)pi->guard) != guard) {
+				log_err("%s: Guard compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
+					f->file_name,
+					(unsigned long long)(slba + lba_num),
+					guard, be64_to_cpu((uint64_t)pi->guard));
+				return -EIO;
+			}
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_APP) {
+			unmask_app = be16_to_cpu(pi->apptag) & pi_data->apptag_mask;
+			unmask_app_exp = pi_data->apptag & pi_data->apptag_mask;
+			if (unmask_app != unmask_app_exp) {
+				log_err("%s: APPTAG compare error: LBA: %llu Expected=%x, Actual=%x\n",
+					f->file_name,
+					(unsigned long long)(slba + lba_num),
+					unmask_app_exp, unmask_app);
+				return -EIO;
+			}
+		}
+
+		if (pi_data->io_flags & NVME_IO_PRINFO_PRCHK_REF) {
+			switch (data->pi_type) {
+			case NVME_NS_DPS_PI_TYPE1:
+			case NVME_NS_DPS_PI_TYPE2:
+				ref = get_unaligned_be48(pi->srtag);
+				/* only the low 48 bits of the LBA fit in the tag */
+				ref_exp = (slba + lba_num) & ((1ULL << 48) - 1);
+				if (ref != ref_exp) {
+					log_err("%s: REFTAG compare error: LBA: %llu Expected=%llx, Actual=%llx\n",
+						f->file_name,
+						(unsigned long long)(slba + lba_num),
+						ref_exp, ref);
+					return -EIO;
+				}
+				break;
+			case NVME_NS_DPS_PI_TYPE3:
+				break;
+			}
+		}
+next:
+		/* step to the next logical block (and its metadata, if separate) */
+		if (data->lba_ext) {
+			buf += data->lba_ext;
+		} else {
+			buf += data->lba_size;
+			md_buf += data->ms;
+		}
+		lba_num++;
+	}
+
+	return 0;
+}
+/*
+ * Fill 'cmd' and 'dsm' for a dataset-management deallocate (trim).
+ *
+ * With a single range, the range comes from the io_u's own offset and
+ * length. Otherwise io_u->xfer_buf is read as an array of
+ * io_u->number_trim struct trim_range entries, each converted into one DSM
+ * range.
+ */
+void fio_nvme_uring_cmd_trim_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+				  struct nvme_dsm *dsm)
+{
+	struct nvme_data *data = FILE_ENG_DATA(io_u->file);
+	struct trim_range *src;
+	int idx;
+
+	cmd->opcode = nvme_cmd_dsm;
+	cmd->nsid = data->nsid;
+	cmd->cdw11 = NVME_ATTRIBUTE_DEALLOCATE;
+	cmd->addr = (__u64) (uintptr_t) (&dsm->range[0]);
+
+	if (dsm->nr_ranges == 1) {
+		dsm->range[0].slba = get_slba(data, io_u->offset);
+		/* nlb is a 1-based value for deallocate */
+		dsm->range[0].nlb = get_nlb(data, io_u->xfer_buflen) + 1;
+		cmd->cdw10 = 0;
+		cmd->data_len = sizeof(struct nvme_dsm_range);
+		return;
+	}
+
+	/* multi-range trim: the transfer buffer holds the range list */
+	src = (struct trim_range *)io_u->xfer_buf;
+	for (idx = 0; idx < io_u->number_trim; idx++) {
+		dsm->range[idx].slba = get_slba(data, src[idx].start);
+		/* nlb is a 1-based value for deallocate */
+		dsm->range[idx].nlb = get_nlb(data, src[idx].len) + 1;
+	}
+	cmd->cdw10 = io_u->number_trim - 1;
+	cmd->data_len = io_u->number_trim * sizeof(struct nvme_dsm_range);
+}
+
+/*
+ * Build the NVMe passthrough command for one io_u.
+ *
+ * 'cmd' is zeroed first. Trims are handed to fio_nvme_uring_cmd_trim_prep().
+ * Reads and writes get their opcode, LBA range, directive fields, data
+ * pointer (through 'iov' when one is given) and, for namespaces with
+ * separate metadata, the metadata pointer. Returns 0 on success and
+ * -ENOTSUP for any other data direction.
+ */
+int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+			    struct iovec *iov, struct nvme_dsm *dsm)
+{
+	struct nvme_data *data = FILE_ENG_DATA(io_u->file);
+	__u64 start_lba;
+	__u32 blocks;
+
+	memset(cmd, 0, sizeof(struct nvme_uring_cmd));
+
+	if (io_u->ddir == DDIR_TRIM) {
+		fio_nvme_uring_cmd_trim_prep(cmd, io_u, dsm);
+		return 0;
+	}
+
+	if (io_u->ddir == DDIR_READ)
+		cmd->opcode = nvme_cmd_read;
+	else if (io_u->ddir == DDIR_WRITE)
+		cmd->opcode = nvme_cmd_write;
+	else
+		return -ENOTSUP;
+
+	start_lba = get_slba(data, io_u->offset);
+	blocks = get_nlb(data, io_u->xfer_buflen);
+
+	cmd->nsid = data->nsid;
+	/* cdw10 and cdw11 represent starting lba */
+	cmd->cdw10 = start_lba & 0xffffffff;
+	cmd->cdw11 = start_lba >> 32;
+	/* cdw12 represent number of lba's for read/write */
+	cmd->cdw12 = blocks | (io_u->dtype << 20);
+	cmd->cdw13 = io_u->dspec << 16;
+
+	if (!iov) {
+		cmd->addr = (__u64)(uintptr_t)io_u->xfer_buf;
+		cmd->data_len = io_u->xfer_buflen;
+	} else {
+		/* vectored form: pass one iovec describing the buffer */
+		iov->iov_base = io_u->xfer_buf;
+		iov->iov_len = io_u->xfer_buflen;
+		cmd->addr = (__u64)(uintptr_t)iov;
+		cmd->data_len = 1;
+	}
+
+	/* separate metadata buffer: one 'ms'-sized entry per logical block */
+	if (data->lba_shift && data->ms) {
+		cmd->metadata = (__u64)(uintptr_t)io_u->mmap_data;
+		cmd->metadata_len = (blocks + 1) * data->ms;
+	}
+
+	return 0;
+}
+
+/*
+ * Add the protection-information parts to an already prepared command.
+ *
+ * ORs the PI flags into cdw12, generates the PI tuples in the buffers
+ * unless PRACT is set, and loads the expected reference tag (cdw14, plus
+ * cdw3 for the 64-bit guard format) and application tag/mask (cdw15) as the
+ * PRCHK flags and the namespace PI type require.
+ */
+void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+		      struct nvme_cmd_ext_io_opts *opts)
+{
+	struct nvme_data *data = FILE_ENG_DATA(io_u->file);
+	__u64 slba = get_slba(data, io_u->offset);
+	int type12, type3;
+
+	cmd->cdw12 |= opts->io_flags;
+
+	if (data->pi_type && !(opts->io_flags & NVME_IO_PRINFO_PRACT)) {
+		if (data->guard_type == NVME_NVM_NS_16B_GUARD)
+			fio_nvme_generate_pi_16b_guard(data, io_u, opts);
+		else if (data->guard_type == NVME_NVM_NS_64B_GUARD)
+			fio_nvme_generate_pi_64b_guard(data, io_u, opts);
+	}
+
+	type12 = data->pi_type == NVME_NS_DPS_PI_TYPE1 ||
+		 data->pi_type == NVME_NS_DPS_PI_TYPE2;
+	type3 = data->pi_type == NVME_NS_DPS_PI_TYPE3;
+
+	/* the expected reference tag is only used by type 1 and type 2 */
+	if (type12 && (opts->io_flags & NVME_IO_PRINFO_PRCHK_REF)) {
+		if (data->guard_type == NVME_NVM_NS_16B_GUARD) {
+			cmd->cdw14 = (__u32)slba;
+		} else if (data->guard_type == NVME_NVM_NS_64B_GUARD) {
+			cmd->cdw14 = (__u32)slba;
+			cmd->cdw3 = ((slba >> 32) & 0xffff);
+		}
+	}
+
+	/* application tag and mask apply to every PI type */
+	if ((type12 || type3) && (opts->io_flags & NVME_IO_PRINFO_PRCHK_APP))
+		cmd->cdw15 = (opts->apptag_mask << 16 | opts->apptag);
+}
+
+/*
+ * Verify the protection information of a completed io_u using the checker
+ * that matches the namespace's guard type. Returns that checker's result,
+ * or 0 for any other guard type.
+ */
+int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u)
+{
+	if (data->guard_type == NVME_NVM_NS_16B_GUARD)
+		return fio_nvme_verify_pi_16b_guard(data, io_u);
+
+	if (data->guard_type == NVME_NVM_NS_64B_GUARD)
+		return fio_nvme_verify_pi_64b_guard(data, io_u);
+
+	return 0;
+}
+
+/*
+ * Issue an Identify admin command on 'fd'.
+ *
+ * 'cns' and 'csi' select what is identified; the result is written to
+ * 'data', which must have room for NVME_IDENTIFY_DATA_SIZE bytes. Returns
+ * whatever the NVME_IOCTL_ADMIN_CMD ioctl returns.
+ */
+static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
+			 enum nvme_csi csi, void *data)
+{
+	struct nvme_passthru_cmd cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = nvme_admin_identify;
+	cmd.nsid = nsid;
+	cmd.addr = (__u64)(uintptr_t)data;
+	cmd.data_len = NVME_IDENTIFY_DATA_SIZE;
+	cmd.cdw10 = cns;
+	cmd.cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT;
+	cmd.timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT;
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
+}
+
+/*
+ * Query an NVMe generic char device and fill 'data' with what the engine
+ * needs: namespace id, LBA and metadata sizes, protection information type,
+ * guard format and location, and either the extended LBA size or the LBA
+ * shift. The namespace size in LBAs is returned through 'nlba'.
+ *
+ * Returns 0 on success, 1 if 'f' is not a character device, a negative
+ * errno for local failures, or the non-zero result of a failed identify.
+ */
+int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act,
+		      struct nvme_data *data)
+{
+	struct nvme_id_ns ns;
+	struct nvme_id_ctrl ctrl;
+	struct nvme_nvm_id_ns nvm_ns;
+	int namespace_id;
+	int fd, err;
+	__u32 format_idx, elbaf;
+
+	if (f->filetype != FIO_TYPE_CHAR) {
+		log_err("ioengine io_uring_cmd only works with nvme ns "
+			"generic char devices (/dev/ngXnY)\n");
+		return 1;
+	}
+
+	fd = open(f->file_name, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	namespace_id = ioctl(fd, NVME_IOCTL_ID);
+	if (namespace_id < 0) {
+		err = -errno;
+		log_err("%s: failed to fetch namespace-id\n", f->file_name);
+		goto out;
+	}
+
+	err = nvme_identify(fd, 0, NVME_IDENTIFY_CNS_CTRL, NVME_CSI_NVM, &ctrl);
+	if (err) {
+		log_err("%s: failed to fetch identify ctrl\n", f->file_name);
+		goto out;
+	}
+
+	/*
+	 * Identify namespace to get namespace-id, namespace size in LBA's
+	 * and LBA data size.
+	 */
+	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
+			    NVME_CSI_NVM, &ns);
+	if (err) {
+		log_err("%s: failed to fetch identify namespace\n",
+			f->file_name);
+		goto out;
+	}
+
+	data->nsid = namespace_id;
+
+	/*
+	 * 16 or 64 as maximum number of supported LBA formats.
+	 * From flbas bit 0-3 indicates lsb and bit 5-6 indicates msb
+	 * of the format index used to format the namespace.
+	 */
+	if (ns.nlbaf < 16)
+		format_idx = ns.flbas & 0xf;
+	else
+		format_idx = (ns.flbas & 0xf) + (((ns.flbas >> 5) & 0x3) << 4);
+
+	data->lba_size = 1 << ns.lbaf[format_idx].ds;
+	data->ms = le16_to_cpu(ns.lbaf[format_idx].ms);
+
+	/* Check for end to end data protection support */
+	if (data->ms && (ns.dps & NVME_NS_DPS_PI_MASK))
+		data->pi_type = (ns.dps & NVME_NS_DPS_PI_MASK);
+
+	if (!data->pi_type)
+		goto check_elba;
+
+	/*
+	 * Identify data is little-endian on the wire, so convert multi-byte
+	 * fields before testing them, as is done for ms and elbaf.
+	 */
+	if (le32_to_cpu((uint32_t)ctrl.ctratt) & NVME_CTRL_CTRATT_ELBAS) {
+		err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_CSI_NS,
+				    NVME_CSI_NVM, &nvm_ns);
+		if (err) {
+			log_err("%s: failed to fetch identify nvm namespace\n",
+				f->file_name);
+			goto out;
+		}
+
+		elbaf = le32_to_cpu(nvm_ns.elbaf[format_idx]);
+
+		/* Currently we don't support storage tags */
+		if (elbaf & NVME_ID_NS_NVM_STS_MASK) {
+			log_err("%s: Storage tag not supported\n",
+				f->file_name);
+			err = -ENOTSUP;
+			goto out;
+		}
+
+		data->guard_type = (elbaf >> NVME_ID_NS_NVM_GUARD_SHIFT) &
+				   NVME_ID_NS_NVM_GUARD_MASK;
+
+		/* No 32 bit guard, as storage tag is mandatory for it */
+		switch (data->guard_type) {
+		case NVME_NVM_NS_16B_GUARD:
+			data->pi_size = sizeof(struct nvme_16b_guard_pif);
+			break;
+		case NVME_NVM_NS_64B_GUARD:
+			data->pi_size = sizeof(struct nvme_64b_guard_pif);
+			break;
+		default:
+			break;
+		}
+	} else {
+		data->guard_type = NVME_NVM_NS_16B_GUARD;
+		data->pi_size = sizeof(struct nvme_16b_guard_pif);
+	}
+
+	/*
+	 * when PRACT bit is set to 1, and metadata size is equal to protection
+	 * information size, controller inserts and removes PI for write and
+	 * read commands respectively.
+	 */
+	if (pi_act && data->ms == data->pi_size)
+		data->ms = 0;
+
+	data->pi_loc = (ns.dps & NVME_NS_DPS_PI_FIRST);
+
+check_elba:
+	/*
+	 * Bit 4 for flbas indicates if metadata is transferred at the end of
+	 * logical block creating an extended LBA.
+	 */
+	if (data->ms && ((ns.flbas >> 4) & 0x1))
+		data->lba_ext = data->lba_size + data->ms;
+	else
+		data->lba_shift = ilog2(data->lba_size);
+
+	*nlba = le64_to_cpu((uint64_t)ns.nsze);
+
+out:
+	close(fd);
+	return err;
+}
+
+/*
+ * Report whether the namespace behind 'f' is zoned.
+ *
+ * '*model' is set to ZBD_HOST_MANAGED when both ZNS identify commands
+ * (controller and namespace) succeed, and to ZBD_NONE otherwise. Returns
+ * -EINVAL if 'f' is not a character device, -errno if it cannot be opened,
+ * and 0 in every other case; a failing identify is not treated as an error.
+ */
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+			     enum zbd_zoned_model *model)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_id_ns ns;
+	int fd, ret;
+
+	if (f->filetype != FIO_TYPE_CHAR)
+		return -EINVAL;
+
+	/* File is not yet opened */
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	*model = ZBD_NONE;
+
+	/* Using nvme_id_ns for data as sizes are same */
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_CTRL,
+			    NVME_CSI_ZNS, &ns);
+	if (!ret)
+		ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+				    NVME_CSI_ZNS, &ns);
+	if (!ret)
+		*model = ZBD_HOST_MANAGED;
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Issue a zone management receive (report zones) command on 'fd', starting
+ * at 'slba' and writing up to 'data_len' bytes of report into 'data'.
+ * 'zras_feat' is ORed into cdw13 next to the report-zones action. Returns
+ * whatever the NVME_IOCTL_IO_CMD ioctl returns.
+ */
+static int nvme_report_zones(int fd, __u32 nsid, __u64 slba, __u32 zras_feat,
+			     __u32 data_len, void *data)
+{
+	struct nvme_passthru_cmd cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.opcode = nvme_zns_cmd_mgmt_recv;
+	cmd.nsid = nsid;
+	cmd.addr = (__u64)(uintptr_t)data;
+	cmd.data_len = data_len;
+	cmd.cdw10 = slba & 0xffffffff;
+	cmd.cdw11 = slba >> 32;
+	/* cdw12 holds the transfer size in dwords, 0-based */
+	cmd.cdw12 = (data_len >> 2) - 1;
+	cmd.cdw13 = NVME_ZNS_ZRA_REPORT_ZONES | zras_feat;
+	cmd.timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT;
+
+	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+}
+
+/*
+ * Fetch zone descriptors starting at byte 'offset' and translate up to
+ * 'nr_zones' of them into 'zbdz'.
+ *
+ * Zones are requested in chunks of at most 1024. Returns the number of
+ * zones stored, a negative errno for local failures, or the non-zero
+ * result of a failed NVMe command.
+ */
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+			  uint64_t offset, struct zbd_zone *zbdz,
+			  unsigned int nr_zones)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_zone_report *zr;
+	struct nvme_zns_id_ns zns_ns;
+	struct nvme_id_ns ns;
+	unsigned int i = 0, j, zones_fetched = 0;
+	unsigned int max_zones, zones_chunks = 1024;
+	int fd, ret = 0;
+	__u32 zr_len;
+	__u64 zlen;
+
+	/* File is not yet opened */
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+	zr = calloc(1, zr_len);
+	if (!zr) {
+		close(fd);
+		return -ENOMEM;
+	}
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_NS,
+			    NVME_CSI_NVM, &ns);
+	if (ret) {
+		log_err("%s: nvme_identify_ns failed, err=%d\n", f->file_name,
+			ret);
+		goto out;
+	}
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+			    NVME_CSI_ZNS, &zns_ns);
+	if (ret) {
+		log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+			f->file_name, ret);
+		goto out;
+	}
+	zlen = zns_ns.lbafe[ns.flbas & 0x0f].zsze << data->lba_shift;
+	if (!zlen) {
+		/* a zero zone size would divide by zero just below */
+		log_err("%s: device reported a zero zone size\n",
+			f->file_name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	max_zones = (f->real_file_size - offset) / zlen;
+	if (max_zones < nr_zones)
+		nr_zones = max_zones;
+
+	if (nr_zones < zones_chunks)
+		zones_chunks = nr_zones;
+
+	while (zones_fetched < nr_zones) {
+		unsigned int got;
+
+		if (zones_fetched + zones_chunks >= nr_zones) {
+			zones_chunks = nr_zones - zones_fetched;
+			zr_len = sizeof(*zr) + (zones_chunks * sizeof(struct nvme_zns_desc));
+		}
+		ret = nvme_report_zones(fd, data->nsid, offset >> data->lba_shift,
+					NVME_ZNS_ZRAS_FEAT_ERZ, zr_len, (void *)zr);
+		if (ret) {
+			log_err("%s: nvme_zns_report_zones failed, err=%d\n",
+				f->file_name, ret);
+			goto out;
+		}
+
+		/* an empty report would never advance; stop instead of spinning */
+		if (!zr->nr_zones)
+			break;
+
+		/* never read more descriptors than the buffer was sized for */
+		if (zr->nr_zones > zones_chunks)
+			got = zones_chunks;
+		else
+			got = zr->nr_zones;
+
+		/* Transform the zone-report */
+		for (j = 0; j < got; j++, i++) {
+			struct nvme_zns_desc *desc = (struct nvme_zns_desc *)&(zr->entries[j]);
+
+			zbdz[i].start = desc->zslba << data->lba_shift;
+			zbdz[i].len = zlen;
+			zbdz[i].wp = desc->wp << data->lba_shift;
+			zbdz[i].capacity = desc->zcap << data->lba_shift;
+
+			/* Zone Type is stored in first 4 bits. */
+			switch (desc->zt & 0x0f) {
+			case NVME_ZONE_TYPE_SEQWRITE_REQ:
+				zbdz[i].type = ZBD_ZONE_TYPE_SWR;
+				break;
+			default:
+				log_err("%s: invalid type for zone at offset %llu.\n",
+					f->file_name, (unsigned long long) desc->zslba);
+				ret = -EIO;
+				goto out;
+			}
+
+			/* Zone State is stored in last 4 bits. */
+			switch (desc->zs >> 4) {
+			case NVME_ZNS_ZS_EMPTY:
+				zbdz[i].cond = ZBD_ZONE_COND_EMPTY;
+				break;
+			case NVME_ZNS_ZS_IMPL_OPEN:
+				zbdz[i].cond = ZBD_ZONE_COND_IMP_OPEN;
+				break;
+			case NVME_ZNS_ZS_EXPL_OPEN:
+				zbdz[i].cond = ZBD_ZONE_COND_EXP_OPEN;
+				break;
+			case NVME_ZNS_ZS_CLOSED:
+				zbdz[i].cond = ZBD_ZONE_COND_CLOSED;
+				break;
+			case NVME_ZNS_ZS_FULL:
+				zbdz[i].cond = ZBD_ZONE_COND_FULL;
+				break;
+			case NVME_ZNS_ZS_READ_ONLY:
+			case NVME_ZNS_ZS_OFFLINE:
+			default:
+				/* Treat all these conditions as offline (don't use!) */
+				zbdz[i].cond = ZBD_ZONE_COND_OFFLINE;
+				zbdz[i].wp = zbdz[i].start;
+			}
+		}
+		zones_fetched += got;
+		offset += got * zlen;
+	}
+
+	ret = zones_fetched;
+out:
+	free(zr);
+	close(fd);
+
+	return ret;
+}
+
+/*
+ * Issue a Zone Management Send (reset) command for every zone covering
+ * [offset, offset + length), stepping one td->o.zone_size at a time.
+ *
+ * If the file has no open descriptor (f->fd < 0) the device is opened
+ * read-write for the duration of the call and closed again before returning.
+ *
+ * Returns 0 on success. On failure the loop stops at the first zone that
+ * could not be reset and a negative value is returned: -errno if open() or
+ * the ioctl itself failed, or the negated status if the ioctl returned a
+ * positive value.
+ */
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+		      uint64_t offset, uint64_t length)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	unsigned long long zslba, zone_lbas;
+	unsigned int nr_zones, i;
+	int fd, ret = 0;
+
+	/* If the file is not yet opened, open it for this function. */
+	fd = f->fd;
+	if (fd < 0) {
+		fd = open(f->file_name, O_RDWR | O_LARGEFILE);
+		if (fd < 0)
+			return -errno;
+	}
+
+	zslba = offset >> data->lba_shift;
+	zone_lbas = td->o.zone_size >> data->lba_shift;
+	nr_zones = (length + td->o.zone_size - 1) / td->o.zone_size;
+
+	for (i = 0; i < nr_zones; i++, zslba += zone_lbas) {
+		struct nvme_passthru_cmd cmd = {
+			.opcode		= nvme_zns_cmd_mgmt_send,
+			.nsid		= data->nsid,
+			.cdw10		= zslba & 0xffffffff,
+			.cdw11		= zslba >> 32,
+			.cdw13		= NVME_ZNS_ZSA_RESET,
+			.addr		= (__u64)(uintptr_t)NULL,
+			.data_len	= 0,
+			.timeout_ms	= NVME_DEFAULT_IOCTL_TIMEOUT,
+		};
+
+		ret = ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+		if (ret < 0) {
+			/*
+			 * The ioctl itself failed; capture errno before
+			 * close() below has a chance to overwrite it.
+			 */
+			ret = -errno;
+			break;
+		}
+		if (ret > 0) {
+			/*
+			 * Positive status from the command: report it negated
+			 * and do not let later zones mask the failure.
+			 */
+			ret = -ret;
+			break;
+		}
+	}
+
+	if (f->fd < 0)
+		close(fd);
+	return ret;
+}
+
+/*
+ * Query the ZNS identify-namespace data of the device behind @f and store
+ * its 'mor' field plus one in *max_open_zones.
+ *
+ * The device is opened read-only just for this call. Returns -errno if the
+ * open fails, otherwise whatever nvme_identify() returned (0 on success);
+ * *max_open_zones is left untouched when the identify command fails.
+ */
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				unsigned int *max_open_zones)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	struct nvme_zns_id_ns zns_ns;
+	int fd, ret;
+
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	ret = nvme_identify(fd, data->nsid, NVME_IDENTIFY_CNS_CSI_NS,
+			    NVME_CSI_ZNS, &zns_ns);
+	if (!ret)
+		*max_open_zones = zns_ns.mor + 1;
+	else
+		log_err("%s: nvme_zns_identify_ns failed, err=%d\n",
+			f->file_name, ret);
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Send an I/O Management Receive passthrough command (cdw10 = 1) on @fd for
+ * namespace @nsid, with @data / @data_len as the transfer buffer.
+ * cdw11 carries the buffer length in dwords, minus one.
+ *
+ * Returns the result of the NVME_IOCTL_IO_CMD ioctl unchanged.
+ */
+static inline int nvme_fdp_reclaim_unit_handle_status(int fd, __u32 nsid,
+						      __u32 data_len, void *data)
+{
+	__u32 nr_dwords = data_len >> 2;
+	struct nvme_passthru_cmd cmd = { 0 };
+
+	cmd.opcode = nvme_cmd_io_mgmt_recv;
+	cmd.nsid = nsid;
+	cmd.addr = (__u64)(uintptr_t)data;
+	cmd.data_len = data_len;
+	cmd.cdw10 = 1;
+	cmd.cdw11 = nr_dwords - 1;
+
+	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
+}
+
+/*
+ * Fetch the reclaim unit handle status of the device behind @f into @ruhs,
+ * which must be able to hold @bytes bytes.
+ *
+ * The device is opened read-only for the duration of the call. Returns
+ * -errno if the open fails, 0 if the command succeeds, and -ENOTSUP for any
+ * command failure. errno is set to 0 or ENOTSUP accordingly before the
+ * descriptor is closed.
+ */
+int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
+			 struct nvme_fdp_ruh_status *ruhs, __u32 bytes)
+{
+	struct nvme_data *data = FILE_ENG_DATA(f);
+	int fd, status;
+
+	fd = open(f->file_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return -errno;
+
+	status = nvme_fdp_reclaim_unit_handle_status(fd, data->nsid, bytes, ruhs);
+	if (!status) {
+		errno = 0;
+	} else {
+		log_err("%s: nvme_fdp_reclaim_unit_handle_status failed, err=%d\n",
+			f->file_name, status);
+		errno = ENOTSUP;
+	}
+	/* Take the result from errno before close() can modify it. */
+	status = -errno;
+
+	close(fd);
+	return status;
+}
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nvme structure declarations and helper functions for the
+ * io_uring_cmd engine.
+ */
+
+#ifndef FIO_NVME_H
+#define FIO_NVME_H
+
+#include <linux/nvme_ioctl.h>
+#include "../fio.h"
+
+/*
+ * If the uapi headers installed on the system lack nvme uring command
+ * support, use the local version to prevent compilation issues.
+ */
+#ifndef CONFIG_NVME_URING_CMD
+struct nvme_uring_cmd { /* local fallback, compiled only when CONFIG_NVME_URING_CMD is undefined */
+ __u8 opcode;
+ __u8 flags;
+ __u16 rsvd1;
+ __u32 nsid;
+ __u32 cdw2;
+ __u32 cdw3;
+ __u64 metadata;
+ __u64 addr;
+ __u32 metadata_len;
+ __u32 data_len;
+ __u32 cdw10;
+ __u32 cdw11;
+ __u32 cdw12;
+ __u32 cdw13;
+ __u32 cdw14;
+ __u32 cdw15;
+ __u32 timeout_ms;
+ __u32 rsvd2;
+}; /* the explicitly sized fields above add up to 72 bytes */
+
+#define NVME_URING_CMD_IO _IOWR('N', 0x80, struct nvme_uring_cmd)
+#define NVME_URING_CMD_IO_VEC _IOWR('N', 0x81, struct nvme_uring_cmd)
+#endif /* CONFIG_NVME_URING_CMD */
+
+#define NVME_DEFAULT_IOCTL_TIMEOUT 0
+#define NVME_IDENTIFY_DATA_SIZE 4096
+#define NVME_IDENTIFY_CSI_SHIFT 24
+#define NVME_NQN_LENGTH 256
+
+#define NVME_PI_APP_DISABLE 0xFFFF
+#define NVME_PI_REF_DISABLE 0xFFFFFFFF
+
+#define NVME_ZNS_ZRA_REPORT_ZONES 0
+#define NVME_ZNS_ZRAS_FEAT_ERZ (1 << 16)
+#define NVME_ZNS_ZSA_RESET 0x4
+#define NVME_ZONE_TYPE_SEQWRITE_REQ 0x2
+
+#define NVME_ATTRIBUTE_DEALLOCATE (1 << 2)
+
+enum nvme_identify_cns {
+ NVME_IDENTIFY_CNS_NS = 0x00,
+ NVME_IDENTIFY_CNS_CTRL = 0x01,
+ NVME_IDENTIFY_CNS_CSI_NS = 0x05,
+ NVME_IDENTIFY_CNS_CSI_CTRL = 0x06,
+};
+
+enum nvme_csi {
+ NVME_CSI_NVM = 0,
+ NVME_CSI_KV = 1,
+ NVME_CSI_ZNS = 2,
+};
+
+enum nvme_admin_opcode {
+ nvme_admin_identify = 0x06,
+};
+
+enum nvme_io_opcode { /* opcode values placed in the .opcode field of NVMe I/O commands */
+ nvme_cmd_write = 0x01,
+ nvme_cmd_read = 0x02,
+ nvme_cmd_dsm = 0x09,
+ nvme_cmd_io_mgmt_recv = 0x12, /* used by nvme_fdp_reclaim_unit_handle_status() */
+ nvme_zns_cmd_mgmt_send = 0x79, /* used by fio_nvme_reset_wp() with NVME_ZNS_ZSA_RESET */
+ nvme_zns_cmd_mgmt_recv = 0x7a,
+};
+
+enum nvme_zns_zs { /* zone states; matched against (nvme_zns_desc.zs >> 4) in fio_nvme_report_zones() */
+ NVME_ZNS_ZS_EMPTY = 0x1,
+ NVME_ZNS_ZS_IMPL_OPEN = 0x2,
+ NVME_ZNS_ZS_EXPL_OPEN = 0x3,
+ NVME_ZNS_ZS_CLOSED = 0x4,
+ NVME_ZNS_ZS_READ_ONLY = 0xd, /* reported to fio as ZBD_ZONE_COND_OFFLINE */
+ NVME_ZNS_ZS_FULL = 0xe,
+ NVME_ZNS_ZS_OFFLINE = 0xf, /* reported to fio as ZBD_ZONE_COND_OFFLINE */
+};
+
+enum nvme_id_ctrl_ctratt {
+ NVME_CTRL_CTRATT_ELBAS = 1 << 15,
+};
+
+enum {
+ NVME_ID_NS_NVM_STS_MASK = 0x7f,
+ NVME_ID_NS_NVM_GUARD_SHIFT = 7,
+ NVME_ID_NS_NVM_GUARD_MASK = 0x3,
+};
+
+enum {
+ NVME_NVM_NS_16B_GUARD = 0,
+ NVME_NVM_NS_32B_GUARD = 1,
+ NVME_NVM_NS_64B_GUARD = 2,
+};
+
+struct nvme_data { /* per-file NVMe state, retrieved with FILE_ENG_DATA(f) */
+ __u32 nsid; /* namespace id passed in the commands built for this file */
+ __u32 lba_shift; /* byte offset >> lba_shift gives an LBA; LBA << lba_shift gives bytes */
+ __u32 lba_size; /* NOTE(review): presumably 1 << lba_shift - confirm against fio_nvme_get_info() */
+ __u32 lba_ext;
+ __u16 ms;
+ __u16 pi_size;
+ __u8 pi_type;
+ __u8 guard_type;
+ __u8 pi_loc;
+};
+
+enum nvme_id_ns_dps {
+ NVME_NS_DPS_PI_NONE = 0,
+ NVME_NS_DPS_PI_TYPE1 = 1,
+ NVME_NS_DPS_PI_TYPE2 = 2,
+ NVME_NS_DPS_PI_TYPE3 = 3,
+ NVME_NS_DPS_PI_MASK = 7 << 0,
+ NVME_NS_DPS_PI_FIRST = 1 << 3,
+};
+
+enum nvme_io_control_flags {
+ NVME_IO_PRINFO_PRCHK_REF = 1U << 26,
+ NVME_IO_PRINFO_PRCHK_APP = 1U << 27,
+ NVME_IO_PRINFO_PRCHK_GUARD = 1U << 28,
+ NVME_IO_PRINFO_PRACT = 1U << 29,
+};
+
+struct nvme_pi_data {
+ __u32 interval;
+ __u32 io_flags;
+ __u16 apptag;
+ __u16 apptag_mask;
+};
+
+struct nvme_lbaf {
+ __le16 ms;
+ __u8 ds;
+ __u8 rp;
+};
+
+/* 16 bit guard protection Information format */
+struct nvme_16b_guard_pif {
+ __be16 guard;
+ __be16 apptag;
+ __be32 srtag;
+};
+
+/* 64 bit guard protection Information format */
+struct nvme_64b_guard_pif {
+ __be64 guard;
+ __be16 apptag;
+ __u8 srtag[6];
+};
+
+struct nvme_id_ns {
+ __le64 nsze;
+ __le64 ncap;
+ __le64 nuse;
+ __u8 nsfeat;
+ __u8 nlbaf;
+ __u8 flbas;
+ __u8 mc;
+ __u8 dpc;
+ __u8 dps;
+ __u8 nmic;
+ __u8 rescap;
+ __u8 fpi;
+ __u8 dlfeat;
+ __le16 nawun;
+ __le16 nawupf;
+ __le16 nacwu;
+ __le16 nabsn;
+ __le16 nabo;
+ __le16 nabspf;
+ __le16 noiob;
+ __u8 nvmcap[16];
+ __le16 npwg;
+ __le16 npwa;
+ __le16 npdg;
+ __le16 npda;
+ __le16 nows;
+ __le16 mssrl;
+ __le32 mcl;
+ __u8 msrc;
+ __u8 rsvd81[11];
+ __le32 anagrpid;
+ __u8 rsvd96[3];
+ __u8 nsattr;
+ __le16 nvmsetid;
+ __le16 endgid;
+ __u8 nguid[16];
+ __u8 eui64[8];
+ struct nvme_lbaf lbaf[64];
+ __u8 vs[3712];
+};
+
+struct nvme_id_psd {
+ __le16 mp;
+ __u8 rsvd2;
+ __u8 flags;
+ __le32 enlat;
+ __le32 exlat;
+ __u8 rrt;
+ __u8 rrl;
+ __u8 rwt;
+ __u8 rwl;
+ __le16 idlp;
+ __u8 ips;
+ __u8 rsvd19;
+ __le16 actp;
+ __u8 apws;
+ __u8 rsvd23[9];
+};
+
+struct nvme_id_ctrl {
+ __le16 vid;
+ __le16 ssvid;
+ char sn[20];
+ char mn[40];
+ char fr[8];
+ __u8 rab;
+ __u8 ieee[3];
+ __u8 cmic;
+ __u8 mdts;
+ __le16 cntlid;
+ __le32 ver;
+ __le32 rtd3r;
+ __le32 rtd3e;
+ __le32 oaes;
+ __le32 ctratt;
+ __le16 rrls;
+ __u8 rsvd102[9];
+ __u8 cntrltype;
+ __u8 fguid[16];
+ __le16 crdt1;
+ __le16 crdt2;
+ __le16 crdt3;
+ __u8 rsvd134[119];
+ __u8 nvmsr;
+ __u8 vwci;
+ __u8 mec;
+ __le16 oacs;
+ __u8 acl;
+ __u8 aerl;
+ __u8 frmw;
+ __u8 lpa;
+ __u8 elpe;
+ __u8 npss;
+ __u8 avscc;
+ __u8 apsta;
+ __le16 wctemp;
+ __le16 cctemp;
+ __le16 mtfa;
+ __le32 hmpre;
+ __le32 hmmin;
+ __u8 tnvmcap[16];
+ __u8 unvmcap[16];
+ __le32 rpmbs;
+ __le16 edstt;
+ __u8 dsto;
+ __u8 fwug;
+ __le16 kas;
+ __le16 hctma;
+ __le16 mntmt;
+ __le16 mxtmt;
+ __le32 sanicap;
+ __le32 hmminds;
+ __le16 hmmaxd;
+ __le16 nsetidmax;
+ __le16 endgidmax;
+ __u8 anatt;
+ __u8 anacap;
+ __le32 anagrpmax;
+ __le32 nanagrpid;
+ __le32 pels;
+ __le16 domainid;
+ __u8 rsvd358[10];
+ __u8 megcap[16];
+ __u8 rsvd384[128];
+ __u8 sqes;
+ __u8 cqes;
+ __le16 maxcmd;
+ __le32 nn;
+ __le16 oncs;
+ __le16 fuses;
+ __u8 fna;
+ __u8 vwc;
+ __le16 awun;
+ __le16 awupf;
+ __u8 icsvscc;
+ __u8 nwpc;
+ __le16 acwu;
+ __le16 ocfs;
+ __le32 sgls;
+ __le32 mnan;
+ __u8 maxdna[16];
+ __le32 maxcna;
+ __u8 rsvd564[204];
+ char subnqn[NVME_NQN_LENGTH];
+ __u8 rsvd1024[768];
+
+ /* Fabrics Only */
+ __le32 ioccsz;
+ __le32 iorcsz;
+ __le16 icdoff;
+ __u8 fcatt;
+ __u8 msdbd;
+ __le16 ofcs;
+ __u8 dctype;
+ __u8 rsvd1807[241];
+
+ struct nvme_id_psd psd[32];
+ __u8 vs[1024];
+};
+
+struct nvme_nvm_id_ns {
+ __le64 lbstm;
+ __u8 pic;
+ __u8 rsvd9[3];
+ __le32 elbaf[64];
+ __u8 rsvd268[3828];
+};
+
+/*
+ * Integer base-2 logarithm: index of the highest set bit of @i.
+ * Returns -1 when @i is 0.
+ */
+static inline int ilog2(uint32_t i)
+{
+	int highest_bit;
+
+	for (highest_bit = -1; i != 0; i >>= 1)
+		highest_bit++;
+
+	return highest_bit;
+}
+
+struct nvme_zns_lbafe { /* one entry of nvme_zns_id_ns.lbafe[]; 16 bytes */
+ __le64 zsze; /* zone size in LBAs; fio_nvme_report_zones() shifts it by lba_shift to get bytes */
+ __u8 zdes;
+ __u8 rsvd9[7];
+};
+
+struct nvme_zns_id_ns { /* buffer filled by nvme_identify(NVME_IDENTIFY_CNS_CSI_NS, NVME_CSI_ZNS); fields sum to 4096 bytes */
+ __le16 zoc;
+ __le16 ozcs;
+ __le32 mar;
+ __le32 mor; /* fio_nvme_get_max_open_zones() reports mor + 1 */
+ __le32 rrl;
+ __le32 frl;
+ __le32 rrl1;
+ __le32 rrl2;
+ __le32 rrl3;
+ __le32 frl1;
+ __le32 frl2;
+ __le32 frl3;
+ __le32 numzrwa;
+ __le16 zrwafg;
+ __le16 zrwasz;
+ __u8 zrwacap;
+ __u8 rsvd53[2763];
+ struct nvme_zns_lbafe lbafe[64]; /* indexed with (nvme_id_ns.flbas & 0x0f) in fio_nvme_report_zones() */
+ __u8 vs[256];
+};
+
+struct nvme_zns_desc { /* one zone descriptor inside a zone report; 64 bytes */
+ __u8 zt; /* zone type in the low 4 bits (zt & 0x0f) */
+ __u8 zs; /* zone state in the high 4 bits (zs >> 4), see enum nvme_zns_zs */
+ __u8 za;
+ __u8 zai;
+ __u8 rsvd4[4];
+ __le64 zcap; /* zone capacity, in LBAs */
+ __le64 zslba; /* zone start, in LBAs */
+ __le64 wp; /* write pointer, in LBAs */
+ __u8 rsvd32[32];
+};
+
+struct nvme_zone_report { /* buffer layout handed to nvme_report_zones(): 64-byte header + descriptors */
+ __le64 nr_zones; /* number of entries[] that fio_nvme_report_zones() walks after each command */
+ __u8 rsvd8[56];
+ struct nvme_zns_desc entries[]; /* flexible array; caller allocates room for the descriptors */
+};
+
+struct nvme_fdp_ruh_status_desc {
+ __u16 pid;
+ __u16 ruhid;
+ __u32 earutr;
+ __u64 ruamw;
+ __u8 rsvd16[16];
+};
+
+struct nvme_fdp_ruh_status {
+ __u8 rsvd0[14];
+ __le16 nruhsd;
+ struct nvme_fdp_ruh_status_desc ruhss[];
+};
+
+struct nvme_dsm_range {
+ __le32 cattr;
+ __le32 nlb;
+ __le64 slba;
+};
+
+struct nvme_dsm {
+ __u32 nr_ranges;
+ struct nvme_dsm_range range[];
+};
+
+struct nvme_cmd_ext_io_opts {
+ __u32 io_flags;
+ __u16 apptag;
+ __u16 apptag_mask;
+};
+
+int fio_nvme_iomgmt_ruhs(struct thread_data *td, struct fio_file *f,
+ struct nvme_fdp_ruh_status *ruhs, __u32 bytes);
+
+int fio_nvme_get_info(struct fio_file *f, __u64 *nlba, __u32 pi_act,
+ struct nvme_data *data);
+
+int fio_nvme_uring_cmd_prep(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+ struct iovec *iov, struct nvme_dsm *dsm);
+
+void fio_nvme_pi_fill(struct nvme_uring_cmd *cmd, struct io_u *io_u,
+ struct nvme_cmd_ext_io_opts *opts);
+
+int fio_nvme_pi_verify(struct nvme_data *data, struct io_u *io_u);
+
+int fio_nvme_get_zoned_model(struct thread_data *td, struct fio_file *f,
+ enum zbd_zoned_model *model);
+
+int fio_nvme_report_zones(struct thread_data *td, struct fio_file *f,
+ uint64_t offset, struct zbd_zone *zbdz,
+ unsigned int nr_zones);
+
+int fio_nvme_reset_wp(struct thread_data *td, struct fio_file *f,
+ uint64_t offset, uint64_t length);
+
+int fio_nvme_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones);
+
+/*
+ * Store the low 48 bits of @val at @p as six bytes, most significant byte
+ * first. Bits 48-63 of @val are ignored. @p needs no particular alignment.
+ */
+static inline void put_unaligned_be48(__u64 val, __u8 *p)
+{
+	int shift;
+
+	for (shift = 40; shift >= 0; shift -= 8)
+		*p++ = val >> shift;
+}
+
+/*
+ * Assemble a 48-bit value from the six bytes at @p, most significant byte
+ * first. @p needs no particular alignment.
+ */
+static inline __u64 get_unaligned_be48(__u8 *p)
+{
+	__u64 value = 0;
+	int idx;
+
+	for (idx = 0; idx < 6; idx++)
+		value = (value << 8) | p[idx];
+
+	return value;
+}
+
+/*
+ * Return true when all six bytes at @reftag are 0xff, false otherwise.
+ */
+static inline bool fio_nvme_pi_ref_escape(__u8 *reftag)
+{
+	int idx;
+
+	for (idx = 0; idx < 6; idx++) {
+		if (reftag[idx] != 0xff)
+			return false;
+	}
+
+	return true;
+}
+
+#endif
+++ /dev/null
-/*
- * pmemblk: IO engine that uses PMDK libpmemblk to read and write data
- *
- * Copyright (C) 2016 Hewlett Packard Enterprise Development LP
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License,
- * version 2 as published by the Free Software Foundation..
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the Free
- * Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- */
-
-/*
- * pmemblk engine
- *
- * IO engine that uses libpmemblk to read and write data
- *
- * To use:
- * ioengine=pmemblk
- *
- * Other relevant settings:
- * thread=1 REQUIRED
- * iodepth=1
- * direct=1
- * unlink=1
- * filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB
- *
- * thread must be set to 1 for pmemblk as multiple processes cannot
- * open the same block pool file.
- *
- * iodepth should be set to 1 as pmemblk is always synchronous.
- * Use numjobs to scale up.
- *
- * direct=1 is implied as pmemblk is always direct. A warning message
- * is printed if this is not specified.
- *
- * unlink=1 removes the block pool file after testing, and is optional.
- *
- * The pmem device must have a DAX-capable filesystem and be mounted
- * with DAX enabled. filename must point to a file on that filesystem.
- *
- * Example:
- * mkfs.xfs /dev/pmem0
- * mkdir /mnt/pmem0
- * mount -o dax /dev/pmem0 /mnt/pmem0
- *
- * When specifying the filename, if the block pool file does not already
- * exist, then the pmemblk engine creates the pool file if you specify
- * the block and file sizes. BSIZE is the block size in bytes.
- * FSIZEMB is the pool file size in MiB.
- *
- * See examples/pmemblk.fio for more.
- *
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/uio.h>
-#include <errno.h>
-#include <assert.h>
-#include <string.h>
-#include <libpmem.h>
-#include <libpmemblk.h>
-
-#include "../fio.h"
-
-/*
- * libpmemblk
- */
-typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
-
-struct fio_pmemblk_file {
- fio_pmemblk_file_t pmb_next;
- char *pmb_filename;
- uint64_t pmb_refcnt;
- PMEMblkpool *pmb_pool;
- size_t pmb_bsize;
- size_t pmb_nblocks;
-};
-
-static fio_pmemblk_file_t Cache;
-
-static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER;
-
-#define PMB_CREATE (0x0001) /* should create file */
-
-fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename)
-{
- fio_pmemblk_file_t i;
-
- for (i = Cache; i != NULL; i = i->pmb_next)
- if (!strcmp(filename, i->pmb_filename))
- return i;
-
- return NULL;
-}
-
-static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb)
-{
- pmb->pmb_next = Cache;
- Cache = pmb;
-}
-
-static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb)
-{
- fio_pmemblk_file_t i;
-
- if (pmb == Cache) {
- Cache = Cache->pmb_next;
- pmb->pmb_next = NULL;
- return;
- }
-
- for (i = Cache; i != NULL; i = i->pmb_next)
- if (pmb == i->pmb_next) {
- i->pmb_next = i->pmb_next->pmb_next;
- pmb->pmb_next = NULL;
- return;
- }
-}
-
-/*
- * to control block size and gross file size at the libpmemblk
- * level, we allow the block size and file size to be appended
- * to the file name:
- *
- * path[,bsize,fsizemib]
- *
- * note that we do not use the fio option "filesize" to dictate
- * the file size because we can only give libpmemblk the gross
- * file size, which is different from the net or usable file
- * size (which is probably what fio wants).
- *
- * the final path without the parameters is returned in ppath.
- * the block size and file size are returned in pbsize and fsize.
- *
- * note that the user specifies the file size in MiB, but
- * we return bytes from here.
- */
-static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize,
- uint64_t *pfsize)
-{
- char *path;
- char *s;
- uint64_t bsize;
- uint64_t fsizemib;
-
- path = strdup(pathspec);
- if (!path) {
- *ppath = NULL;
- return;
- }
-
- /* extract sizes, if given */
- s = strrchr(path, ',');
- if (s && (fsizemib = strtoull(s + 1, NULL, 10))) {
- *s = 0;
- s = strrchr(path, ',');
- if (s && (bsize = strtoull(s + 1, NULL, 10))) {
- *s = 0;
- *ppath = path;
- *pbsize = bsize;
- *pfsize = fsizemib << 20;
- return;
- }
- }
-
- /* size specs not found */
- strcpy(path, pathspec);
- *ppath = path;
- *pbsize = 0;
- *pfsize = 0;
-}
-
-static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags)
-{
- fio_pmemblk_file_t pmb;
- char *path = NULL;
- uint64_t bsize = 0;
- uint64_t fsize = 0;
-
- pmb_parse_path(pathspec, &path, &bsize, &fsize);
- if (!path)
- return NULL;
-
- pthread_mutex_lock(&CacheLock);
-
- pmb = fio_pmemblk_cache_lookup(path);
- if (!pmb) {
- pmb = malloc(sizeof(*pmb));
- if (!pmb)
- goto error;
-
- /* try opening existing first, create it if needed */
- pmb->pmb_pool = pmemblk_open(path, bsize);
- if (!pmb->pmb_pool && (errno == ENOENT) &&
- (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) {
- pmb->pmb_pool =
- pmemblk_create(path, bsize, fsize, 0644);
- }
- if (!pmb->pmb_pool) {
- log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n",
- path, strerror(errno));
- goto error;
- }
-
- pmb->pmb_filename = path;
- pmb->pmb_next = NULL;
- pmb->pmb_refcnt = 0;
- pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool);
- pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool);
-
- fio_pmemblk_cache_insert(pmb);
- } else {
- free(path);
- }
-
- pmb->pmb_refcnt += 1;
-
- pthread_mutex_unlock(&CacheLock);
-
- return pmb;
-
-error:
- if (pmb) {
- if (pmb->pmb_pool)
- pmemblk_close(pmb->pmb_pool);
- pmb->pmb_pool = NULL;
- pmb->pmb_filename = NULL;
- free(pmb);
- }
- if (path)
- free(path);
-
- pthread_mutex_unlock(&CacheLock);
- return NULL;
-}
-
-static void pmb_close(fio_pmemblk_file_t pmb, const bool keep)
-{
- pthread_mutex_lock(&CacheLock);
-
- pmb->pmb_refcnt--;
-
- if (!keep && !pmb->pmb_refcnt) {
- pmemblk_close(pmb->pmb_pool);
- pmb->pmb_pool = NULL;
- free(pmb->pmb_filename);
- pmb->pmb_filename = NULL;
- fio_pmemblk_cache_remove(pmb);
- free(pmb);
- }
-
- pthread_mutex_unlock(&CacheLock);
-}
-
-static int pmb_get_flags(struct thread_data *td, uint64_t *pflags)
-{
- static int thread_warned = 0;
- static int odirect_warned = 0;
-
- uint64_t flags = 0;
-
- if (!td->o.use_thread) {
- if (!thread_warned) {
- thread_warned = 1;
- log_err("pmemblk: must set thread=1 for pmemblk engine\n");
- }
- return 1;
- }
-
- if (!td->o.odirect && !odirect_warned) {
- odirect_warned = 1;
- log_info("pmemblk: direct == 0, but pmemblk is always direct\n");
- }
-
- if (td->o.allow_create)
- flags |= PMB_CREATE;
-
- (*pflags) = flags;
- return 0;
-}
-
-static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f)
-{
- uint64_t flags = 0;
- fio_pmemblk_file_t pmb;
-
- if (pmb_get_flags(td, &flags))
- return 1;
-
- pmb = pmb_open(f->file_name, flags);
- if (!pmb)
- return 1;
-
- FILE_SET_ENG_DATA(f, pmb);
- return 0;
-}
-
-static int fio_pmemblk_close_file(struct thread_data fio_unused *td,
- struct fio_file *f)
-{
- fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
-
- if (pmb)
- pmb_close(pmb, false);
-
- FILE_SET_ENG_DATA(f, NULL);
- return 0;
-}
-
-static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f)
-{
- uint64_t flags = 0;
- fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
-
- if (fio_file_size_known(f))
- return 0;
-
- if (!pmb) {
- if (pmb_get_flags(td, &flags))
- return 1;
- pmb = pmb_open(f->file_name, flags);
- if (!pmb)
- return 1;
- }
-
- f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks;
-
- fio_file_set_size_known(f);
-
- if (!FILE_ENG_DATA(f))
- pmb_close(pmb, true);
-
- return 0;
-}
-
-static enum fio_q_status fio_pmemblk_queue(struct thread_data *td,
- struct io_u *io_u)
-{
- struct fio_file *f = io_u->file;
- fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
-
- unsigned long long off;
- unsigned long len;
- void *buf;
-
- fio_ro_check(td, io_u);
-
- switch (io_u->ddir) {
- case DDIR_READ:
- case DDIR_WRITE:
- off = io_u->offset;
- len = io_u->xfer_buflen;
-
- io_u->error = EINVAL;
- if (off % pmb->pmb_bsize)
- break;
- if (len % pmb->pmb_bsize)
- break;
- if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks)
- break;
-
- io_u->error = 0;
- buf = io_u->xfer_buf;
- off /= pmb->pmb_bsize;
- len /= pmb->pmb_bsize;
- while (0 < len) {
- if (io_u->ddir == DDIR_READ &&
- 0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
- io_u->error = errno;
- break;
- } else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) {
- io_u->error = errno;
- break;
- }
- buf += pmb->pmb_bsize;
- off++;
- len--;
- }
- off *= pmb->pmb_bsize;
- len *= pmb->pmb_bsize;
- io_u->resid = io_u->xfer_buflen - (off - io_u->offset);
- break;
- case DDIR_SYNC:
- case DDIR_DATASYNC:
- case DDIR_SYNC_FILE_RANGE:
- /* we're always sync'd */
- io_u->error = 0;
- break;
- default:
- io_u->error = EINVAL;
- break;
- }
-
- return FIO_Q_COMPLETED;
-}
-
-static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f)
-{
- char *path = NULL;
- uint64_t bsize = 0;
- uint64_t fsize = 0;
-
- /*
- * we need our own unlink in case the user has specified
- * the block and file sizes in the path name. we parse
- * the file_name to determine the file name we actually used.
- */
-
- pmb_parse_path(f->file_name, &path, &bsize, &fsize);
- if (!path)
- return ENOENT;
-
- unlink(path);
- free(path);
- return 0;
-}
-
-FIO_STATIC struct ioengine_ops ioengine = {
- .name = "pmemblk",
- .version = FIO_IOOPS_VERSION,
- .queue = fio_pmemblk_queue,
- .open_file = fio_pmemblk_open_file,
- .close_file = fio_pmemblk_close_file,
- .get_file_size = fio_pmemblk_get_file_size,
- .unlink_file = fio_pmemblk_unlink_file,
- .flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
-};
-
-static void fio_init fio_pmemblk_register(void)
-{
- register_ioengine(&ioengine);
-}
-
-static void fio_exit fio_pmemblk_unregister(void)
-{
- unregister_ioengine(&ioengine);
-}
static int fio_posixaio_init(struct thread_data *td)
{
- struct posixaio_data *pd = malloc(sizeof(*pd));
-
- memset(pd, 0, sizeof(*pd));
- pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *));
- memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *));
+ struct posixaio_data *pd;
+ pd = calloc(1, sizeof(*pd));
+ pd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
td->io_ops_data = pd;
return 0;
char *cluster_name;
char *pool_name;
char *client_name;
+ char *conf;
int busy_poll;
+ int touch_objects;
};
static struct fio_option options[] = {
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_RBD,
},
+ {
+ .name = "conf",
+ .lname = "ceph configuration file path",
+ .type = FIO_OPT_STR_STORE,
+ .help = "Path of the ceph configuration file",
+ .off1 = offsetof(struct rados_options, conf),
+ .def = "/etc/ceph/ceph.conf",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_RBD,
+ },
{
.name = "busy_poll",
.lname = "busy poll mode",
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_RBD,
},
+ {
+ .name = "touch_objects",
+ .lname = "touch objects on start",
+ .type = FIO_OPT_BOOL,
+ .help = "Touch (create) objects on start",
+ .off1 = offsetof(struct rados_options, touch_objects),
+ .def = "1",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_RBD,
+ },
{
.name = NULL,
},
char *client_name = NULL;
/*
- * If we specify cluser name, the rados_create2
+ * If we specify cluster name, the rados_create2
* will not assume 'client.'. name is considered
* as a full type.id namestr
*/
goto failed_early;
}
- r = rados_conf_read_file(rados->cluster, NULL);
+ r = rados_conf_read_file(rados->cluster, o->conf);
if (r < 0) {
log_err("rados_conf_read_file failed.\n");
goto failed_early;
for (i = 0; i < td->o.nr_files; i++) {
f = td->files[i];
f->real_file_size = file_size;
- r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
- if (r < 0) {
- goto failed_obj_create;
+ if (o->touch_objects) {
+ r = rados_write(rados->io_ctx, f->file_name, "", 0, 0);
+ if (r < 0) {
+ goto failed_obj_create;
+ }
}
}
return 0;
char *client_name = NULL;
/*
- * If we specify cluser name, the rados_create2
+ * If we specify cluster name, the rados_create2
* will not assume 'client.'. name is considered
* as a full type.id namestr
*/
goto failed_shutdown;
}
+ if (td->o.odirect) {
+ r = rados_conf_set(rbd->cluster, "rbd_cache", "false");
+ if (r < 0) {
+ log_info("failed to disable RBD in-memory cache\n");
+ }
+ }
+
r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ );
if (r < 0) {
log_err("rbd_open failed.\n");
goto failed_open;
}
+ if (!td->o.odirect) {
+ /*
+ * ensure cache enables writeback/around mode unless explicitly
+ * configured for writethrough mode
+ */
+ r = rbd_flush(rbd->image);
+ if (r < 0) {
+ log_info("rbd: failed to issue initial flush\n");
+ }
+ }
+
if (!_fio_rbd_setup_poll(rbd))
goto failed_poll;
/* taken from "net" engine. Pretend we deal with files,
* even if we do not have any ideas about files.
- * The size of the RBD is set instead of a artificial file.
+ * The size of the RBD is set instead of an artificial file.
*/
if (!td->files_index) {
add_file(td, td->o.filename ? : "rbd", 0, 0);
int i;
while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) {
- ret = 0;
compevnum++;
if (wc.status) {
memcpy(&io_u->issue_time, &now, sizeof(now));
io_u_queued(td, io_u);
}
+
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &now, sizeof(now));
}
static int fio_rdmaio_commit(struct thread_data *td)
ret = fio_rdmaio_send(td, io_us, rd->io_u_queued_nr);
else if (!rd->is_client)
ret = fio_rdmaio_recv(td, io_us, rd->io_u_queued_nr);
- else
- ret = 0; /* must be a SYNC */
if (ret > 0) {
fio_rdmaio_queued(td, io_us, ret);
static int compat_options(struct thread_data *td)
{
- // The original RDMA engine had an ugly / seperator
+ // The original RDMA engine had an ugly / separator
// on the filename for it's options. This function
// retains backwards compatibility with it. Note we do not
// support setting the bindname option is this legacy mode.
if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
(rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
- rd->rmt_us =
- malloc(FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
- memset(rd->rmt_us, 0,
- FIO_RDMA_MAX_IO_DEPTH * sizeof(struct remote_u));
+ rd->rmt_us = calloc(FIO_RDMA_MAX_IO_DEPTH,
+ sizeof(struct remote_u));
rd->rmt_nr = 0;
}
- rd->io_us_queued = malloc(td->o.iodepth * sizeof(struct io_u *));
- memset(rd->io_us_queued, 0, td->o.iodepth * sizeof(struct io_u *));
+ rd->io_us_queued = calloc(td->o.iodepth, sizeof(struct io_u *));
rd->io_u_queued_nr = 0;
- rd->io_us_flight = malloc(td->o.iodepth * sizeof(struct io_u *));
- memset(rd->io_us_flight, 0, td->o.iodepth * sizeof(struct io_u *));
+ rd->io_us_flight = calloc(td->o.iodepth, sizeof(struct io_u *));
rd->io_u_flight_nr = 0;
- rd->io_us_completed = malloc(td->o.iodepth * sizeof(struct io_u *));
- memset(rd->io_us_completed, 0, td->o.iodepth * sizeof(struct io_u *));
+ rd->io_us_completed = calloc(td->o.iodepth, sizeof(struct io_u *));
rd->io_u_completed_nr = 0;
if (td_read(td)) { /* READ as the server */
for (i = 0; i < td->io_u_freelist.nr; i++) {
struct io_u *io_u = td->io_u_freelist.io_us[i];
- io_u->engine_data = malloc(sizeof(struct rdma_io_u_data));
- memset(io_u->engine_data, 0, sizeof(struct rdma_io_u_data));
+ io_u->engine_data = calloc(1, sizeof(struct rdma_io_u_data));
((struct rdma_io_u_data *)io_u->engine_data)->wr_id = i;
io_u->mr = ibv_reg_mr(rd->pd, io_u->buf, max_bs,
}
if (!td->io_ops_data) {
- rd = malloc(sizeof(*rd));
-
- memset(rd, 0, sizeof(*rd));
- init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0);
+ rd = calloc(1, sizeof(*rd));
+ init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_64, 0);
td->io_ops_data = rd;
}
.cleanup = fio_rdmaio_cleanup,
.open_file = fio_rdmaio_open_file,
.close_file = fio_rdmaio_close_file,
- .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+ .flags = FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO |
+ FIO_ASYNCIO_SETS_ISSUE_TIME,
.options = options,
.option_struct_size = sizeof(struct rdmaio_options),
};
#ifdef FIO_HAVE_SGIO
+#ifndef SGV4_FLAG_HIPRI
+#define SGV4_FLAG_HIPRI 0x800
+#endif
+
enum {
FIO_SG_WRITE = 1,
- FIO_SG_WRITE_VERIFY = 2,
- FIO_SG_WRITE_SAME = 3
+ FIO_SG_WRITE_VERIFY,
+ FIO_SG_WRITE_SAME,
+ FIO_SG_WRITE_SAME_NDOB,
+ FIO_SG_WRITE_STREAM,
+ FIO_SG_VERIFY_BYTCHK_00,
+ FIO_SG_VERIFY_BYTCHK_01,
+ FIO_SG_VERIFY_BYTCHK_11,
};
struct sg_options {
void *pad;
+ unsigned int hipri;
unsigned int readfua;
unsigned int writefua;
unsigned int write_mode;
+ uint16_t stream_id;
};
static struct fio_option options[] = {
+ {
+ .name = "hipri",
+ .lname = "High Priority",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct sg_options, hipri),
+ .help = "Use polled IO completions",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_SG,
+ },
{
.name = "readfua",
.lname = "sg engine read fua flag support",
.oval = FIO_SG_WRITE,
.help = "Issue standard SCSI WRITE commands",
},
- { .ival = "verify",
+ { .ival = "write_and_verify",
.oval = FIO_SG_WRITE_VERIFY,
.help = "Issue SCSI WRITE AND VERIFY commands",
},
- { .ival = "same",
+ { .ival = "verify",
+ .oval = FIO_SG_WRITE_VERIFY,
+ .help = "Issue SCSI WRITE AND VERIFY commands. This "
+ "option is deprecated. Use write_and_verify instead.",
+ },
+ { .ival = "write_same",
.oval = FIO_SG_WRITE_SAME,
.help = "Issue SCSI WRITE SAME commands",
},
+ { .ival = "same",
+ .oval = FIO_SG_WRITE_SAME,
+ .help = "Issue SCSI WRITE SAME commands. This "
+ "option is deprecated. Use write_same instead.",
+ },
+ { .ival = "write_same_ndob",
+ .oval = FIO_SG_WRITE_SAME_NDOB,
+ .help = "Issue SCSI WRITE SAME(16) commands with NDOB flag set",
+ },
+ { .ival = "verify_bytchk_00",
+ .oval = FIO_SG_VERIFY_BYTCHK_00,
+ .help = "Issue SCSI VERIFY commands with BYTCHK set to 00",
+ },
+ { .ival = "verify_bytchk_01",
+ .oval = FIO_SG_VERIFY_BYTCHK_01,
+ .help = "Issue SCSI VERIFY commands with BYTCHK set to 01",
+ },
+ { .ival = "verify_bytchk_11",
+ .oval = FIO_SG_VERIFY_BYTCHK_11,
+ .help = "Issue SCSI VERIFY commands with BYTCHK set to 11",
+ },
+ { .ival = "write_stream",
+ .oval = FIO_SG_WRITE_STREAM,
+ .help = "Issue SCSI WRITE STREAM(16) commands",
+ },
},
.category = FIO_OPT_C_ENGINE,
.group = FIO_OPT_G_SG,
},
+ {
+ .name = "stream_id",
+ .lname = "stream id for WRITE STREAM(16) commands",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct sg_options, stream_id),
+ .help = "Stream ID for WRITE STREAM(16) commands",
+ .def = "0",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_SG,
+ },
{
.name = NULL,
},
#endif
};
+/*
+ * Read a big-endian 16-bit value from the two bytes at @buf.
+ *
+ * The bytes are combined individually rather than by dereferencing @buf as
+ * a uint16_t, so @buf may sit at any (including odd) offset inside a
+ * command or reply buffer, and the result is the same on every host byte
+ * order.
+ */
+static inline uint16_t sgio_get_be16(uint8_t *buf)
+{
+	return (uint16_t) ((buf[0] << 8) | buf[1]);
+}
+
static inline uint32_t sgio_get_be32(uint8_t *buf)
{
return be32_to_cpu(*((uint32_t *) buf));
if (__io_u == io_u)
break;
- if (io_u_sync_complete(td, __io_u)) {
- ret = -1;
+ if (io_u_sync_complete(td, __io_u))
break;
- }
+
} while (1);
return FIO_Q_COMPLETED;
}
static void fio_sgio_rw_lba(struct sg_io_hdr *hdr, unsigned long long lba,
- unsigned long long nr_blocks)
+ unsigned long long nr_blocks, bool override16)
{
- if (lba < MAX_10B_LBA) {
+ if (lba < MAX_10B_LBA && !override16) {
sgio_set_be32((uint32_t) lba, &hdr->cmdp[2]);
sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[7]);
} else {
else
hdr->cmdp[0] = 0x88; // read(16)
+ if (o->hipri)
+ hdr->flags |= SGV4_FLAG_HIPRI;
if (o->readfua)
hdr->cmdp[1] |= 0x08;
- fio_sgio_rw_lba(hdr, lba, nr_blocks);
+ fio_sgio_rw_lba(hdr, lba, nr_blocks, false);
} else if (io_u->ddir == DDIR_WRITE) {
sgio_hdr_init(sd, hdr, io_u, 1);
hdr->cmdp[0] = 0x2a; // write(10)
else
hdr->cmdp[0] = 0x8a; // write(16)
+ if (o->hipri)
+ hdr->flags |= SGV4_FLAG_HIPRI;
if (o->writefua)
hdr->cmdp[1] |= 0x08;
break;
else
hdr->cmdp[0] = 0x93; // write same(16)
break;
+ case FIO_SG_WRITE_SAME_NDOB:
+ hdr->cmdp[0] = 0x93; // write same(16)
+ hdr->cmdp[1] |= 0x1; // no data output buffer
+ hdr->dxfer_len = 0;
+ break;
+ case FIO_SG_WRITE_STREAM:
+ hdr->cmdp[0] = 0x9a; // write stream (16)
+ if (o->writefua)
+ hdr->cmdp[1] |= 0x08;
+ sgio_set_be64(lba, &hdr->cmdp[2]);
+ sgio_set_be16((uint16_t) io_u->file->engine_pos, &hdr->cmdp[10]);
+ sgio_set_be16((uint16_t) nr_blocks, &hdr->cmdp[12]);
+ break;
+ case FIO_SG_VERIFY_BYTCHK_00:
+ if (lba < MAX_10B_LBA)
+ hdr->cmdp[0] = 0x2f; // VERIFY(10)
+ else
+ hdr->cmdp[0] = 0x8f; // VERIFY(16)
+ hdr->dxfer_len = 0;
+ break;
+ case FIO_SG_VERIFY_BYTCHK_01:
+ if (lba < MAX_10B_LBA)
+ hdr->cmdp[0] = 0x2f; // VERIFY(10)
+ else
+ hdr->cmdp[0] = 0x8f; // VERIFY(16)
+ hdr->cmdp[1] |= 0x02; // BYTCHK = 01b
+ break;
+ case FIO_SG_VERIFY_BYTCHK_11:
+ if (lba < MAX_10B_LBA)
+ hdr->cmdp[0] = 0x2f; // VERIFY(10)
+ else
+ hdr->cmdp[0] = 0x8f; // VERIFY(16)
+ hdr->cmdp[1] |= 0x06; // BYTCHK = 11b
+ hdr->dxfer_len = sd->bs;
+ break;
};
- fio_sgio_rw_lba(hdr, lba, nr_blocks);
+ if (o->write_mode != FIO_SG_WRITE_STREAM)
+ fio_sgio_rw_lba(hdr, lba, nr_blocks,
+ o->write_mode == FIO_SG_WRITE_SAME_NDOB);
} else if (io_u->ddir == DDIR_TRIM) {
struct sgio_trim *st;
{
struct sgio_data *sd;
struct sgio_trim *st;
+ struct sg_io_hdr *h3p;
int i;
sd = calloc(1, sizeof(*sd));
#ifdef FIO_SGIO_DEBUG
sd->trim_queue_map = calloc(td->o.iodepth, sizeof(int));
#endif
- for (i = 0; i < td->o.iodepth; i++) {
+ for (i = 0, h3p = sd->sgbuf; i < td->o.iodepth; i++, ++h3p) {
sd->trim_queues[i] = calloc(1, sizeof(struct sgio_trim));
st = sd->trim_queues[i];
st->unmap_param = calloc(td->o.iodepth + 1, sizeof(char[16]));
st->unmap_range_count = 0;
st->trim_io_us = calloc(td->o.iodepth, sizeof(struct io_u *));
+ h3p->interface_id = 'S';
}
td->io_ops_data = sd;
return 0;
}
+/*
+ * Issue a synchronous 16-byte SCSI command (opcode 0x9e) through the SG_IO
+ * ioctl on @f to open or close a stream.
+ * NOTE(review): byte 1 values 0x34/0x54 presumably select STREAM CONTROL
+ * open/close -- confirm against SBC-4.
+ *
+ * @open_stream: true to open a stream, false to close *@stream_id.
+ * @stream_id: on open, receives the ID read from bytes 4-5 of the returned
+ * data; on close, supplies the ID written to bytes 4-5 of the CDB.
+ *
+ * Returns the (negative) ioctl result if the ioctl fails, 1 if the header
+ * reports SG_INFO_CHECK, and 0 on success.
+ */
+static int fio_sgio_stream_control(struct fio_file *f, bool open_stream, uint16_t *stream_id)
+{
+ struct sg_io_hdr hdr;
+ unsigned char cmd[16];
+ unsigned char sb[64];
+ unsigned char buf[8];
+ int ret;
+
+ memset(&hdr, 0, sizeof(hdr));
+ memset(cmd, 0, sizeof(cmd));
+ memset(sb, 0, sizeof(sb));
+ memset(buf, 0, sizeof(buf));
+
+ hdr.interface_id = 'S';
+ hdr.cmdp = cmd;
+ hdr.cmd_len = 16;
+ hdr.sbp = sb;
+ hdr.mx_sb_len = sizeof(sb);
+ hdr.timeout = SCSI_TIMEOUT_MS;
+ hdr.cmdp[0] = 0x9e;
+ hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+ hdr.dxferp = buf;
+ hdr.dxfer_len = sizeof(buf);
+ /* size of the data-in buffer, stored big-endian at CDB bytes 10-13 */
+ sgio_set_be32(sizeof(buf), &hdr.cmdp[10]);
+
+ if (open_stream)
+ hdr.cmdp[1] = 0x34;
+ else {
+ hdr.cmdp[1] = 0x54;
+ sgio_set_be16(*stream_id, &hdr.cmdp[4]);
+ }
+
+ ret = ioctl(f->fd, SG_IO, &hdr);
+
+ if (ret < 0)
+ return ret;
+
+ if (hdr.info & SG_INFO_CHECK)
+ return 1;
+
+ if (open_stream) {
+ /* the assigned stream ID is read from bytes 4-5 of the returned data */
+ *stream_id = sgio_get_be16(&buf[4]);
+ dprint(FD_FILE, "sgio_stream_control: opened stream %u\n", (unsigned int) *stream_id);
+ assert(*stream_id != 0);
+ } else
+ dprint(FD_FILE, "sgio_stream_control: closed stream %u\n", (unsigned int) *stream_id);
+
+ return 0;
+}
+
static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
{
struct sgio_data *sd = td->io_ops_data;
+ struct sg_options *o = td->eo;
int ret;
ret = generic_open_file(td, f);
if (sd && !sd->type_checked && fio_sgio_type_check(td, f)) {
ret = generic_close_file(td, f);
- return 1;
+ return ret;
+ }
+
+ if (o->write_mode == FIO_SG_WRITE_STREAM) {
+ if (o->stream_id)
+ f->engine_pos = o->stream_id;
+ else {
+ ret = fio_sgio_stream_control(f, true, (uint16_t *) &f->engine_pos);
+ if (ret)
+ return ret;
+ }
}
return 0;
}
+/*
+ * close_file hook for the sg engine.
+ *
+ * When write_mode is FIO_SG_WRITE_STREAM and no explicit stream_id option
+ * was given, the stream whose ID is kept in f->engine_pos is closed first.
+ * The file is then always handed to generic_close_file(), even if closing
+ * the stream failed, so a failed stream close does not leave the file open.
+ *
+ * Returns the stream-control error if there was one, otherwise the result
+ * of generic_close_file().
+ */
+int fio_sgio_close(struct thread_data *td, struct fio_file *f)
+{
+	struct sg_options *o = td->eo;
+	int stream_ret = 0;
+	int close_ret;
+
+	if (!o->stream_id && o->write_mode == FIO_SG_WRITE_STREAM)
+		stream_ret = fio_sgio_stream_control(f, false, (uint16_t *) &f->engine_pos);
+
+	close_ret = generic_close_file(td, f);
+
+	/* report the first failure; the stream error takes precedence */
+	return stream_ret ? stream_ret : close_ret;
+}
+
/*
* Build an error string with details about the driver, host or scsi
* error contained in the sg header Caller will use as necessary.
strlcat(msg, ". ", MAXERRDETAIL);
}
if (hdr->sb_len_wr) {
+ const uint8_t *const sbp = hdr->sbp;
+
snprintf(msgchunk, MAXMSGCHUNK, "Sense Data (%d bytes):", hdr->sb_len_wr);
strlcat(msg, msgchunk, MAXERRDETAIL);
for (i = 0; i < hdr->sb_len_wr; i++) {
- snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->sbp[i]);
+ snprintf(msgchunk, MAXMSGCHUNK, " %02x", sbp[i]);
strlcat(msg, msgchunk, MAXERRDETAIL);
}
strlcat(msg, ". ", MAXERRDETAIL);
.event = fio_sgio_event,
.cleanup = fio_sgio_cleanup,
.open_file = fio_sgio_open,
- .close_file = generic_close_file,
+ .close_file = fio_sgio_close,
.get_file_size = fio_sgio_get_file_size,
- .flags = FIO_SYNCIO | FIO_RAWIO,
+ .flags = FIO_SYNCIO | FIO_RAWIO | FIO_RO_NEEDS_RW_OPEN,
.options = options,
.option_struct_size = sizeof(struct sg_options)
};
/*
* Hook for getting the zoned model of a zoned block device for zonemode=zbd.
* The zoned model can be one of (see zbd_types.h):
- * - ZBD_IGNORE: skip regular files
* - ZBD_NONE: regular block device (zone emulation will be used)
* - ZBD_HOST_AWARE: host aware zoned block device
* - ZBD_HOST_MANAGED: host managed zoned block device
return 0;
}
+/*
+ * Hook called for getting the maximum number of open zones for a
+ * ZBD_HOST_MANAGED zoned block device.
+ * A @max_open_zones value set to zero means no limit.
+ *
+ * This skeleton implementation is a placeholder: it does not write to
+ * @max_open_zones and simply returns 0.
+ */
+static int fio_skeleton_get_max_open_zones(struct thread_data *td,
+ struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ return 0;
+}
+
/*
* Note that the structure is exported, so that fio can get it via
* dlsym(..., "ioengine"); for (and only for) external engines.
.get_zoned_model = fio_skeleton_get_zoned_model,
.report_zones = fio_skeleton_report_zones,
.reset_wp = fio_skeleton_reset_wp,
+ .get_max_open_zones = fio_skeleton_get_max_open_zones,
.options = options,
.option_struct_size = sizeof(struct fio_skeleton_options),
};
static int fio_solarisaio_init(struct thread_data *td)
{
- struct solarisaio_data *sd = malloc(sizeof(*sd));
unsigned int max_depth;
+ struct solarisaio_data *sd;
+ sd = calloc(1, sizeof(*sd));
max_depth = td->o.iodepth;
if (max_depth > MAXASYNCHIO) {
max_depth);
}
- memset(sd, 0, sizeof(*sd));
- sd->aio_events = malloc(max_depth * sizeof(struct io_u *));
- memset(sd->aio_events, 0, max_depth * sizeof(struct io_u *));
+ sd->aio_events = calloc(max_depth, sizeof(struct io_u *));
sd->max_depth = max_depth;
#ifdef USE_SIGNAL_COMPLETIONS
{
struct syncio_data *sd;
- sd = malloc(sizeof(*sd));
- memset(sd, 0, sizeof(*sd));
+ sd = calloc(1, sizeof(*sd));
sd->last_offset = -1ULL;
sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec));
sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *));
#include <errno.h>
#include "../fio.h"
+#include "../optgroup.h"
typedef BOOL (WINAPI *CANCELIOEX)(HANDLE hFile, LPOVERLAPPED lpOverlapped);
struct windowsaio_data *wd;
};
+/* Engine-specific options for windowsaio; reached through td->eo. */
+struct windowsaio_options {
+ /* NOTE(review): not referenced in the visible code; presumably keeps offset 0 unused by real options -- confirm */
+ struct thread_data *td;
+ /* non-zero: no completion thread is created and getevents dequeues from the completion port directly */
+ unsigned int no_completion_thread;
+};
+
+/* Option table for the windowsaio engine; terminated by an entry with a NULL name. */
+static struct fio_option options[] = {
+ {
+ /* flag-style option stored in windowsaio_options.no_completion_thread */
+ .name = "no_completion_thread",
+ .lname = "No completion polling thread",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct windowsaio_options, no_completion_thread),
+ .help = "Use to avoid separate completion polling thread",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_WINDOWSAIO,
+ },
+ {
+ .name = NULL,
+ },
+};
+
static DWORD WINAPI IoCompletionRoutine(LPVOID lpParameter);
static int fio_windowsaio_init(struct thread_data *td)
struct thread_ctx *ctx;
struct windowsaio_data *wd;
HANDLE hFile;
+ struct windowsaio_options *o = td->eo;
hFile = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
if (hFile == INVALID_HANDLE_VALUE) {
wd->iothread_running = TRUE;
wd->iocp = hFile;
- if (!rc)
- ctx = malloc(sizeof(struct thread_ctx));
+ if (o->no_completion_thread == 0) {
+ if (!rc)
+ ctx = malloc(sizeof(struct thread_ctx));
- if (!rc && ctx == NULL) {
- log_err("windowsaio: failed to allocate memory for thread context structure\n");
- CloseHandle(hFile);
- rc = 1;
- }
+ if (!rc && ctx == NULL) {
+ log_err("windowsaio: failed to allocate memory for thread context structure\n");
+ CloseHandle(hFile);
+ rc = 1;
+ }
- if (!rc) {
- DWORD threadid;
+ if (!rc) {
+ DWORD threadid;
- ctx->iocp = hFile;
- ctx->wd = wd;
- wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
- if (!wd->iothread)
- log_err("windowsaio: failed to create io completion thread\n");
- else if (fio_option_is_set(&td->o, cpumask))
- fio_setaffinity(threadid, td->o.cpumask);
+ ctx->iocp = hFile;
+ ctx->wd = wd;
+ wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
+ if (!wd->iothread)
+ log_err("windowsaio: failed to create io completion thread\n");
+ else if (fio_option_is_set(&td->o, cpumask))
+ fio_setaffinity(threadid, td->o.cpumask);
+ }
+ if (rc || wd->iothread == NULL)
+ rc = 1;
}
-
- if (rc || wd->iothread == NULL)
- rc = 1;
}
return rc;
log_err("fio: unknown fadvise type %d\n", td->o.fadvise_hint);
}
- if (!td_write(td) || read_only)
+ if ((!td_write(td) && !(td->flags & TD_F_SYNCS)) || read_only)
access = GENERIC_READ;
else
access = (GENERIC_READ | GENERIC_WRITE);
return wd->aio_events[event];
}
-static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
- unsigned int max,
- const struct timespec *t)
+/*
+ * Dequeue completion entries directly from the I/O completion port
+ * (no separate completion thread).
+ *
+ * Completed io_us are stored in wd->aio_events[] and their count is
+ * returned. The loop ends once at least @min events have been collected
+ * or, when @t is given, once that timeout has expired. Without @t each
+ * wait on the port is limited to 250 ms.
+ *
+ * Each call to GetQueuedCompletionStatusEx() asks for at most
+ * min(16, max - dequeued) entries, so no more than @max events are ever
+ * stored in wd->aio_events[].
+ */
+static int fio_windowsaio_getevents_nothread(struct thread_data *td, unsigned int min,
+					     unsigned int max, const struct timespec *t)
+{
+	struct windowsaio_data *wd = td->io_ops_data;
+	unsigned int dequeued = 0;
+	struct io_u *io_u;
+	DWORD start_count = 0;
+	DWORD end_count = 0;
+	DWORD mswait = 250;
+	struct fio_overlapped *fov;
+
+	if (t != NULL) {
+		mswait = (t->tv_sec * 1000) + (t->tv_nsec / 1000000);
+		start_count = GetTickCount();
+		end_count = start_count + mswait;
+	}
+
+	do {
+		BOOL ret;
+		OVERLAPPED *ovl;
+		OVERLAPPED_ENTRY oe[16];
+		/* number of entries the caller can still accept, capped at the array size */
+		ULONG wanted = min(16, max-dequeued);
+		ULONG entries = 0;
+
+		ret = GetQueuedCompletionStatusEx(wd->iocp, oe, wanted, &entries, mswait, 0);
+		if (ret && entries) {
+			ULONG entry_num;
+
+			for (entry_num = 0; entry_num < entries; entry_num++) {
+				ovl = oe[entry_num].lpOverlapped;
+				fov = CONTAINING_RECORD(ovl, struct fio_overlapped, o);
+				io_u = fov->io_u;
+
+				if (ovl->Internal == ERROR_SUCCESS) {
+					io_u->resid = io_u->xfer_buflen - ovl->InternalHigh;
+					io_u->error = 0;
+				} else {
+					/*
+					 * NOTE(review): GetLastError() is the calling
+					 * thread's last error, not necessarily the status
+					 * of this particular completion -- confirm.
+					 */
+					io_u->resid = io_u->xfer_buflen;
+					io_u->error = win_to_posix_error(GetLastError());
+				}
+
+				fov->io_complete = FALSE;
+				wd->aio_events[dequeued] = io_u;
+				dequeued++;
+			}
+		}
+
+		if (dequeued >= min ||
+		    (t != NULL && timeout_expired(start_count, end_count)))
+			break;
+	} while (1);
+	return dequeued;
+}
+
+/* dequeue completion entries created by the separate IoCompletionRoutine thread */
+static int fio_windowaio_getevents_thread(struct thread_data *td, unsigned int min,
+ unsigned int max, const struct timespec *t)
{
struct windowsaio_data *wd = td->io_ops_data;
unsigned int dequeued = 0;
wd->aio_events[dequeued] = io_u;
dequeued++;
}
-
}
if (dequeued >= min)
break;
return dequeued;
}
+/*
+ * getevents hook: use the direct-dequeue path when the
+ * no_completion_thread option is set, otherwise the path that consumes
+ * entries produced by the completion thread.
+ */
+static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
+				    unsigned int max, const struct timespec *t)
+{
+	struct windowsaio_options *eo = td->eo;
+
+	return eo->no_completion_thread ?
+		fio_windowsaio_getevents_nothread(td, min, max, t) :
+		fio_windowaio_getevents_thread(td, min, max, t);
+}
+
static enum fio_q_status fio_windowsaio_queue(struct thread_data *td,
struct io_u *io_u)
{
.get_file_size = generic_get_file_size,
.io_u_init = fio_windowsaio_io_u_init,
.io_u_free = fio_windowsaio_io_u_free,
+ .options = options,
+ .option_struct_size = sizeof(struct windowsaio_options),
};
static void fio_init fio_windowsaio_register(void)
--- /dev/null
+/*
+ * fio xNVMe IO Engine
+ *
+ * IO engine using the xNVMe C API.
+ *
+ * See: http://xnvme.io/
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <stdlib.h>
+#include <assert.h>
+#include <libxnvme.h>
+#include "fio.h"
+#include "verify.h"
+#include "zbd_types.h"
+#include "dataplacement.h"
+#include "optgroup.h"
+
+static pthread_mutex_t g_serialize = PTHREAD_MUTEX_INITIALIZER;
+
+/* Per-file state: pairs a fio file with its xNVMe device handle, queue and cached geometry values. */
+struct xnvme_fioe_fwrap {
+ /* fio file representation */
+ struct fio_file *fio_file;
+
+ /* xNVMe device handle */
+ struct xnvme_dev *dev;
+ /* xNVMe device geometry */
+ const struct xnvme_geo *geo;
+
+ /* command queue created for this device in _dev_open() */
+ struct xnvme_queue *queue;
+
+ /* value of xnvme_dev_get_ssw(); used as a shift to turn byte offsets/lengths into LBAs */
+ uint32_t ssw;
+ /* LBA size in bytes as used for block-size checks and LBA arithmetic */
+ uint32_t lba_nbytes;
+ /* per-LBA metadata size in bytes; zeroed in _dev_open() when the controller handles PI */
+ uint32_t md_nbytes;
+ /* 0 when the extended-LBA format is in effect, 1 otherwise */
+ uint32_t lba_pow2;
+
+ /* pads the structure to the 64 bytes asserted below */
+ uint8_t _pad[16];
+};
+XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_fwrap) == 64, "Incorrect size")
+
+/* Per-thread engine state, stored in td->io_ops_data; ends with one fwrap slot per file. */
+struct xnvme_fioe_data {
+ /* I/O completion queue */
+ struct io_u **iocq;
+
+ /* # of iocq entries; incremented via getevents()/cb_pool() */
+ uint64_t completed;
+
+ /*
+ * # of errors; incremented when observed on completion via
+ * getevents()/cb_pool()
+ */
+ uint64_t ecount;
+
+ /* Controls which device/file getevents() polls next */
+ int32_t prev;
+ int32_t cur;
+
+ /* Number of devices/files for which open() has been called */
+ int64_t nopen;
+ /* Number of devices/files allocated in files[] */
+ uint64_t nallocated;
+
+ /* per-io_u iovec arrays, indexed by io_u->index; only allocated for vectored I/O */
+ struct iovec *iovec;
+ struct iovec *md_iovec;
+
+ /* flexible array member: storage is allocated together with this structure in init() */
+ struct xnvme_fioe_fwrap files[];
+};
+XNVME_STATIC_ASSERT(sizeof(struct xnvme_fioe_data) == 64, "Incorrect size")
+
+/* Per-io_u engine data, allocated in io_u_init() and stored in io_u->engine_data. */
+struct xnvme_fioe_request {
+ /* Context for NVMe PI */
+ struct xnvme_pi_ctx pi_ctx;
+
+ /* Separate metadata buffer pointer; only allocated when md_per_io_size is non-zero */
+ void *md_buf;
+};
+
+/* Engine-specific options for the xnvme engine; reached through td->eo and filled from options[]. */
+struct xnvme_fioe_options {
+ /* NOTE(review): not referenced in the visible code; presumably keeps offset 0 unused by real options -- confirm */
+ void *padding;
+ /* copied to xnvme_opts.poll_io */
+ unsigned int hipri;
+ /* copied to xnvme_opts.poll_sq */
+ unsigned int sqpoll_thread;
+ unsigned int xnvme_dev_nsid;
+ /* non-zero: commands are submitted with iovecs */
+ unsigned int xnvme_iovec;
+ /* size in bytes of the separate metadata buffer allocated per io_u */
+ unsigned int md_per_io_size;
+ /* Protection Information action bit; placed at bit 3 of the PI flags */
+ unsigned int pi_act;
+ unsigned int apptag;
+ unsigned int apptag_mask;
+ /* XNVME_PI_FLAGS_*_CHECK bits, set by the pi_chk option callback */
+ unsigned int prchk;
+ char *xnvme_be;
+ char *xnvme_mem;
+ char *xnvme_async;
+ char *xnvme_sync;
+ char *xnvme_admin;
+ char *xnvme_dev_subnqn;
+};
+
+/*
+ * Option callback for pi_chk: translate the option string into the
+ * XNVME_PI_FLAGS_*_CHECK bits kept in the engine options.
+ *
+ * @data: the struct xnvme_fioe_options being filled in.
+ * @str: option value; any of the substrings "GUARD", "REFTAG" and
+ * "APPTAG" enables the corresponding check.
+ *
+ * The mask is rebuilt from scratch on every call, so bits from an earlier
+ * parse of the option never linger, and a NULL @str selects no checks.
+ * Always returns 0.
+ */
+static int str_pi_chk_cb(void *data, const char *str)
+{
+	struct xnvme_fioe_options *o = data;
+	unsigned int prchk = 0;
+
+	if (str != NULL) {
+		if (strstr(str, "GUARD") != NULL)
+			prchk |= XNVME_PI_FLAGS_GUARD_CHECK;
+		if (strstr(str, "REFTAG") != NULL)
+			prchk |= XNVME_PI_FLAGS_REFTAG_CHECK;
+		if (strstr(str, "APPTAG") != NULL)
+			prchk |= XNVME_PI_FLAGS_APPTAG_CHECK;
+	}
+
+	o->prchk = prchk;
+
+	return 0;
+}
+
+/*
+ * Option table for the xnvme engine. Each entry stores its value at the
+ * given offset inside struct xnvme_fioe_options, except pi_chk, which is
+ * parsed by str_pi_chk_cb(). The table is terminated by an entry with a
+ * NULL name.
+ */
+static struct fio_option options[] = {
+ {
+ .name = "hipri",
+ .lname = "High Priority",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct xnvme_fioe_options, hipri),
+ .help = "Use polled IO completions",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "sqthread_poll",
+ .lname = "Kernel SQ thread polling",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct xnvme_fioe_options, sqpoll_thread),
+ .help = "Offload submission/completion to kernel thread",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_be",
+ .lname = "xNVMe Backend",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_be),
+ .help = "Select xNVMe backend [spdk,linux,fbsd]",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_mem",
+ .lname = "xNVMe Memory Backend",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_mem),
+ .help = "Select xNVMe memory backend",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_async",
+ .lname = "xNVMe Asynchronous command-interface",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_async),
+ .help = "Select xNVMe async. interface: "
+ "[emu,thrpool,io_uring,io_uring_cmd,libaio,posix,vfio,nil]",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_sync",
+ .lname = "xNVMe Synchronous. command-interface",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_sync),
+ .help = "Select xNVMe sync. interface: [nvme,psync,block]",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_admin",
+ .lname = "xNVMe Admin command-interface",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_admin),
+ .help = "Select xNVMe admin. cmd-interface: [nvme,block]",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_dev_nsid",
+ .lname = "xNVMe Namespace-Identifier, for user-space NVMe driver",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_nsid),
+ .help = "xNVMe Namespace-Identifier, for user-space NVMe driver",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_dev_subnqn",
+ .lname = "Subsystem nqn for Fabrics",
+ .type = FIO_OPT_STR_STORE,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_dev_subnqn),
+ .help = "Subsystem NQN for Fabrics",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "xnvme_iovec",
+ .lname = "Vectored IOs",
+ .type = FIO_OPT_STR_SET,
+ .off1 = offsetof(struct xnvme_fioe_options, xnvme_iovec),
+ .help = "Send vectored IOs",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "md_per_io_size",
+ .lname = "Separate Metadata Buffer Size per I/O",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct xnvme_fioe_options, md_per_io_size),
+ .def = "0",
+ .help = "Size of separate metadata buffer per I/O (Default: 0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "pi_act",
+ .lname = "Protection Information Action",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct xnvme_fioe_options, pi_act),
+ .def = "1",
+ .help = "Protection Information Action bit (pi_act=1 or pi_act=0)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ /* no .off1: the value is handled by the callback, which sets prchk */
+ .name = "pi_chk",
+ .lname = "Protection Information Check",
+ .type = FIO_OPT_STR_STORE,
+ .def = NULL,
+ .help = "Control of Protection Information Checking (pi_chk=GUARD,REFTAG,APPTAG)",
+ .cb = str_pi_chk_cb,
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "apptag",
+ .lname = "Application Tag used in Protection Information",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct xnvme_fioe_options, apptag),
+ .def = "0x1234",
+ .help = "Application Tag used in Protection Information field (Default: 0x1234)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+ {
+ .name = "apptag_mask",
+ .lname = "Application Tag Mask",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct xnvme_fioe_options, apptag_mask),
+ .def = "0xffff",
+ .help = "Application Tag Mask used with Application Tag (Default: 0xffff)",
+ .category = FIO_OPT_C_ENGINE,
+ .group = FIO_OPT_G_XNVME,
+ },
+
+ {
+ .name = NULL,
+ },
+};
+
+/*
+ * Completion callback installed on every queue.
+ *
+ * @ctx: the completed command context; it is handed back to its queue
+ * before returning.
+ * @cb_arg: the io_u the command was issued for.
+ *
+ * A non-zero completion status is printed, counted in ecount and turned
+ * into EIO. For error-free reads on a PI-enabled namespace with the PI
+ * action bit clear, the protection information is verified here and a
+ * mismatch is reported the same way. The io_u is then appended to iocq.
+ */
+static void cb_pool(struct xnvme_cmd_ctx *ctx, void *cb_arg)
+{
+	struct io_u *io_u = cb_arg;
+	struct xnvme_fioe_data *xd = io_u->mmap_data;
+	struct xnvme_fioe_request *req = io_u->engine_data;
+	struct xnvme_fioe_fwrap *fwrap = &xd->files[io_u->file->fileno];
+	bool pi_act = (req->pi_ctx.pi_flags >> 3);
+
+	if (xnvme_cmd_ctx_cpl_status(ctx)) {
+		xnvme_cmd_ctx_pr(ctx, XNVME_PR_DEF);
+		xd->ecount += 1;
+		io_u->error = EIO;
+	}
+
+	if (!io_u->error && fwrap->geo->pi_type && (io_u->ddir == DDIR_READ) && !pi_act) {
+		int verify_err = xnvme_pi_verify(&req->pi_ctx, io_u->xfer_buf, req->md_buf,
+						 io_u->xfer_buflen / fwrap->lba_nbytes);
+
+		if (verify_err) {
+			xd->ecount += 1;
+			io_u->error = EIO;
+		}
+	}
+
+	xd->iocq[xd->completed] = io_u;
+	xd->completed++;
+	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+}
+
+/*
+ * Build the xnvme_opts used for opening a device: start from
+ * xnvme_opts_default() and overlay the engine options of @td plus the
+ * job's odirect setting.
+ */
+static struct xnvme_opts xnvme_opts_from_fioe(struct thread_data *td)
+{
+	struct xnvme_fioe_options *eo = td->eo;
+	struct xnvme_opts result = xnvme_opts_default();
+
+	/* device addressing */
+	result.nsid = eo->xnvme_dev_nsid;
+	result.subnqn = eo->xnvme_dev_subnqn;
+
+	/* backend and command-interface selection */
+	result.be = eo->xnvme_be;
+	result.mem = eo->xnvme_mem;
+	result.async = eo->xnvme_async;
+	result.sync = eo->xnvme_sync;
+	result.admin = eo->xnvme_admin;
+
+	/* polling behaviour */
+	result.poll_io = eo->hipri;
+	result.poll_sq = eo->sqpoll_thread;
+
+	result.direct = td->o.odirect;
+
+	return result;
+}
+
+/*
+ * Tear down one file slot: terminate its queue (only when a device handle
+ * is present), close the device and zero the whole slot so no stale
+ * pointers remain. @td is not used.
+ */
+static void _dev_close(struct thread_data *td, struct xnvme_fioe_fwrap *fwrap)
+{
+ if (fwrap->dev)
+ xnvme_queue_term(fwrap->queue);
+
+ /* NOTE(review): called even when fwrap->dev is NULL; presumably xnvme_dev_close() tolerates that -- confirm */
+ xnvme_dev_close(fwrap->dev);
+
+ memset(fwrap, 0, sizeof(*fwrap));
+}
+
+/*
+ * cleanup hook: close every allocated device under the g_serialize mutex
+ * and release the engine data. A failure to take the mutex is logged but
+ * the devices are closed regardless; the mutex is only released if it
+ * was actually taken.
+ */
+static void xnvme_fioe_cleanup(struct thread_data *td)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	int lock_err;
+
+	if (xd == NULL)
+		return;
+
+	lock_err = pthread_mutex_lock(&g_serialize);
+	if (lock_err)
+		log_err("ioeng->cleanup(): pthread_mutex_lock(), err(%d)\n", lock_err);
+	/* NOTE: carrying on without the lock rather than bailing out */
+
+	for (uint64_t idx = 0; idx < xd->nallocated; ++idx)
+		_dev_close(td, &xd->files[idx]);
+
+	if (lock_err == 0) {
+		int unlock_err = pthread_mutex_unlock(&g_serialize);
+
+		if (unlock_err)
+			log_err("ioeng->cleanup(): pthread_mutex_unlock(), err(%d)\n", unlock_err);
+	}
+
+	free(xd->iocq);
+	free(xd->iovec);
+	free(xd->md_iovec);
+	free(xd);
+	td->io_ops_data = NULL;
+}
+
+/*
+ * Check the job's block-size, metadata-size and verify settings against
+ * the geometry cached in @fwrap for file @f.
+ *
+ * Returns 0 when the options are usable and 1 (after logging the reason)
+ * when they are not.
+ */
+static int _verify_options(struct thread_data *td, struct fio_file *f,
+ struct xnvme_fioe_fwrap *fwrap)
+{
+ struct xnvme_fioe_options *o = td->eo;
+ unsigned int correct_md_size;
+
+ for_each_rw_ddir(ddir) {
+ /* both the smallest and the largest block size must be whole multiples of the LBA size */
+ if (td->o.min_bs[ddir] % fwrap->lba_nbytes || td->o.max_bs[ddir] % fwrap->lba_nbytes) {
+ if (!fwrap->lba_pow2) {
+ log_err("ioeng->_verify_options(%s): block size must be a multiple of %u "
+ "(LBA data size + Metadata size)\n", f->file_name, fwrap->lba_nbytes);
+ } else {
+ log_err("ioeng->_verify_options(%s): block size must be a multiple of LBA data size\n",
+ f->file_name);
+ }
+ return 1;
+ }
+ /* the metadata-buffer size check below is skipped for trims */
+ if (ddir == DDIR_TRIM)
+ continue;
+
+ /* metadata bytes needed for the largest I/O: one md_nbytes chunk per LBA */
+ correct_md_size = (td->o.max_bs[ddir] / fwrap->lba_nbytes) * fwrap->md_nbytes;
+ if (fwrap->md_nbytes && fwrap->lba_pow2 && (o->md_per_io_size < correct_md_size)) {
+ log_err("ioeng->_verify_options(%s): md_per_io_size should be at least %u bytes\n",
+ f->file_name, correct_md_size);
+ return 1;
+ }
+ }
+
+ /*
+ * For extended logical block sizes we cannot use verify when
+ * end to end data protection checks are enabled, as the PI
+ * section of data buffer conflicts with verify.
+ */
+ if (fwrap->md_nbytes && fwrap->geo->pi_type && !fwrap->lba_pow2 &&
+ td->o.verify != VERIFY_NONE) {
+ log_err("ioeng->_verify_options(%s): for extended LBA, verify cannot be used when E2E data protection is enabled\n",
+ f->file_name);
+ return 1;
+ }
+
+ return 0;
+}
+
+/**
+ * Helper function setting up device handles as addressed by the naming
+ * convention of the given `fio_file` filename.
+ *
+ * Checks thread-options for explicit control of asynchronous implementation via
+ * the ``--xnvme_async={thrpool,emu,posix,io_uring,libaio,nil}``.
+ *
+ * The device is opened and its queue created while holding g_serialize.
+ * On success the file's slot in xd->files[] is filled in and the fio file
+ * is marked as a block device of known size.
+ *
+ * On failure everything opened so far is released and the slot is zeroed
+ * again, so later checks of the slot's `dev` pointer never see a handle
+ * that has already been closed.
+ *
+ * Returns 0 on success, 1 on failure, or the negated error code when the
+ * mutex cannot be taken.
+ */
+static int _dev_open(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_fioe_options *o = td->eo;
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap;
+	int flags = 0;
+	int err;
+
+	if (f->fileno > (int)xd->nallocated) {
+		log_err("ioeng->_dev_open(%s): invalid assumption\n", f->file_name);
+		return 1;
+	}
+
+	fwrap = &xd->files[f->fileno];
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->_dev_open(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
+			err);
+		return -err;
+	}
+
+	fwrap->dev = xnvme_dev_open(f->file_name, &opts);
+	if (!fwrap->dev) {
+		log_err("ioeng->_dev_open(%s): xnvme_dev_open(), err(%d)\n", f->file_name, errno);
+		goto failure;
+	}
+	fwrap->geo = xnvme_dev_get_geo(fwrap->dev);
+
+	if (xnvme_queue_init(fwrap->dev, td->o.iodepth, flags, &(fwrap->queue))) {
+		log_err("ioeng->_dev_open(%s): xnvme_queue_init(), err(?)\n", f->file_name);
+		goto failure;
+	}
+	xnvme_queue_set_cb(fwrap->queue, cb_pool, NULL);
+
+	fwrap->ssw = xnvme_dev_get_ssw(fwrap->dev);
+	fwrap->lba_nbytes = fwrap->geo->lba_nbytes;
+	fwrap->md_nbytes = fwrap->geo->nbytes_oob;
+
+	if (fwrap->geo->lba_extended)
+		fwrap->lba_pow2 = 0;
+	else
+		fwrap->lba_pow2 = 1;
+
+	/*
+	 * When PI action is set and PI size is equal to metadata size, the
+	 * controller inserts/removes PI. So update the LBA data and metadata
+	 * sizes accordingly.
+	 */
+	if (o->pi_act && fwrap->geo->pi_type &&
+	    fwrap->geo->nbytes_oob == xnvme_pi_size(fwrap->geo->pi_format)) {
+		if (fwrap->geo->lba_extended) {
+			fwrap->lba_nbytes -= fwrap->geo->nbytes_oob;
+			fwrap->lba_pow2 = 1;
+		}
+		fwrap->md_nbytes = 0;
+	}
+
+	if (_verify_options(td, f, fwrap)) {
+		td_verror(td, EINVAL, "_dev_open");
+		goto failure;
+	}
+
+	fwrap->fio_file = f;
+	fwrap->fio_file->filetype = FIO_TYPE_BLOCK;
+	fwrap->fio_file->real_file_size = fwrap->geo->tbytes;
+	fio_file_set_size_known(fwrap->fio_file);
+
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
+			err);
+
+	return 0;
+
+failure:
+	/* release only what was actually set up before the failure */
+	if (fwrap->queue)
+		xnvme_queue_term(fwrap->queue);
+	if (fwrap->dev)
+		xnvme_dev_close(fwrap->dev);
+	/* forget the closed handles so nothing treats this slot as open */
+	memset(fwrap, 0, sizeof(*fwrap));
+
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->_dev_open(%s): pthread_mutex_unlock(), err(%d)\n", f->file_name,
+			err);
+
+	return 1;
+}
+
+/*
+ * init hook: allocate the per-thread engine data (one fwrap slot per job
+ * file, the completion array and, for vectored I/O, the iovec arrays) and
+ * open every file of the job.
+ *
+ * Requires the job to run with thread=1. Returns 0 on success and 1 on
+ * failure. Allocation failures free what was allocated so far; once
+ * td->io_ops_data has been set, releasing it is left to the cleanup hook.
+ */
+static int xnvme_fioe_init(struct thread_data *td)
+{
+ struct xnvme_fioe_data *xd = NULL;
+ struct xnvme_fioe_options *o = td->eo;
+ struct fio_file *f;
+ unsigned int i;
+
+ if (!td->o.use_thread) {
+ log_err("ioeng->init(): --thread=1 is required\n");
+ return 1;
+ }
+
+ /* Allocate xd and iocq */
+ xd = calloc(1, sizeof(*xd) + sizeof(*xd->files) * td->o.nr_files);
+ if (!xd) {
+ log_err("ioeng->init(): !calloc(), err(%d)\n", errno);
+ return 1;
+ }
+
+ xd->iocq = calloc(td->o.iodepth, sizeof(struct io_u *));
+ if (!xd->iocq) {
+ free(xd);
+ log_err("ioeng->init(): !calloc(xd->iocq), err(%d)\n", errno);
+ return 1;
+ }
+
+ /* one iovec per queue slot, only needed for vectored I/O */
+ if (o->xnvme_iovec) {
+ xd->iovec = calloc(td->o.iodepth, sizeof(*xd->iovec));
+ if (!xd->iovec) {
+ free(xd->iocq);
+ free(xd);
+ log_err("ioeng->init(): !calloc(xd->iovec), err(%d)\n", errno);
+ return 1;
+ }
+ }
+
+ /* metadata iovecs are only needed for vectored I/O with a separate metadata buffer */
+ if (o->xnvme_iovec && o->md_per_io_size) {
+ xd->md_iovec = calloc(td->o.iodepth, sizeof(*xd->md_iovec));
+ if (!xd->md_iovec) {
+ free(xd->iocq);
+ free(xd->iovec);
+ free(xd);
+ log_err("ioeng->init(): !calloc(xd->md_iovec), err(%d)\n", errno);
+ return 1;
+ }
+ }
+
+ xd->prev = -1;
+ td->io_ops_data = xd;
+
+ for_each_file(td, f, i)
+ {
+ if (_dev_open(td, f)) {
+ /*
+ * Note: We are not freeing xd, iocq, iovec and md_iovec.
+ * This will be done as part of cleanup routine.
+ */
+ log_err("ioeng->init(): failed; _dev_open(%s)\n", f->file_name);
+ return 1;
+ }
+
+ /* counts only files whose device was opened successfully */
+ ++(xd->nallocated);
+ }
+
+ if (xd->nallocated != td->o.nr_files) {
+ log_err("ioeng->init(): failed; nallocated != td->o.nr_files\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * iomem_alloc hook: allocate the job's I/O buffer of @total_mem bytes.
+ * NOTE: the first device is used for the buffer allocator.
+ * Returns 0 on success and 1 when there is no device handle or the
+ * allocation fails.
+ */
+static int xnvme_fioe_iomem_alloc(struct thread_data *td, size_t total_mem)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *first = &xd->files[0];
+
+	if (first->dev == NULL) {
+		log_err("ioeng->iomem_alloc(): failed; no dev-handle\n");
+		return 1;
+	}
+
+	td->orig_buffer = xnvme_buf_alloc(first->dev, total_mem);
+	if (td->orig_buffer == NULL)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * iomem_free hook: release the job's I/O buffer.
+ * NOTE: the first device is used for the buffer allocator.
+ * Does nothing when the engine data is gone, and only logs when the first
+ * file has no device handle.
+ */
+static void xnvme_fioe_iomem_free(struct thread_data *td)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *first;
+
+	if (xd == NULL)
+		return;
+
+	first = &xd->files[0];
+	if (first->dev == NULL) {
+		log_err("ioeng->iomem_free(): failed no dev-handle\n");
+		return;
+	}
+
+	xnvme_buf_free(first->dev, td->orig_buffer);
+}
+
+/*
+ * io_u_init hook: attach per-I/O engine data to @io_u.
+ *
+ * io_u->mmap_data is pointed at the engine data and a zeroed
+ * xnvme_fioe_request is stored in io_u->engine_data. When md_per_io_size
+ * is non-zero a metadata buffer of that size is allocated through the
+ * first device. Returns 0 on success and 1 on failure, in which case
+ * io_u->engine_data is left NULL.
+ */
+static int xnvme_fioe_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	struct xnvme_fioe_options *eo = td->eo;
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *first = &xd->files[0];
+	struct xnvme_fioe_request *req;
+
+	if (first->dev == NULL) {
+		log_err("ioeng->io_u_init(): failed; no dev-handle\n");
+		return 1;
+	}
+
+	io_u->mmap_data = td->io_ops_data;
+	io_u->engine_data = NULL;
+
+	req = calloc(1, sizeof(*req));
+	if (req == NULL) {
+		log_err("ioeng->io_u_init(): !calloc(fio_req), err(%d)\n", errno);
+		return 1;
+	}
+
+	if (eo->md_per_io_size != 0) {
+		req->md_buf = xnvme_buf_alloc(first->dev, eo->md_per_io_size);
+		if (req->md_buf == NULL) {
+			free(req);
+			return 1;
+		}
+	}
+
+	io_u->engine_data = req;
+
+	return 0;
+}
+
+/*
+ * io_u_free hook: release the per-I/O engine data attached by
+ * xnvme_fioe_io_u_init().
+ *
+ * An io_u whose engine_data is NULL (io_u_init() sets it to NULL before
+ * its allocations and leaves it that way when one of them fails) is
+ * accepted and only has its mmap_data cleared. After the request has been
+ * freed, engine_data is reset so no dangling pointer is left behind.
+ */
+static void xnvme_fioe_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	struct xnvme_fioe_data *xd = NULL;
+	struct xnvme_fioe_fwrap *fwrap = NULL;
+	struct xnvme_fioe_request *fio_req = NULL;
+
+	if (!td->io_ops_data)
+		return;
+
+	xd = td->io_ops_data;
+	fwrap = &xd->files[0];
+
+	if (!fwrap->dev) {
+		log_err("ioeng->io_u_free(): failed no dev-handle\n");
+		return;
+	}
+
+	fio_req = io_u->engine_data;
+	if (fio_req) {
+		/* the metadata buffer came from the first device's allocator */
+		if (fio_req->md_buf)
+			xnvme_buf_free(fwrap->dev, fio_req->md_buf);
+
+		free(fio_req);
+		io_u->engine_data = NULL;
+	}
+
+	io_u->mmap_data = NULL;
+}
+
+/*
+ * event hook: return the io_u stored at position @event of the completion
+ * array. @event must be non-negative and below the number of completions
+ * currently recorded; both conditions are asserted.
+ */
+static struct io_u *xnvme_fioe_event(struct thread_data *td, int event)
+{
+	struct xnvme_fioe_data *engine = td->io_ops_data;
+	struct io_u *done;
+
+	assert(event >= 0);
+	assert((unsigned)event < engine->completed);
+
+	done = engine->iocq[event];
+
+	return done;
+}
+
+/*
+ * getevents hook: poll the per-file queues round-robin until at least
+ * @min completions have been collected (cb_pool() records them and bumps
+ * xd->completed), asking each queue for at most the number of events
+ * still missing up to @max.
+ *
+ * Polling resumes with the file after the one that satisfied the previous
+ * call. A queue reporting -EBUSY or -EAGAIN is treated as transient: the
+ * thread sleeps briefly and carries on with the next queue, so such an
+ * error can no longer stall the polling loop. Any other error is logged
+ * and 0 is returned. @t is not used.
+ *
+ * Returns the number of completed events.
+ */
+static int xnvme_fioe_getevents(struct thread_data *td, unsigned int min, unsigned int max,
+				const struct timespec *t)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_fwrap *fwrap = NULL;
+	int nfiles = xd->nallocated;
+	int err = 0;
+
+	if (xd->prev != -1 && ++xd->prev < nfiles) {
+		fwrap = &xd->files[xd->prev];
+		xd->cur = xd->prev;
+	}
+
+	xd->completed = 0;
+	for (;;) {
+		/* wrap around to the first file */
+		if (fwrap == NULL || xd->cur == nfiles) {
+			fwrap = &xd->files[0];
+			xd->cur = 0;
+		}
+
+		while (fwrap != NULL && xd->cur < nfiles) {
+			err = xnvme_queue_poke(fwrap->queue, max - xd->completed);
+			if (err < 0) {
+				switch (err) {
+				case -EBUSY:
+				case -EAGAIN:
+					/* transient: back off, then keep polling */
+					usleep(1);
+					break;
+
+				default:
+					log_err("ioeng->getevents(): unhandled IO error\n");
+					assert(false);
+					return 0;
+				}
+			}
+			if (xd->completed >= min) {
+				xd->prev = xd->cur;
+				return xd->completed;
+			}
+			xd->cur++;
+			fwrap = &xd->files[xd->cur];
+		}
+	}
+}
+
+/**
+ * Build an NVMe read/write command from 'io_u' and submit it on the queue of
+ * the file wrapper that 'io_u' targets.
+ *
+ * Returns FIO_Q_QUEUED when the command was submitted, FIO_Q_BUSY when the
+ * submission reported -EBUSY/-EAGAIN, and FIO_Q_COMPLETED with io_u->error
+ * set for an unsupported data direction or any other failure.
+ */
+static enum fio_q_status xnvme_fioe_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+	struct xnvme_fioe_options *opts = td->eo;
+	struct xnvme_fioe_request *req = io_u->engine_data;
+	const bool use_iovec = opts->xnvme_iovec;
+	const uint32_t dtype = io_u->dtype;
+	struct xnvme_fioe_fwrap *fwrap;
+	const struct xnvme_geo *geo;
+	struct xnvme_cmd_ctx *ctx;
+	uint32_t nsid;
+	uint64_t slba;
+	uint16_t nlb;
+	bool with_md;
+	int rc;
+
+	fio_ro_check(td, io_u);
+
+	fwrap = &xd->files[io_u->file->fileno];
+	geo = fwrap->geo;
+	nsid = xnvme_dev_get_nsid(fwrap->dev);
+
+	/* Byte offset/length -> starting LBA and zero-based block count */
+	if (!fwrap->lba_pow2) {
+		slba = io_u->offset / fwrap->lba_nbytes;
+		nlb = (io_u->xfer_buflen / fwrap->lba_nbytes) - 1;
+	} else {
+		slba = io_u->offset >> fwrap->ssw;
+		nlb = (io_u->xfer_buflen >> fwrap->ssw) - 1;
+	}
+
+	ctx = xnvme_queue_get_cmd_ctx(fwrap->queue);
+	ctx->async.cb_arg = io_u;
+
+	ctx->cmd.common.nsid = nsid;
+	ctx->cmd.nvm.slba = slba;
+	ctx->cmd.nvm.nlb = nlb;
+	if (dtype) {
+		ctx->cmd.nvm.dtype = io_u->dtype;
+		ctx->cmd.nvm.cdw13.dspec = io_u->dspec;
+	}
+
+	if (io_u->ddir == DDIR_READ) {
+		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_READ;
+	} else if (io_u->ddir == DDIR_WRITE) {
+		ctx->cmd.common.opcode = XNVME_SPEC_NVM_OPC_WRITE;
+	} else {
+		log_err("ioeng->queue(): ENOSYS: %u\n", io_u->ddir);
+		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+
+		io_u->error = ENOSYS;
+		assert(false);
+		return FIO_Q_COMPLETED;
+	}
+
+	/* Protection information is prepared here only when pi_act is not set */
+	if (geo->pi_type && !opts->pi_act) {
+		rc = xnvme_pi_ctx_init(&req->pi_ctx, fwrap->lba_nbytes, geo->nbytes_oob,
+				       geo->lba_extended, geo->pi_loc, geo->pi_type,
+				       (opts->pi_act << 3 | opts->prchk), slba,
+				       opts->apptag_mask, opts->apptag, geo->pi_format);
+		if (rc) {
+			log_err("ioeng->queue(): err: '%d'\n", rc);
+
+			xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+
+			io_u->error = abs(rc);
+			return FIO_Q_COMPLETED;
+		}
+
+		if (io_u->ddir == DDIR_WRITE)
+			xnvme_pi_generate(&req->pi_ctx, io_u->xfer_buf, req->md_buf, nlb + 1);
+	}
+
+	if (geo->pi_type)
+		ctx->cmd.nvm.prinfo = (opts->pi_act << 3 | opts->prchk);
+
+	/* Reference tag: PI type 1 and 2 only, for the 16b and 64b guard formats */
+	if ((geo->pi_type == XNVME_PI_TYPE1 || geo->pi_type == XNVME_PI_TYPE2) &&
+	    (opts->prchk & XNVME_PI_FLAGS_REFTAG_CHECK)) {
+		if (geo->pi_format == XNVME_SPEC_NVM_NS_16B_GUARD) {
+			ctx->cmd.nvm.ilbrt = (uint32_t)slba;
+		} else if (geo->pi_format == XNVME_SPEC_NVM_NS_64B_GUARD) {
+			ctx->cmd.nvm.ilbrt = (uint32_t)slba;
+			ctx->cmd.common.cdw03 = ((slba >> 32) & 0xffff);
+		}
+	}
+
+	/* Application tag: PI type 1, 2 and 3 */
+	if ((geo->pi_type == XNVME_PI_TYPE1 || geo->pi_type == XNVME_PI_TYPE2 ||
+	     geo->pi_type == XNVME_PI_TYPE3) &&
+	    (opts->prchk & XNVME_PI_FLAGS_APPTAG_CHECK)) {
+		ctx->cmd.nvm.lbat = opts->apptag;
+		ctx->cmd.nvm.lbatm = opts->apptag_mask;
+	}
+
+	/* A separate metadata buffer is passed only for power-of-two LBA sizes */
+	with_md = fwrap->md_nbytes && fwrap->lba_pow2;
+
+	if (use_iovec) {
+		xd->iovec[io_u->index].iov_base = io_u->xfer_buf;
+		xd->iovec[io_u->index].iov_len = io_u->xfer_buflen;
+		if (!with_md) {
+			rc = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
+					     NULL, 0, 0);
+		} else {
+			xd->md_iovec[io_u->index].iov_base = req->md_buf;
+			xd->md_iovec[io_u->index].iov_len = fwrap->md_nbytes * (nlb + 1);
+			rc = xnvme_cmd_passv(ctx, &xd->iovec[io_u->index], 1, io_u->xfer_buflen,
+					     &xd->md_iovec[io_u->index], 1,
+					     fwrap->md_nbytes * (nlb + 1));
+		}
+	} else if (with_md) {
+		rc = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, req->md_buf,
+				    fwrap->md_nbytes * (nlb + 1));
+	} else {
+		rc = xnvme_cmd_pass(ctx, io_u->xfer_buf, io_u->xfer_buflen, NULL, 0);
+	}
+
+	if (!rc)
+		return FIO_Q_QUEUED;
+
+	if (rc == -EBUSY || rc == -EAGAIN) {
+		xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+		return FIO_Q_BUSY;
+	}
+
+	log_err("ioeng->queue(): err: '%d'\n", rc);
+
+	xnvme_queue_put_cmd_ctx(ctx->async.queue, ctx);
+
+	io_u->error = abs(rc);
+	assert(false);
+	return FIO_Q_COMPLETED;
+}
+
+/**
+ * fio 'close_file' hook: only decrements the engine's open-file counter.
+ *
+ * Always returns 0.
+ */
+static int xnvme_fioe_close(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_fioe_data *engine = td->io_ops_data;
+
+	dprint(FD_FILE, "xnvme close %s -- nopen: %ld\n", f->file_name, engine->nopen);
+
+	engine->nopen -= 1;
+
+	return 0;
+}
+
+/**
+ * fio 'open_file' hook: check that 'f' matches the file wrapper stored at
+ * its fileno and, if so, count it as open.
+ *
+ * Returns 0 on success and 1 when 'f' has no matching wrapper in xd->files.
+ */
+static int xnvme_fioe_open(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_fioe_data *xd = td->io_ops_data;
+
+	dprint(FD_FILE, "xnvme open %s -- nopen: %ld\n", f->file_name, xd->nopen);
+
+	/*
+	 * fileno indexes xd->files below, so it has to be strictly less than
+	 * the number of wrappers; xd->nallocated is taken to be that count.
+	 */
+	if (f->fileno >= (int)xd->nallocated) {
+		log_err("ioeng->open(): f->fileno >= xd->nallocated; invalid assumption\n");
+		return 1;
+	}
+	if (xd->files[f->fileno].fio_file != f) {
+		log_err("ioeng->open(): fio_file != f; invalid assumption\n");
+		return 1;
+	}
+
+	++(xd->nopen);
+
+	return 0;
+}
+
+/**
+ * fio 'invalidate' hook: intentionally does nothing and reports success.
+ *
+ * Consider only doing this with be:spdk
+ */
+static int xnvme_fioe_invalidate(struct thread_data *td, struct fio_file *f)
+{
+	return 0;
+}
+
+/**
+ * Query the maximum number of open zones of the zoned device named by 'f'.
+ *
+ * The device is opened and closed here, under the g_serialize mutex. On
+ * success '*max_open_zones' is set; a value of 0 means unlimited.
+ *
+ * Returns 0 on success or when the filetype is ignored, and a negative errno
+ * value when the lock, the device open, or the namespace lookup fails, or
+ * when the device is not zoned.
+ */
+static int xnvme_fioe_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+					 unsigned int *max_open_zones)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	const struct xnvme_spec_znd_idfy_ns *zns;
+	int err = 0, err_lock;
+
+	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
+	    f->filetype != FIO_TYPE_CHAR) {
+		log_info("ioeng->get_max_open_zones(): ignoring filetype: %d\n", f->filetype);
+		return 0;
+	}
+	err_lock = pthread_mutex_lock(&g_serialize);
+	if (err_lock) {
+		log_err("ioeng->get_max_open_zones(): pthread_mutex_lock(), err(%d)\n", err_lock);
+		return -err_lock;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging; the log call may overwrite it */
+		err = -errno;
+		log_err("ioeng->get_max_open_zones(): xnvme_dev_open(), err(%d)\n", -err);
+		goto exit;
+	}
+	if (xnvme_dev_get_geo(dev)->type != XNVME_GEO_ZONED) {
+		errno = EINVAL;
+		err = -errno;
+		goto exit;
+	}
+
+	zns = (void *)xnvme_dev_get_ns_css(dev);
+	if (!zns) {
+		err = -errno;
+		log_err("ioeng->get_max_open_zones(): xnvme_dev_get_ns_css(), err(%d)\n", -err);
+		goto exit;
+	}
+
+	/*
+	 * intentional overflow as the value is zero-based and NVMe
+	 * defines 0xFFFFFFFF as unlimited thus overflowing to 0 which
+	 * is how fio indicates unlimited and otherwise just converting
+	 * to one-based.
+	 */
+	*max_open_zones = zns->mor + 1;
+
+exit:
+	xnvme_dev_close(dev);
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->get_max_open_zones(): pthread_mutex_unlock(), err(%d)\n",
+			err_lock);
+
+	return err;
+}
+
+/**
+ * Determine the zoned model of the device named by 'f' and store it in
+ * '*model'.
+ *
+ * Currently, this function is called before I/O engine initialization, so
+ * we cannot consult the file-wrapping done when 'fioe' initializes.
+ * Instead we just open based on the given filename, and close the device
+ * again before returning.
+ *
+ * Returns 0 on success and a negative errno value on failure.
+ *
+ * TODO: unify the different setup methods, consider keeping the handle around,
+ * and consider how to support the --be option in this usecase
+ */
+static int xnvme_fioe_get_zoned_model(struct thread_data *td, struct fio_file *f,
+				      enum zbd_zoned_model *model)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	int err = 0, err_lock;
+
+	if (f->filetype != FIO_TYPE_FILE && f->filetype != FIO_TYPE_BLOCK &&
+	    f->filetype != FIO_TYPE_CHAR) {
+		log_info("ioeng->get_zoned_model(): ignoring filetype: %d\n", f->filetype);
+		return -EINVAL;
+	}
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->get_zoned_model(): pthread_mutex_lock(), err(%d)\n", err);
+		return -err;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging; the log call may overwrite it */
+		err = -errno;
+		log_err("ioeng->get_zoned_model(): xnvme_dev_open(%s) failed, errno: %d\n",
+			f->file_name, -err);
+		goto exit;
+	}
+
+	switch (xnvme_dev_get_geo(dev)->type) {
+	case XNVME_GEO_UNKNOWN:
+		dprint(FD_ZBD, "%s: got 'unknown', assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		break;
+
+	case XNVME_GEO_CONVENTIONAL:
+		dprint(FD_ZBD, "%s: got 'conventional', assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		break;
+
+	case XNVME_GEO_ZONED:
+		dprint(FD_ZBD, "%s: got 'zoned', assigning ZBD_HOST_MANAGED\n", f->file_name);
+		*model = ZBD_HOST_MANAGED;
+		break;
+
+	default:
+		/* Unrecognized geometry: report no zoning, but flag the error */
+		dprint(FD_ZBD, "%s: hit-default, assigning ZBD_NONE\n", f->file_name);
+		*model = ZBD_NONE;
+		errno = EINVAL;
+		err = -errno;
+		break;
+	}
+
+exit:
+	xnvme_dev_close(dev);
+
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->get_zoned_model(): pthread_mutex_unlock(), err(%d)\n", err_lock);
+
+	return err;
+}
+
+/**
+ * Fills the given ``zbdz`` with at most ``nr_zones`` zone-descriptors.
+ *
+ * The implementation converts the NVMe Zoned Command Set log-pages for Zone
+ * descriptors into the Linux Kernel Zoned Block Report format.
+ *
+ * Returns the number of descriptors written to ``zbdz`` on success, and a
+ * negative errno value on failure. Failures are always negative so that they
+ * cannot be mistaken for a zone count.
+ *
+ * NOTE: This function is called before I/O engine initialization, that is,
+ * before ``_dev_open`` has been called and file-wrapping is setup. Thus it has
+ * to do the ``_dev_open`` itself, and shut it down again once it is done
+ * retrieving the log-pages and converting them to the report format.
+ *
+ * TODO: unify the different setup methods, consider keeping the handle around,
+ * and consider how to support the --async option in this usecase
+ */
+static int xnvme_fioe_report_zones(struct thread_data *td, struct fio_file *f, uint64_t offset,
+				   struct zbd_zone *zbdz, unsigned int nr_zones)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	const struct xnvme_spec_znd_idfy_lbafe *lbafe = NULL;
+	struct xnvme_dev *dev = NULL;
+	const struct xnvme_geo *geo = NULL;
+	struct xnvme_znd_report *rprt = NULL;
+	uint32_t ssw;
+	uint64_t slba;
+	unsigned int limit = 0;
+	int err = 0, err_lock;
+
+	dprint(FD_ZBD, "%s: report_zones() offset: %zu, nr_zones: %u\n", f->file_name, offset,
+	       nr_zones);
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->report_zones(%s): pthread_mutex_lock(), err(%d)\n", f->file_name,
+			err);
+		return -err;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Must not fall through with err == 0: that would read as "0 zones" */
+		err = errno ? -errno : -EIO;
+		log_err("ioeng->report_zones(%s): xnvme_dev_open(), err(%d)\n", f->file_name,
+			-err);
+		goto exit;
+	}
+
+	geo = xnvme_dev_get_geo(dev);
+	ssw = xnvme_dev_get_ssw(dev);
+	lbafe = xnvme_znd_dev_get_lbafe(dev);
+
+	/* Reject an offset past the device before asking it for a report */
+	if (offset > geo->tbytes) {
+		log_err("ioeng->report_zones(%s): out-of-bounds\n", f->file_name);
+		err = -EINVAL;
+		goto exit;
+	}
+
+	limit = nr_zones > geo->nzone ? geo->nzone : nr_zones;
+
+	dprint(FD_ZBD, "%s: limit: %u\n", f->file_name, limit);
+
+	/* Round the offset down to the start LBA of the zone containing it */
+	slba = ((offset >> ssw) / geo->nsect) * geo->nsect;
+
+	rprt = xnvme_znd_report_from_dev(dev, slba, limit, 0);
+	if (!rprt) {
+		err = errno ? -errno : -EIO;
+		log_err("ioeng->report_zones(%s): xnvme_znd_report_from_dev(), err(%d)\n",
+			f->file_name, -err);
+		goto exit;
+	}
+	if (rprt->nentries != limit) {
+		log_err("ioeng->report_zones(%s): nentries != nr_zones\n", f->file_name);
+		/* Negative, so the caller cannot read it as one reported zone */
+		err = -EIO;
+		goto exit;
+	}
+
+	/* Transform the zone-report */
+	for (uint32_t idx = 0; idx < rprt->nentries; ++idx) {
+		struct xnvme_spec_znd_descr *descr = XNVME_ZND_REPORT_DESCR(rprt, idx);
+
+		/* LBA units -> bytes */
+		zbdz[idx].start = descr->zslba << ssw;
+		zbdz[idx].len = lbafe->zsze << ssw;
+		zbdz[idx].capacity = descr->zcap << ssw;
+		zbdz[idx].wp = descr->wp << ssw;
+
+		switch (descr->zt) {
+		case XNVME_SPEC_ZND_TYPE_SEQWR:
+			zbdz[idx].type = ZBD_ZONE_TYPE_SWR;
+			break;
+
+		default:
+			log_err("ioeng->report_zones(%s): invalid type for zone at offset(%zu)\n",
+				f->file_name, zbdz[idx].start);
+			err = -EIO;
+			goto exit;
+		}
+
+		switch (descr->zs) {
+		case XNVME_SPEC_ZND_STATE_EMPTY:
+			zbdz[idx].cond = ZBD_ZONE_COND_EMPTY;
+			break;
+		case XNVME_SPEC_ZND_STATE_IOPEN:
+			zbdz[idx].cond = ZBD_ZONE_COND_IMP_OPEN;
+			break;
+		case XNVME_SPEC_ZND_STATE_EOPEN:
+			zbdz[idx].cond = ZBD_ZONE_COND_EXP_OPEN;
+			break;
+		case XNVME_SPEC_ZND_STATE_CLOSED:
+			zbdz[idx].cond = ZBD_ZONE_COND_CLOSED;
+			break;
+		case XNVME_SPEC_ZND_STATE_FULL:
+			zbdz[idx].cond = ZBD_ZONE_COND_FULL;
+			break;
+
+		/* Read-only, offline and unknown states are all reported as offline */
+		case XNVME_SPEC_ZND_STATE_RONLY:
+		case XNVME_SPEC_ZND_STATE_OFFLINE:
+		default:
+			zbdz[idx].cond = ZBD_ZONE_COND_OFFLINE;
+			break;
+		}
+	}
+
+exit:
+	xnvme_buf_virt_free(rprt);
+
+	xnvme_dev_close(dev);
+
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->report_zones(): pthread_mutex_unlock(), err: %d\n", err_lock);
+
+	dprint(FD_ZBD, "err: %d, nr_zones: %d\n", err, (int)nr_zones);
+
+	return err ? err : (int)limit;
+}
+
+/**
+ * Reset the write pointer of every zone that starts in the byte range
+ * [offset, offset + length).
+ *
+ * Returns 0 on success and a negative errno value on failure. A zone start
+ * beyond the device ends the loop early and is reported as success.
+ *
+ * NOTE: This function may get called before I/O engine initialization, that is,
+ * before ``_dev_open`` has been called and file-wrapping is setup. In such
+ * case it has to do ``_dev_open`` itself, and shut it down again once it is
+ * done resetting write pointer of zones.
+ */
+static int xnvme_fioe_reset_wp(struct thread_data *td, struct fio_file *f, uint64_t offset,
+			       uint64_t length)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_fioe_data *xd = NULL;
+	struct xnvme_fioe_fwrap *fwrap = NULL;
+	struct xnvme_dev *dev = NULL;
+	const struct xnvme_geo *geo = NULL;
+	uint64_t first, last;
+	uint32_t ssw;
+	uint32_t nsid;
+	int err = 0, err_lock;
+
+	if (td->io_ops_data) {
+		/* Engine is initialized: reuse the handle of the file wrapper */
+		xd = td->io_ops_data;
+		fwrap = &xd->files[f->fileno];
+
+		assert(fwrap->dev);
+		assert(fwrap->geo);
+
+		dev = fwrap->dev;
+		geo = fwrap->geo;
+		ssw = fwrap->ssw;
+	} else {
+		err = pthread_mutex_lock(&g_serialize);
+		if (err) {
+			log_err("ioeng->reset_wp(): pthread_mutex_lock(), err(%d)\n", err);
+			return -err;
+		}
+
+		dev = xnvme_dev_open(f->file_name, &opts);
+		if (!dev) {
+			/* Must not return 0 here: no zone was reset */
+			err = errno ? -errno : -EIO;
+			log_err("ioeng->reset_wp(): xnvme_dev_open(%s) failed, errno(%d)\n",
+				f->file_name, -err);
+			goto exit;
+		}
+		geo = xnvme_dev_get_geo(dev);
+		ssw = xnvme_dev_get_ssw(dev);
+	}
+
+	nsid = xnvme_dev_get_nsid(dev);
+
+	/* Round both ends of the byte range down to zone-start LBAs */
+	first = ((offset >> ssw) / geo->nsect) * geo->nsect;
+	last = (((offset + length) >> ssw) / geo->nsect) * geo->nsect;
+	dprint(FD_ZBD, "first: 0x%lx, last: 0x%lx\n", first, last);
+
+	for (uint64_t zslba = first; zslba < last; zslba += geo->nsect) {
+		struct xnvme_cmd_ctx ctx = xnvme_cmd_ctx_from_dev(dev);
+
+		if (zslba >= (geo->nsect * geo->nzone)) {
+			log_err("ioeng->reset_wp(): out-of-bounds\n");
+			err = 0;
+			break;
+		}
+
+		err = xnvme_znd_mgmt_send(&ctx, nsid, zslba, false,
+					  XNVME_SPEC_ZND_CMD_MGMT_SEND_RESET, 0x0, NULL);
+		if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
+			err = err ? err : -EIO;
+			log_err("ioeng->reset_wp(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc);
+			goto exit;
+		}
+	}
+
+exit:
+	/* Only undo what was set up in this call */
+	if (!td->io_ops_data) {
+		xnvme_dev_close(dev);
+
+		err_lock = pthread_mutex_unlock(&g_serialize);
+		if (err_lock)
+			log_err("ioeng->reset_wp(): pthread_mutex_unlock(), err(%d)\n", err_lock);
+	}
+
+	return err;
+}
+
+/**
+ * Fetch the reclaim unit handle status of the device named by 'f' and copy
+ * the placement identifiers into 'fruhs_info'.
+ *
+ * The device is opened and closed here, under the g_serialize mutex. At most
+ * FDP_MAX_RUHS descriptors are requested and copied.
+ *
+ * Returns 0 on success and a negative errno value on failure.
+ */
+static int xnvme_fioe_fetch_ruhs(struct thread_data *td, struct fio_file *f,
+				 struct fio_ruhs_info *fruhs_info)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	struct xnvme_spec_ruhs *ruhs;
+	struct xnvme_cmd_ctx ctx;
+	uint32_t ruhs_nbytes;
+	uint32_t nr_ruhs;
+	uint32_t nsid;
+	int err = 0, err_lock;
+
+	if (f->filetype != FIO_TYPE_CHAR && f->filetype != FIO_TYPE_FILE) {
+		log_err("ioeng->fdp_ruhs(): ignoring filetype: %d\n", f->filetype);
+		return -EINVAL;
+	}
+
+	err = pthread_mutex_lock(&g_serialize);
+	if (err) {
+		log_err("ioeng->fdp_ruhs(): pthread_mutex_lock(), err(%d)\n", err);
+		return -err;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging; the log call may overwrite it */
+		err = -errno;
+		log_err("ioeng->fdp_ruhs(): xnvme_dev_open(%s) failed, errno: %d\n",
+			f->file_name, -err);
+		goto exit;
+	}
+
+	ruhs_nbytes = sizeof(*ruhs) + (FDP_MAX_RUHS * sizeof(struct xnvme_spec_ruhs_desc));
+	ruhs = xnvme_buf_alloc(dev, ruhs_nbytes);
+	if (!ruhs) {
+		err = -errno;
+		goto exit;
+	}
+	memset(ruhs, 0, ruhs_nbytes);
+
+	ctx = xnvme_cmd_ctx_from_dev(dev);
+	nsid = xnvme_dev_get_nsid(dev);
+
+	err = xnvme_nvm_mgmt_recv(&ctx, nsid, XNVME_SPEC_IO_MGMT_RECV_RUHS, 0, ruhs, ruhs_nbytes);
+
+	if (err || xnvme_cmd_ctx_cpl_status(&ctx)) {
+		err = err ? err : -EIO;
+		log_err("ioeng->fdp_ruhs(): err(%d), sc(%d)\n", err, ctx.cpl.status.sc);
+		goto free_buffer;
+	}
+
+	/*
+	 * The buffer holds at most FDP_MAX_RUHS descriptors, so never walk
+	 * further than that even if the device reports more handles.
+	 * NOTE(review): assumes fruhs_info->plis has room for FDP_MAX_RUHS
+	 * entries -- confirm against the caller.
+	 */
+	nr_ruhs = ruhs->nruhsd;
+	if (nr_ruhs > FDP_MAX_RUHS)
+		nr_ruhs = FDP_MAX_RUHS;
+
+	fruhs_info->nr_ruhs = nr_ruhs;
+	for (uint32_t idx = 0; idx < nr_ruhs; ++idx)
+		fruhs_info->plis[idx] = le16_to_cpu(ruhs->desc[idx].pi);
+
+free_buffer:
+	xnvme_buf_free(dev, ruhs);
+exit:
+	xnvme_dev_close(dev);
+
+	err_lock = pthread_mutex_unlock(&g_serialize);
+	if (err_lock)
+		log_err("ioeng->fdp_ruhs(): pthread_mutex_unlock(), err(%d)\n", err_lock);
+
+	return err;
+}
+
+/**
+ * fio 'get_file_size' hook: set f->real_file_size from the device geometry.
+ *
+ * Does nothing when the size of 'f' is already known. Otherwise the device is
+ * opened and closed here, under the g_serialize mutex. With zonemode=zbd the
+ * filetype is additionally set to FIO_TYPE_BLOCK.
+ *
+ * Returns 0 on success and a negative errno value on failure.
+ */
+static int xnvme_fioe_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct xnvme_opts opts = xnvme_opts_from_fioe(td);
+	struct xnvme_dev *dev;
+	int ret = 0, err;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	ret = pthread_mutex_lock(&g_serialize);
+	if (ret) {
+		log_err("ioeng->get_file_size(): pthread_mutex_lock(), err(%d)\n", ret);
+		return -ret;
+	}
+
+	dev = xnvme_dev_open(f->file_name, &opts);
+	if (!dev) {
+		/* Capture errno before logging; the log call may overwrite it */
+		ret = -errno;
+		log_err("%s: failed retrieving device handle, errno: %d\n", f->file_name, -ret);
+		goto exit;
+	}
+
+	f->real_file_size = xnvme_dev_get_geo(dev)->tbytes;
+	fio_file_set_size_known(f);
+
+	if (td->o.zone_mode == ZONE_MODE_ZBD)
+		f->filetype = FIO_TYPE_BLOCK;
+
+exit:
+	xnvme_dev_close(dev);
+	err = pthread_mutex_unlock(&g_serialize);
+	if (err)
+		log_err("ioeng->get_file_size(): pthread_mutex_unlock(), err(%d)\n", err);
+
+	return ret;
+}
+
+/* Operations table through which fio drives the xNVMe I/O engine */
+FIO_STATIC struct ioengine_ops ioengine = {
+	/* Identification and engine-specific options */
+	.name = "xnvme",
+	.version = FIO_IOOPS_VERSION,
+	.flags = FIO_RAWIO | FIO_MEMALIGN | FIO_NOEXTEND | FIO_NODISKUTIL | FIO_DISKLESSIO,
+	.option_struct_size = sizeof(struct xnvme_fioe_options),
+	.options = options,
+
+	/* Engine life-cycle */
+	.init = xnvme_fioe_init,
+	.cleanup = xnvme_fioe_cleanup,
+
+	/* I/O buffer memory */
+	.iomem_alloc = xnvme_fioe_iomem_alloc,
+	.iomem_free = xnvme_fioe_iomem_free,
+
+	/* Per-io_u state */
+	.io_u_init = xnvme_fioe_io_u_init,
+	.io_u_free = xnvme_fioe_io_u_free,
+
+	/* Submission and completion */
+	.queue = xnvme_fioe_queue,
+	.getevents = xnvme_fioe_getevents,
+	.event = xnvme_fioe_event,
+
+	/* File handling */
+	.open_file = xnvme_fioe_open,
+	.close_file = xnvme_fioe_close,
+	.get_file_size = xnvme_fioe_get_file_size,
+	.invalidate = xnvme_fioe_invalidate,
+
+	/* Zoned block device support */
+	.get_zoned_model = xnvme_fioe_get_zoned_model,
+	.get_max_open_zones = xnvme_fioe_get_max_open_zones,
+	.report_zones = xnvme_fioe_report_zones,
+	.reset_wp = xnvme_fioe_reset_wp,
+
+	/* Flexible data placement */
+	.fdp_fetch_ruhs = xnvme_fioe_fetch_ruhs,
+};
+
+/* Register the xNVMe engine's operations table with fio. */
+static void fio_init fio_xnvme_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+/* Remove the xNVMe engine's operations table from fio. */
+static void fio_exit fio_xnvme_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
*/
#include <unistd.h>
#include <string.h>
+#include <stdlib.h>
#ifdef CONFIG_VALGRIND_DEV
#include <valgrind/drd.h>
#else
perc = td->o.rwmix[DDIR_WRITE];
bytes_total += (bytes_total * perc) / 100;
- } else
+ } else {
bytes_total <<= 1;
+ }
}
if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) {
perc = (double) bytes_done / (double) bytes_total;
if (perc > 1.0)
perc = 1.0;
- } else
+ } else {
perc = 0.0;
+ }
if (td->o.time_based) {
if (timeout) {
else
this_rate = 0;
- if (unified_rw_rep) {
+ if (unified_rw_rep == UNIFIED_MIXED) {
rate[i] = 0;
rate[0] += this_rate;
} else
else
this_iops = 0;
- if (unified_rw_rep) {
+ if (unified_rw_rep == UNIFIED_MIXED) {
iops[i] = 0;
iops[0] += this_iops;
} else
return time > ((eta_interval_msec * 95) / 100);
}
+/*
+ * These are the conditions under which we might be able to skip the eta
+ * calculation.
+ *
+ * Returns true when any of them holds: non-normal output is being written
+ * to stdout, ETA output is temporarily stalled or disabled, or stdout is
+ * not a terminal and ETA output was not forced on.
+ */
+static bool skip_eta(void)
+{
+	if (!(output_format & FIO_OUTPUT_NORMAL) && f_out == stdout)
+		return true;
+	if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
+		return true;
+	if (!isatty(STDOUT_FILENO) && eta_print != FIO_ETA_ALWAYS)
+		return true;
+
+	return false;
+}
+
/*
* Print status of the jobs we know about. This includes rate estimates,
* ETA, thread state, etc.
*/
-bool calc_thread_status(struct jobs_eta *je, int force)
+static bool calc_thread_status(struct jobs_eta *je, int force)
{
- struct thread_data *td;
- int i, unified_rw_rep;
+ int unified_rw_rep;
+ bool any_td_in_ramp;
uint64_t rate_time, disp_time, bw_avg_time, *eta_secs;
unsigned long long io_bytes[DDIR_RWDIR_CNT] = {};
unsigned long long io_iops[DDIR_RWDIR_CNT] = {};
static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
static struct timespec rate_prev_time, disp_prev_time;
- if (!force) {
- if (!(output_format & FIO_OUTPUT_NORMAL) &&
- f_out == stdout)
- return false;
- if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
- return false;
+ bool ret = true;
- if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS))
+ if (!force && skip_eta()) {
+ if (write_bw_log)
+ ret = false;
+ else
return false;
}
if (!ddir_rw_sum(disp_io_bytes))
fill_start_time(&disp_prev_time);
- eta_secs = malloc(thread_number * sizeof(uint64_t));
- memset(eta_secs, 0, thread_number * sizeof(uint64_t));
+ eta_secs = calloc(thread_number, sizeof(uint64_t));
je->elapsed_sec = (mtime_since_genesis() + 999) / 1000;
bw_avg_time = ULONG_MAX;
unified_rw_rep = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
unified_rw_rep += td->o.unified_rw_rep;
if (is_power_of_2(td->o.kb_base))
je->is_pow2 = 1;
je->unit_base = td->o.unit_base;
+ je->sig_figs = td->o.sig_figs;
if (td->o.bw_avg_time < bw_avg_time)
bw_avg_time = td->o.bw_avg_time;
if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING
je->nr_pending++;
if (je->elapsed_sec >= 3)
- eta_secs[i] = thread_eta(td);
+ eta_secs[__td_index] = thread_eta(td);
else
- eta_secs[i] = INT_MAX;
+ eta_secs[__td_index] = INT_MAX;
check_str_update(td);
}
}
}
- }
+ } end_for_each();
if (exitall_on_terminate) {
je->eta_sec = INT_MAX;
- for_each_td(td, i) {
- if (eta_secs[i] < je->eta_sec)
- je->eta_sec = eta_secs[i];
- }
+ for_each_td_index() {
+ if (eta_secs[__td_index] < je->eta_sec)
+ je->eta_sec = eta_secs[__td_index];
+ } end_for_each();
} else {
unsigned long eta_stone = 0;
je->eta_sec = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
if ((td->runstate == TD_NOT_CREATED) && td->o.stonewall)
- eta_stone += eta_secs[i];
+ eta_stone += eta_secs[__td_index];
else {
- if (eta_secs[i] > je->eta_sec)
- je->eta_sec = eta_secs[i];
+ if (eta_secs[__td_index] > je->eta_sec)
+ je->eta_sec = eta_secs[__td_index];
}
- }
+ } end_for_each();
je->eta_sec += eta_stone;
}
fio_gettime(&now, NULL);
rate_time = mtime_since(&rate_prev_time, &now);
- if (write_bw_log && rate_time > bw_avg_time && !in_ramp_time(td)) {
+ any_td_in_ramp = false;
+ for_each_td(td) {
+ any_td_in_ramp |= in_ramp_time(td);
+ } end_for_each();
+ if (write_bw_log && rate_time > bw_avg_time && !any_td_in_ramp) {
calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes,
je->rate);
memcpy(&rate_prev_time, &now, sizeof(now));
+ regrow_agg_logs();
for_each_rw_ddir(ddir) {
- add_agg_sample(sample_val(je->rate[ddir]), ddir, 0, 0);
+ add_agg_sample(sample_val(je->rate[ddir]), ddir, 0);
}
}
je->nr_threads = thread_number;
update_condensed_str(__run_str, run_str);
memcpy(je->run_str, run_str, strlen(run_str));
- return true;
+ return ret;
}
static int gen_eta_str(struct jobs_eta *je, char *p, size_t left,
char *tr, *mr;
mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2],
- je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC);
+ je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC);
tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2],
- je->sig_figs, 0, je->is_pow2, N2S_BYTEPERSEC);
+ je->sig_figs, 1, je->is_pow2, N2S_BYTEPERSEC);
p += sprintf(p, ", %s-%s", mr, tr);
free(tr);
return NULL;
*size = sizeof(*je) + THREAD_RUNSTR_SZ + 8;
- je = malloc(*size);
+ je = calloc(1, *size);
if (!je)
return NULL;
- memset(je, 0, *size);
if (!calc_thread_status(je, force)) {
free(je);
size_t size;
je = get_jobs_eta(false, &size);
- if (je)
+ if (je) {
display_thread_status(je);
-
- free(je);
+ free(je);
+ }
}
void print_status_init(int thr_number)
--- /dev/null
+; Randomly read/write a block device file at queue depth 16.
+[global]
+filename=/dev/sda
+direct=1
+write_lat_log=prio-run.log
+log_prio=1
+rw=randrw
+ioengine=libaio
+iodepth=16
+
+; Simple cmdprio_bssplit format. All non-zero percentage entries will
+; use the same prio class and prio level defined by the cmdprio_class
+; and cmdprio options.
+[cmdprio]
+; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 100% of the 64kB reads are executed with prio class 1 and prio level 0.
+; All other I/Os are executed without a priority set.
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/100:1024k/0,1024k/0
+cmdprio_class=1
+cmdprio=0
+
+; Advanced cmdprio_bssplit format. Each non-zero percentage entry can
+; use a different prio class and prio level (appended to each entry).
+[cmdprio-adv]
+; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other I/Os are executed without a priority set.
+stonewall
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
+
+; Identical to the previous example, but with a default priority defined.
+[cmdprio-adv-def]
+; 40% of read I/Os are 64kB and 60% are 1MB. 100% of writes are 1MB.
+; 25% of the 64kB reads are executed with prio class 1 and prio level 1,
+; 75% of the 64kB reads are executed with prio class 3 and prio level 2.
+; All other I/Os are executed with prio class 2 and prio level 7.
+stonewall
+prioclass=2
+prio=7
+bssplit=64k/40:1024k/60,1024k/100
+cmdprio_bssplit=64k/25/1/1:64k/75/3/2:1024k/0,1024k/0
+
+; Example of how to use cmdprio_bssplit with Command Duration Limits (CDL)
+; using I/O priority hints. The drive has to support CDL, and CDL has to be
+; enabled in sysfs, otherwise the hints will not be sent down to the drive.
+[cmdprio-hints]
+; 40% of the I/Os are 1MB reads and 60% of the I/Os are 2MB reads.
+;
+; 10% of the 1MB reads are executed with prio class 2 (Best Effort),
+; prio level 0, and prio hint 1. Prio hint 1 means CDL descriptor 1.
+; Since 40% of read I/Os are 1MB, and 10% of the 1MB I/Os use CDL desc 1,
+; this means that 4% of all the issued I/O will use this configuration.
+;
+; 30% of the 1MB reads are executed with prio class 2 (Best Effort),
+; prio level 0, and prio hint 2. Prio hint 2 means CDL descriptor 2.
+; Since 40% of read I/Os are 1MB, and 30% of the 1MB I/Os use CDL desc 2,
+; this means that 12% of all the issued I/O will use this configuration.
+;
+; 60% of the 1MB reads are executed with prio class 2 (Best Effort),
+; prio level 0, and prio hint 0. Prio hint 0 means no hint.
+; Since 40% of read I/Os are 1MB, and 60% of the 1MB I/Os use no hint,
+; this means that 24% of all the issued I/O will use this configuration.
+;
+; 10% of the 2MB reads are executed with prio class 2 (Best Effort),
+; prio level 0, and prio hint 3. Prio hint 3 means CDL descriptor 3.
+; Since 60% of read I/Os are 2MB, and 10% of the 2MB I/Os use CDL desc 3,
+; this means that 6% of all the issued I/O will use this configuration.
+;
+; 90% of the 2MB reads are executed with prio class 2 (Best Effort),
+; prio level 0, and prio hint 0. Prio hint 0 means no hint.
+; Since 60% of read I/Os are 2MB, and 90% of the 2MB I/Os use no hint,
+; this means that 54% of all the issued I/O will use this configuration.
+stonewall
+rw=randread
+bssplit=1M/40:2M/60
+cmdprio_bssplit=1M/10/2/0/1:1M/30/2/0/2:1M/60/2/0/0:2M/10/2/0/3:2M/90/2/0/0
--- /dev/null
+; Read a block device file at queue depth 8
+; with 20 % of the IOs using the high priority RT class
+; and the remaining IOs using the idle priority class
+[global]
+filename=/dev/sda
+direct=1
+write_lat_log=prio-run.log
+log_prio=1
+
+[randread]
+rw=randread
+bs=128k
+ioengine=libaio
+iodepth=8
+prioclass=3
+cmdprio_percentage=20
+cmdprio_class=1
size=100g
rw=randread
norandommap
-time_based=0
+time_based
[global]
ioengine=cpuio
time_based
-runtime=10
+runtime=15
-[burn50percent]
+# The following example loads 2 cores at 50% with the noop (default) mode
+[burn_2x50_noop]
cpuload=50
+numjobs=2
+cpumode=noop
+# Once burn_2x50_noop is over,
+# fio loads 2 cores at 50% with the qsort mode, which draws much more power
+[burn_2x50%_qsort]
+stonewall
+cpuload=50
+numjobs=2
+cpumode=qsort
offset_increment=124g
io_size=120g
offset=120k
-group_reporting=1
+group_reporting
verify_dump=1
loops=2
--- /dev/null
+# Writing to 2 files that share the duplicate blocks.
+# The dedupe working set is spread uniformly such that when
+# each of the jobs choose to perform a dedup operation they will
+# regenerate a buffer from the global space.
+# If you test the dedup ratio on either file by itself the result
+# is likely lower than if you test the ratio of the two files combined.
+#
+# Use `./t/fio-dedupe <file> -C 1 -c 1 -b 4096` to test the total
+# data reduction ratio.
+#
+#
+# Full example of test:
+# $ ./fio ./examples/dedupe-global.fio
+#
+# Checking ratio on a and b individually:
+# $ ./t/fio-dedupe a.0.0 -C 1 -c 1 -b 4096
+#
+# $ Extents=25600, Unique extents=16817 Duplicated extents=5735
+# $ De-dupe ratio: 1:0.52
+# $ De-dupe working set at least: 22.40%
+# $ Fio setting: dedupe_percentage=34
+# $ Unique capacity 33MB
+#
+# ./t/fio-dedupe b.0.0 -C 1 -c 1 -b 4096
+# $ Extents=25600, Unique extents=17009 Duplicated extents=5636
+# $ De-dupe ratio: 1:0.51
+# $ De-dupe working set at least: 22.02%
+# $ Fio setting: dedupe_percentage=34
+# $ Unique capacity 34MB
+#
+# Combining files:
+# $ cat a.0.0 > c.0.0
+# $ cat b.0.0 >> c.0.0
+#
+# Checking data reduction ratio on combined file:
+# $ ./t/fio-dedupe c.0.0 -C 1 -c 1 -b 4096
+# $ Extents=51200, Unique extents=25747 Duplicated extents=11028
+# $ De-dupe ratio: 1:0.99
+# $ De-dupe working set at least: 21.54%
+# $ Fio setting: dedupe_percentage=50
+# $ Unique capacity 51MB
+#
+[global]
+ioengine=libaio
+iodepth=256
+size=100m
+dedupe_mode=working_set
+dedupe_global=1
+dedupe_percentage=50
+blocksize=4k
+rw=write
+buffer_compress_percentage=50
+dedupe_working_set_percentage=50
+
+[a]
+
+[b]
bs=2m
ioengine=dev-dax
norandommap
-time_based=1
+time_based
runtime=30
group_reporting
disable_lat=1
#
iodepth=1
direct=0
-thread=1
+thread
numjobs=16
#
# The dev-dax engine does IO to DAX device that are special character
--- /dev/null
+[global]
+ioengine=dfs
+pool=${POOL}
+cont=${CONT}
+filename_format=fio-test.$jobnum
+
+cpus_allowed_policy=split
+group_reporting=1
+time_based=0
+percentile_list=99.0:99.9:99.99:99.999:99.9999:100
+disable_slat=1
+disable_clat=1
+
+bs=1M
+size=100G
+iodepth=16
+numjobs=16
+
+[daos-seqwrite]
+rw=write
+stonewall
+
+[daos-seqread]
+rw=read
+stonewall
+
+[daos-randwrite]
+rw=randwrite
+stonewall
+
+[daos-randread]
+rw=randread
+stonewall
--- /dev/null
+# Example dircreate job
+#
+# create_on_open is needed so that the open happens during the run and not the
+# setup.
+#
+# openfiles needs to be set so that you do not exceed the maximum allowed open
+# files.
+#
+# filesize needs to be set to a non zero value so fio will actually run, but the
+# IO will not really be done and the write latency numbers will only reflect the
+# open times.
+[global]
+create_on_open=1
+nrfiles=30
+ioengine=dircreate
+fallocate=none
+filesize=4k
+openfiles=1
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
--- /dev/null
+# Example dirdelete job
+
+# The 'dirdelete' engine only does 'rmdir(dirname)'.
+# 'filesize' must be set, so that the directories are created at the setup stage.
+# 'unlink' is better set to 0, since the directory is deleted during the measurement.
+# Options that disable completion latency output, such as 'disable_clat' and 'gtod_reduce', must not be set.
+[global]
+ioengine=dirdelete
+filesize=4k
+nrfiles=200
+unlink=0
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
--- /dev/null
+# Example dirstat job
+
+# The 'dirstat' engine only does 'stat(dirname)'; the file will not be open()ed.
+# 'filesize' must be set, then files will be created at setup stage.
+
+[global]
+ioengine=dirstat
+numjobs=10
+filesize=4k
+nrfiles=5
+thread
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
-; Read disk in zones of 128m/2g, generating a plot of that afterwards
+; Read disk in zones of 256m/2g. Generating a plot of that afterwards
; should give a nice picture of the zoning of this drive
[global]
rw=read
ioengine=libaio
iodepth=2
+zonemode=strided
zonesize=256m
zoneskip=2g
-write_bw_log
-[/dev/sdb]
+[disk-zone-profile]
+filename=/dev/sdb
+write_bw_log
+log_offset=1
# Run e4defrag and aio-dio workers in parallel
[e4defrag]
stonewall
-time_based=30
+time_based
runtime=30
ioengine=e4defrag
buffered=0
########
# Run random e4defrag and various aio workers in parallel
-[e4defrag-fuzzer-4k]
+[e4defrag-fuzzer-4k-bis]
stonewall
continue_on_error=all
inplace=1
bs=4k
donorname=file3.def
filename=file3
-time_based=30
+time_based
rw=randwrite
[buffered-aio-32k]
filename=file3
rw=randrw
runtime=30
-time_based=30
+time_based
numjobs=4
[direct-aio-32k]
filename=file3
rw=randrw
runtime=30
-time_based=30
numjobs=4
rw=randtrim
filename=raicer
-# Verifier thread continiously write to newly allcated blocks
-# and veryfy written content
+# Verifier thread continuously writes to newly allocated blocks
+# and verifies written content
[aio-dio-verifier]
create_on_open=1
verify=crc32c-intel
--- /dev/null
+[global]
+time_based
+runtime=30
+
+[monitoring_noop]
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_noop]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=noop
+
+[sleep]
+# Let the processor cool down for a few seconds
+stonewall
+ioengine=exec
+runtime=10
+program=/bin/sleep
+arguments=%r
+grace_time=0
+std_redirect=0
+
+[monitoring_qsort]
+stonewall
+ioengine=exec
+program=/usr/sbin/turbostat
+arguments=-c package -qS --interval 5 -s Busy%,Bzy_MHz,Avg_MHz,CorWatt,PkgWatt,RAMWatt,PkgTmp
+
+[cpuload_qsort]
+ioengine=cpuio
+cpuload=100
+numjobs=12
+cpumode=qsort
filename=/tmp/test
filesize=1G
blocksize=4096
-group_reporting=1
+group_reporting
exitall=1
[slow1]
[falloc-fuzzer]
stonewall
runtime=10
-time_based=10
+time_based
bssplit=4k/10:64k/50:32k/40
rw=randwrite
numjobs=1
[punch hole-fuzzer]
bs=4k
runtime=10
-time_based=10
+time_based
rw=randtrim
numjobs=2
filename=fragmented_file
-## Mesure IO performance on fragmented file
+## Measure IO performance on fragmented file
[sequential aio-dio write]
stonewall
ioengine=libaio
--- /dev/null
+# Example filedelete job
+
+# The 'filedelete' engine only does 'unlink(filename)'; the file will not be open()ed.
+# 'filesize' must be set; the files will then be created at the setup stage.
+# 'unlink' is best set to 0, since the file is deleted during the measurement.
+# Options that disable completion latency output, such as 'disable_clat' and 'gtod_reduce', must not be set.
+[global]
+ioengine=filedelete
+filesize=4k
+nrfiles=200
+unlink=0
+
+[t0]
+[t1]
+[t2]
+[t3]
+[t4]
+[t5]
bs=4K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
bs=4K
direct=0
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
bs=4K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
bs=256K
direct=0
numjobs=4
-time_based=1
+time_based
runtime=900
[file1]
bs=256K
direct=1
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
bs=256K
direct=0
numjobs=1
-time_based=1
+time_based
runtime=900
[file1]
norandommap
direct=1
loops=500000
-rwmixcycle=40
--- /dev/null
+# Example test for the HTTP engine's S3 support against Amazon AWS.
+# Obviously, you have to adjust the S3 credentials; for this example,
+# they're passed in via the environment.
+# And you can set the SSE Customer Key and Algorithm to test Server
+# Side Encryption.
+#
+
+[global]
+ioengine=http
+name=test
+direct=1
+filename=/larsmb-fio-test/object
+http_verbose=0
+https=on
+http_mode=s3
+http_s3_key=${S3_KEY}
+http_s3_keyid=${S3_ID}
+http_host=s3.eu-central-1.amazonaws.com
+http_s3_region=eu-central-1
+http_s3_sse_customer_key=${SSE_KEY}
+http_s3_sse_customer_algorithm=AES256
+group_reporting
+
+# With verify, this both writes and reads the object
+[create]
+rw=write
+bs=4k
+size=64k
+io_size=4k
+verify=sha256
+
+[trim]
+stonewall
+rw=trim
+bs=4k
+size=64k
+io_size=4k
+
--- /dev/null
+# Example test for the HTTP engine's S3 support against Amazon AWS.
+# Obviously, you have to adjust the S3 credentials; for this example,
+# they're passed in via the environment.
+# This variant adds the storage class parameter: set it to STANDARD for a
+# normal test, or to another storage class for a compression test.
+#
+
+[global]
+ioengine=http
+name=test
+direct=1
+filename=/larsmb-fio-test/object
+http_verbose=0
+https=on
+http_mode=s3
+http_s3_key=${S3_KEY}
+http_s3_keyid=${S3_ID}
+http_host=s3.eu-central-1.amazonaws.com
+http_s3_region=eu-central-1
+http_s3_storage_class=${STORAGE_CLASS}
+group_reporting
+
+# With verify, this both writes and reads the object
+[create]
+rw=write
+bs=4k
+size=64k
+io_size=4k
+verify=sha256
+
+[trim]
+stonewall
+rw=trim
+bs=4k
+size=64k
+io_size=4k
+
blockalign=4k
random_distribution=zoned:50/5:30/15:20/80
filename=/dev/nvme0n1
-group_reporting=1
+group_reporting
--- /dev/null
+; Benchmark accessing a regular file or block device using libblkio.
+;
+; Replace "/dev/nvme0n1" below with the path to your file or device, or override
+; it by passing the '--libblkio_path=...' flag to fio.
+;
+; In the example below, the two subjobs of "job-B" *and* the single subjob of
+; "job-C" will share a single libblkio instance, and "job-A" will use a separate
+; libblkio instance.
+;
+; For information on libblkio, see: https://gitlab.com/libblkio/libblkio
+
+[global]
+ioengine=libblkio
+libblkio_driver=io_uring
+libblkio_path=/dev/nvme0n1 ; REPLACE THIS WITH THE RIGHT PATH
+rw=randread
+blocksize=4k
+direct=1
+time_based=1
+runtime=10s
+
+[job-A]
+
+[job-B]
+numjobs=2 ; run two copies of this job simultaneously
+thread=1 ; have each copy run as a separate thread in the *same* process
+
+[job-C]
+thread=1 ; have the job run as a thread in the *same* process as "job-B"
--- /dev/null
+; Benchmark accessing a PCI virtio-blk device using libblkio.
+;
+; Replace "/sys/bus/pci/devices/0000:00:01.0" below with the path to your
+; device's sysfs directory, or override it by passing the '--libblkio_path=...'
+; flag to fio.
+;
+; In the example below, the two subjobs of "job-B" *and* the single subjob of
+; "job-C" will share a single libblkio instance, and "job-A" will use a separate
+; libblkio instance.
+;
+; For information on libblkio, see: https://gitlab.com/libblkio/libblkio
+
+[global]
+ioengine=libblkio
+libblkio_driver=virtio-blk-vfio-pci
+libblkio_path=/sys/bus/pci/devices/0000:00:01.0 ; REPLACE THIS WITH THE RIGHT PATH
+rw=randread
+blocksize=4k
+time_based=1
+runtime=10s
+
+[job-A]
+
+[job-B]
+numjobs=2 ; run two copies of this job simultaneously
+thread=1 ; have each copy run as a separate thread in the *same* process
+
+[job-C]
+thread=1 ; have the job run as a thread in the *same* process as "job-B"
--- /dev/null
+# Example libcufile job, using cufile I/O
+#
+# Required environment variables:
+# GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+# FIO_DIR : 'directory'. This job uses cuda_io=cufile, so path(s) must
+# point to GPUDirect Storage filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=cufile
+# 'direct' must be 1 when using cuda_io=cufile
+direct=1
+# Performance is negatively affected if 'bs' is not a multiple of 4k.
+# Refer to GDS cuFile documentation.
+bs=1m
+size=1m
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads.
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
--- /dev/null
+# Example libcufile job, using POSIX I/O
+#
+# Required environment variables:
+# GPU_DEV_IDS : refer to option 'gpu_dev_ids'
+# FIO_DIR : 'directory'. cuda_io=posix, so the path(s) may point
+# to any POSIX filesystem(s)
+#
+
+[global]
+ioengine=libcufile
+directory=${FIO_DIR}
+gpu_dev_ids=${GPU_DEV_IDS}
+cuda_io=posix
+# 'direct' may be 1 or 0 when using cuda_io=posix
+direct=0
+# there are no unusual requirements for 'bs' when cuda_io=posix
+bs=1m
+size=1G
+numjobs=16
+# cudaMalloc fails if too many processes attach to the GPU, use threads
+thread
+
+[read]
+rw=read
+
+[write]
+rw=write
+
+[randread]
+rw=randread
+
+[randwrite]
+rw=randwrite
+
+[verify]
+rw=write
+verify=md5
+
+[randverify]
+rw=randwrite
+verify=md5
[global]
bs=4k
-size=8g
+size=10g
ioengine=libpmem
norandommap
-time_based=1
+time_based
group_reporting
invalidate=1
disable_lat=1
iodepth=1
iodepth_batch=1
-thread=1
+thread
numjobs=1
runtime=300
-#
-# In case of 'scramble_buffers=1', the source buffer
-# is rewritten with a random value every write operations.
-#
-# But when 'scramble_buffers=0' is set, the source buffer isn't
-# rewritten. So it will be likely that the source buffer is in CPU
-# cache and it seems to be high performance.
-#
-scramble_buffers=0
-
#
# depends on direct option, flags are set for pmem_memcpy() call:
# direct=1 - PMEM_F_MEM_NONTEMPORAL,
#
sync=1
+#
+# In case of 'scramble_buffers=1', the source buffer
+# is rewritten with a random value every write operation.
+#
+# But when 'scramble_buffers=0' is set, the source buffer isn't
+# rewritten. So it will be likely that the source buffer is in CPU
+# cache, which makes write performance appear high.
+#
+scramble_buffers=1
#
-# Setting for fio process's CPU Node and Memory Node
+# Setting for fio process's CPU Node and Memory Node.
+# Set proper node below or use `numactl` command along with FIO.
#
numa_cpu_nodes=0
numa_mem_policy=bind:0
#
# The libpmem engine does IO to files in a DAX-mounted filesystem.
-# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
+# The filesystem should be created on a Non-Volatile DIMM (e.g /dev/pmem0)
# and then mounted with the '-o dax' option. Note that the engine
# accesses the underlying NVDIMM directly, bypassing the kernel block
# layer, so the usual filesystem/disk performance monitoring tools such
# as iostat will not provide useful data.
#
-directory=/mnt/pmem0
+#filename=/mnt/pmem/somefile
+directory=/mnt/pmem
[libpmem-seqwrite]
rw=write
stonewall
-#[libpmem-seqread]
-#rw=read
-#stonewall
+[libpmem-seqread]
+rw=read
+stonewall
#[libpmem-randwrite]
#rw=randwrite
--- /dev/null
+# Example of the librpma_apm_client job
+
+[global]
+ioengine=librpma_apm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] #IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # read/write/randread/randwrite/readwrite/rw
+rwmixread=70 # % of a mixed workload that should be reads
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
--- /dev/null
+# Example of the librpma_apm_server job
+
+[global]
+ioengine=librpma_apm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] # IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each and every working thread), waits for it to end,
+# and then closes itself.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+
+numjobs=1 # number of expected incoming connections
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
--- /dev/null
+# Example of the librpma_gpspm_client job
+
+[global]
+ioengine=librpma_gpspm_client
+create_serialize=0 # (required) forces specific initiation sequence
+serverip=[serverip] #IP address the server is listening on
+port=7204 # port(s) the server will listen on, <port; port + numjobs - 1> will be used
+thread
+
+# The client will get a remote memory region description after establishing
+# a connection.
+
+[client]
+numjobs=1 # number of parallel connections
+group_reporting=1
+sync=1 # 1 is the best for latency measurements, 0 for bandwidth
+iodepth=2 # total number of ious
+iodepth_batch_submit=1 # number of ious to be submitted at once
+rw=write # write/randwrite
+blocksize=4KiB
+ramp_time=15s # gives some time to stabilize the workload
+time_based
+runtime=60s # run the workload for the specified period of time
--- /dev/null
+# Example of the librpma_gpspm_server job
+
+[global]
+ioengine=librpma_gpspm_server
+create_serialize=0 # (required) forces specific initiation sequence
+kb_base=1000 # turn on the straight units handling (non-compatibility mode)
+serverip=[serverip] #IP address to listen on
+port=7204 # port(s) the server jobs will listen on, ports <port; port + numjobs - 1> will be used
+thread
+
+# The server side spawns one thread for each expected connection from
+# the client-side, opens and registers the range dedicated for this thread
+# (a workspace) from the provided memory.
+# Each of the server threads accepts a connection on the dedicated port
+# (different for each and every working thread), accepts and executes flush
+# requests, and sends back a flush response for each of the requests.
+# When the client is done it sends the termination notice to the server's thread.
+
+[server]
+# set to 1 (true) ONLY when Direct Write to PMem from the remote host is possible
+# (https://pmem.io/rpma/documentation/basic-direct-write-to-pmem.html)
+direct_write_to_pmem=0
+# set to 0 (false) to wait for completion instead of busy-wait polling completion.
+busy_wait_polling=1
+numjobs=1 # number of expected incoming connections
+iodepth=2 # number of parallel GPSPM requests
+size=100MiB # size of workspace for a single connection
+filename=malloc # device dax or an existing fsdax file or "malloc" for allocation from DRAM
+# filename=/dev/dax1.0
+
+# The client will terminate the server when the client finishes its job.
+time_based
+runtime=365d
bs=512K
direct=1
numjobs=16
-time_based=1
+time_based
runtime=300
[dev1]
blocksize=512,512,16384
skip_bad=1
-[write]
+[trim]
stonewall
rw=trim
stonewall
rw=write
-[write]
+[trimwrite]
stonewall
block_error_percentiles=1
rw=trimwrite
-# To use fio to test nbdkit:
+# To use fio to test nbdkit + RAM disk:
#
-# nbdkit -U - memory size=256M --run 'export unixsocket; fio examples/nbd.fio'
+# nbdkit -U - memory size=256M --run 'export uri; fio examples/nbd.fio'
#
-# To use fio to test qemu-nbd:
+# To use fio to test nbdkit + local file:
#
-# rm -f /tmp/disk.img /tmp/socket
-# truncate -s 256M /tmp/disk.img
-# export unixsocket=/tmp/socket
-# qemu-nbd -t -k $unixsocket -f raw /tmp/disk.img &
-# fio examples/nbd.fio
-# killall qemu-nbd
+# rm -f /var/tmp/disk.img
+# truncate -s 256M /var/tmp/disk.img
+# nbdkit -U - file /var/tmp/disk.img --run 'export uri; fio examples/nbd.fio'
+#
+# To use fio to test qemu-nbd + local file:
+#
+# rm -f /var/tmp/disk.img /var/tmp/socket
+# truncate -s 256M /var/tmp/disk.img
+# export uri='nbd+unix:///?socket=/var/tmp/socket'
+# qemu-nbd -t -k /var/tmp/socket -f raw /var/tmp/disk.img &
+# fio examples/nbd.fio
+# killall qemu-nbd
[global]
ioengine=nbd
-uri=nbd+unix:///?socket=${unixsocket}
-# Starting from nbdkit 1.14 the following will work:
-#uri=${uri}
+uri=${uri}
rw=randrw
time_based
runtime=60
--- /dev/null
+# Example network vsock job, just defines two clients that send/recv data
+[global]
+ioengine=net
+
+port=8888
+protocol=vsock
+bs=4k
+size=100g
+
+#set the below option to enable end-to-end data integrity tests
+#verify=md5
+
+[receiver]
+listen
+rw=read
+
+[sender]
+# 1 (VMADDR_CID_LOCAL) is the well-known address
+# for local communication (loopback)
+hostname=1
+startdelay=1
+rw=write
--- /dev/null
+# Example network vsock job, just defines a receiver
+[global]
+ioengine=net
+port=8888
+protocol=vsock
+bs=4k
+size=100g
+
+#set the below option to enable end-to-end data integrity tests
+#verify=md5
+
+[receiver]
+listen
+rw=read
--- /dev/null
+# Example network vsock job, just defines a sender
+[global]
+ioengine=net
+port=8888
+protocol=vsock
+bs=4k
+size=100g
+
+#set the below option to enable end-to-end data integrity tests
+#verify=md5
+
+[sender]
+# set the 'hostname' option to the CID of the listening domain
+hostname=3
+startdelay=1
+rw=write
+
--- /dev/null
+[global]
+nfs_url=nfs://127.0.0.1/nfs
+blocksize=524288
+iodepth=10
+ioengine=nfs
+size=104857600
+lat_percentiles=1
+group_reporting
+numjobs=10
+ramp_time=5s
+filename_format=myfiles.$clientuid.$jobnum.$filenum
+time_based=1
+
+[write]
+rw=write
+runtime=10s
+stonewall
+
+[read]
+wait_for=write
+rw=randread
+runtime=10s
size=100g
rw=randread
norandommap
-time_based=0
+++ /dev/null
-[global]
-bs=1m
-ioengine=pmemblk
-norandommap
-time_based=1
-runtime=30
-group_reporting
-disable_lat=1
-disable_slat=1
-disable_clat=1
-clat_percentiles=0
-cpus_allowed_policy=split
-
-# For the pmemblk engine:
-#
-# IOs always complete immediately
-# IOs are always direct
-# Must use threads
-#
-iodepth=1
-direct=1
-thread=1
-numjobs=16
-#
-# Unlink can be used to remove the files when done, but if you are
-# using serial runs with stonewall, and you want the files to be created
-# only once and unlinked only at the very end, then put the unlink=1
-# in the last group. This is the method demonstrated here.
-#
-# Note that if you have a read-only group and if the files will be
-# newly created, then all of the data will read back as zero and the
-# read will be optimized, yielding performance that is different from
-# that of reading non-zero blocks (or unoptimized zero blocks).
-#
-unlink=0
-#
-# The pmemblk engine does IO to files in a DAX-mounted filesystem.
-# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
-# and then mounted with the '-o dax' option. Note that the engine
-# accesses the underlying NVDIMM directly, bypassing the kernel block
-# layer, so the usual filesystem/disk performance monitoring tools such
-# as iostat will not provide useful data.
-#
-# Here we specify a test file on each of two NVDIMMs. The first
-# number after the file name is the block size in bytes (4096 bytes
-# in this example). The second number is the size of the file to
-# create in MiB (1 GiB in this example); note that the actual usable
-# space available to fio will be less than this as libpmemblk requires
-# some space for metadata.
-#
-# Currently, the minimum block size is 512 bytes and the minimum file
-# size is about 17 MiB (these are libpmemblk requirements).
-#
-# While both files in this example have the same block size and file
-# size, this is not required.
-#
-filename=/pmem0/fio-test,4096,1024
-filename=/pmem1/fio-test,4096,1024
-
-[pmemblk-write]
-rw=randwrite
-stonewall
-
-[pmemblk-read]
-rw=randread
-stonewall
-#
-# We're done, so unlink the file:
-#
-unlink=1
-
ioengine=rados
clientname=admin
pool=rados
+conf=/etc/ceph/ceph.conf
busy_poll=0
rw=randwrite
bs=4k
# The above applies to all of reads/writes/trims. If we wanted to do
# something differently for writes, let's say 50% for the first 10%
# and 50% for the remaining 90%, we could do it by adding a new section
-# after a a comma.
+# after a comma.
# random_distribution=zoned:50/5:30/15:20/,50/10:50/90
--- /dev/null
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL *
+# * DESTROY DATA ON /dev/sdb *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# The jobs with fail in the name should produce errors
+#
+# job description
+# precon precondition the device by writing with a known
+# pattern
+# verify01 verify each block one at a time by comparing to known
+# pattern
+# verify01-fail verifying one too many blocks should produce a failure
+# verify11-one_ios verify all 20 blocks by sending only 512 bytes
+# verify11-fail verifying beyond the preconditioned region should
+# produce a failure
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+number_ios=20
+
+[verify01-fail]
+sg_write_mode=verify_bytchk_01
+number_ios=21
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10240
+
+[verify11-fail]
+sg_write_mode=verify_bytchk_11
+number_ios=1
+bs=10752
--- /dev/null
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL *
+# * DESTROY DATA ON /dev/sdb *
+# **********************************
+#
+# Test SCSI VERIFY commands issued via the sg ioengine
+# All of the jobs below should complete without error
+#
+# job description
+# precon precondition the device by writing with a known
+# pattern
+# verify00 verify written data on medium only
+# verify01 verify each block one at a time by comparing to known
+# pattern
+# verify01-two_ios verify same data but with only two VERIFY operations
+# verify11 verify each block one at a time
+# verify11-five_ios verify data with five IOs, four blocks at a time,
+# sending 512 bytes for each IO
+# verify11-one_ios verify all 20 blocks by sending only 512 bytes
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+number_ios=20
+stonewall
+
+[precon]
+
+[verify00]
+sg_write_mode=verify_bytchk_00
+
+[verify01]
+sg_write_mode=verify_bytchk_01
+
+[verify01-two_ios]
+sg_write_mode=verify_bytchk_01
+bs=5120
+number_ios=2
+
+[verify11]
+sg_write_mode=verify_bytchk_11
+
+[verify11-five_ios]
+sg_write_mode=verify_bytchk_11
+bs=2048
+number_ios=5
+
+[verify11-one_ios]
+sg_write_mode=verify_bytchk_11
+bs=10240
+number_ios=1
--- /dev/null
+#
+# **********************************
+# * !!THIS IS A DESTRUCTIVE TEST!! *
+# * IF NOT CHANGED THIS TEST WILL *
+# * DESTROY DATA ON /dev/sdb *
+# **********************************
+#
+# Test WRITE SAME commands with the NDOB flag set
+# issued via the sg ioengine
+# All of the jobs below should complete without error
+# except the last one
+#
+# job description
+# precon Precondition the device by writing 20 blocks with a
+# known pattern
+# write_same_ndob Write 19 sectors of all zeroes with the NDOB flag set
+# verify-pass Verify 19 blocks of all zeroes
+# verify-fail Verify 20 blocks of all zeroes. This should fail.
+#
+
+[global]
+filename=/dev/sdb
+buffer_pattern=0x01
+ioengine=sg
+rw=write
+bs=512
+stonewall
+
+[precon]
+number_ios=20
+
+[write_same_ndob]
+sg_write_mode=write_same_ndob
+number_ios=19
+
+[verify-pass]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=19
+
+[verify-fail]
+sg_write_mode=verify_bytchk_01
+buffer_pattern=0x00
+number_ios=20
[global]
threads=1
-group_reporting=1
+group_reporting
time_based
size=128m
; writes 512 byte verification blocks until the disk is full,
; then verifies written data
[global]
-thread=1
+thread
bs=64k
direct=1
ioengine=sync
--- /dev/null
+# io_uring_cmd I/O engine for nvme-ns generic character device with FDP enabled
+# This assumes the namespace is already configured with FDP support and has at
+# least 8 available reclaim units.
+#
+# Each job targets different ranges of LBAs with different placement
+# identifiers, and has different write intensity.
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+iodepth=32
+bs=4K
+fdp=1
+time_based=1
+runtime=1000
+
+[write-heavy]
+rw=randrw
+rwmixwrite=90
+fdp_pli=0,1,2,3
+offset=0%
+size=30%
+
+[write-mid]
+rw=randrw
+rwmixwrite=30
+fdp_pli=4,5
+offset=30%
+size=30%
+
+[write-light]
+rw=randrw
+rwmixwrite=10
+fdp_pli=6
+offset=60%
+size=30%
--- /dev/null
+# io_uring_cmd I/O engine for nvme-ns generic character device
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+size=1G
+iodepth=32
+bs=4K
+thread=1
+stonewall=1
+
+[rand-write]
+rw=randwrite
+sqthread_poll=1
+
+[rand-read]
+rw=randread
+
+[write-opts]
+rw=write
+sqthread_poll=1
+sqthread_poll_cpu=0
+nonvectored=1
+registerfiles=1
--- /dev/null
+# Protection information test with io_uring_cmd I/O engine for nvme-ns generic
+# character device.
+#
+# This requires nvme device to be formatted with extended LBA data size and
+# protection information enabled. This can be done with nvme-cli utility.
+# Replace bs below with the correct extended LBA size.
+#
+# First we sequentially write to the device, without protection information
+# action being set. FIO will generate and send necessary protection
+# information data as per the protection information check option. Later on we
+# sequentially read and verify the device returned protection information data.
+#
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+size=1G
+iodepth=32
+bs=4160
+pi_act=0
+pi_chk=GUARD,APPTAG,REFTAG
+apptag=0x0888
+apptag_mask=0xFFFF
+thread=1
+stonewall=1
+
+[write]
+rw=write
+
+[read]
+rw=read
--- /dev/null
+# Protection information test with io_uring_cmd I/O engine for nvme-ns generic
+# character device.
+#
+# This requires nvme device to be formatted with separate metadata buffer and
+# protection information enabled. This can be done with nvme-cli utility.
+# Replace md_per_io_size as per the required metadata buffer size for each IO.
+#
+# First we sequentially write to the device, without protection information
+# action being set. FIO will generate and send necessary protection
+# information data as per the protection information check option. Later on we
+# sequentially read and verify the device returned protection information data.
+#
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+size=1G
+iodepth=32
+bs=4096
+md_per_io_size=64
+pi_act=0
+pi_chk=GUARD,APPTAG,REFTAG
+apptag=0x0888
+apptag_mask=0xFFFF
+thread=1
+stonewall=1
+
+[write]
+rw=write
+
+[read]
+rw=read
--- /dev/null
+# Multi-range trim command test with io_uring_cmd I/O engine for nvme-ns
+# generic character device.
+#
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+size=10M
+iodepth=32
+thread=1
+stonewall=1
+
+[write_bs]
+bs=4096
+rw=randtrim
+num_range=8
+
+[write_bssplit]
+bssplit=4k/10:64k/50:32k/40
+rw=trim
+num_range=8
--- /dev/null
+# io_uring_cmd I/O engine for nvme-ns generic zoned character device
+#
+# NOTE:
+# Regular writes against a zone should be limited to QD1, as the device can
+# reorder the requests.
+#
+# As the passthrough path does not use an IO scheduler (such as mq-deadline),
+# the queue depth should be limited to 1 to avoid zone invalid writes.
+
+[global]
+filename=/dev/ng0n1
+ioengine=io_uring_cmd
+cmd_type=nvme
+zonemode=zbd
+size=1G
+iodepth=1
+bs=256K
+verify=crc32c
+stonewall=1
+
+[rand-write]
+rw=randwrite
+
+[write-opts]
+rw=write
+registerfiles=1
+sqthread_poll=1
+sqthread_poll_cpu=0
+
+[randwrite-opts]
+rw=randwrite
+sqthread_poll=1
+sqthread_poll_cpu=0
+nonvectored=1
+registerfiles=1
[global]
threads=1
-group_reporting=1
+group_reporting
filename=/tmp/data
filesize=128m
--- /dev/null
+; Compare fio IO engines with a random-read workload using BS=4k at QD=1
+;
+; README
+;
+; This job-file is intended to be used as:
+;
+; # Use the built-in io_uring engine to get baseline numbers
+; fio examples/xnvme-compare.fio \
+; --section=default \
+; --ioengine=io_uring \
+; --sqthread_poll=1 \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with Linux backend and io_uring async. impl.
+; fio examples/xnvme-compare.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --sqthread_poll=1 \
+; --xnvme_async=io_uring \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with Linux backend and libaio async. impl.
+; fio examples/xnvme-compare.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_async=libaio \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with SPDK backend, note that you have to set the Namespace-id
+; fio examples/xnvme-compare.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_dev_nsid=1 \
+; --filename=0000\\:01\\:00.0
+;
+; NOTE: In the URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: If you want to override the default bs, iodepth, and workload, then
+; invoke it as:
+;
+; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-compare.fio \
+; --section=override
+;
+[global]
+rw=randread
+size=12G
+iodepth=1
+bs=4K
+direct=1
+thread=1
+time_based=1
+runtime=7
+ramp_time=3
+norandommap=1
+
+; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0"
+allow_file_create=0
+
+[default]
+
+[override]
+rw=${FIO_RW}
+iodepth=${FIO_IODEPTH}
+bs=${FIO_BS}
--- /dev/null
+; README
+;
+; This job-file is intended to be used either as:
+;
+; # Use the xNVMe io-engine with the io_uring_cmd async. impl.
+; fio examples/xnvme-fdp.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_async=io_uring_cmd \
+; --filename=/dev/ng0n1
+;
+; # Use the xNVMe io-engine with nvme sync. impl.
+; fio examples/xnvme-fdp.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_sync=nvme \
+; --filename=/dev/ng0n1
+;
+; # Use the xNVMe io-engine with SPDK backend, note that you have to set the Namespace-id
+; fio examples/xnvme-fdp.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_dev_nsid=1 \
+; --filename=0000\\:01\\:00.0
+;
+; NOTE: In the URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: If you want to override the default bs, iodepth, and workload, then
+; invoke it as:
+;
+; FIO_BS="512" FIO_RW="read" FIO_IODEPTH=16 fio examples/xnvme-fdp.fio \
+; --section=override --ioengine=xnvme --xnvme_sync=nvme --filename=/dev/ng0n1
+;
+[global]
+rw=randwrite
+size=2M
+iodepth=1
+bs=4K
+thread=1
+fdp=1
+fdp_pli=4,5
+
+[default]
+
+[override]
+rw=${FIO_RW}
+iodepth=${FIO_IODEPTH}
+bs=${FIO_BS}
--- /dev/null
+; README
+;
+; This job-file is intended to be used either as:
+;
+; # Use the xNVMe io-engine with the io_uring_cmd async. impl.
+; fio examples/xnvme-pi.fio \
+; --ioengine=xnvme \
+; --xnvme_async=io_uring_cmd \
+; --filename=/dev/ng0n1
+;
+; # Use the xNVMe io-engine with nvme sync. impl.
+; fio examples/xnvme-pi.fio \
+; --ioengine=xnvme \
+; --xnvme_sync=nvme \
+; --filename=/dev/ng0n1
+;
+; # Use the xNVMe io-engine with SPDK backend, note that you have to set the Namespace-id
+; fio examples/xnvme-pi.fio \
+; --ioengine=xnvme \
+; --xnvme_dev_nsid=1 \
+; --filename=0000\\:01\\:00.0
+;
+; NOTE: In the URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: This example configuration assumes that the NVMe device is formatted
+; with a separate metadata buffer. If you want to run on an extended LBA format
+; update the "bs" accordingly.
+;
+[global]
+size=100M
+iodepth=16
+bs=4K
+md_per_io_size=64
+pi_act=0
+pi_chk=GUARD,APPTAG,REFTAG
+apptag=0x0234
+apptag_mask=0xFFFF
+thread=1
+stonewall=1
+
+[write]
+rw=write
+
+[read]
+rw=read
--- /dev/null
+; Running xNVMe/fio on a Zoned Device
+;
+; Writes 1GB at QD1 using 4K BS and verifies it.
+;
+; README
+;
+; This job-file is intended to be used as:
+;
+; # Use the built-in io_uring engine to get baseline numbers
+; fio examples/xnvme-zoned.fio \
+; --section=default \
+; --ioengine=io_uring \
+; --sqthread_poll=1 \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with Linux backend and io_uring async. impl.
+; fio examples/xnvme-zoned.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --sqthread_poll=1 \
+; --xnvme_async=io_uring \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with Linux backend and libaio async. impl.
+; fio examples/xnvme-zoned.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_async=libaio \
+; --filename=/dev/nvme0n1
+;
+; # Use the xNVMe io-engine with SPDK backend, note that you have to set the Namespace-id
+; fio examples/xnvme-zoned.fio \
+; --section=default \
+; --ioengine=xnvme \
+; --xnvme_dev_nsid=1 \
+; --filename=0000\\:01\\:00.0
+;
+; NOTE: In the URI encoded in the filename above, the ":" must be escaped.
+;
+; On the command-line using two "\\":
+;
+; --filename=0000\\:01\\:00.0
+;
+; Within a fio-script using a single "\":
+;
+; filename=0000\:01\:00.0
+;
+; NOTE: If you want to override the default bs, iodepth, and workload, then
+; invoke it as:
+;
+; FIO_BS="512" FIO_RW="verify" FIO_IODEPTH=16 fio examples/xnvme-zoned.fio \
+; --section=override
+;
+; To reset all zones on the device to EMPTY state aka. wipe the entire device.
+;
+; # zoned mgmt-reset /dev/nvme0n2 --slba 0x0 --all
+;
+[global]
+zonemode=zbd
+rw=write
+size=1G
+iodepth=1
+bs=4K
+direct=1
+thread=1
+ramp_time=1
+norandommap=1
+verify=crc32c
+; Avoid accidentally creating device files; e.g. "/dev/nvme0n1", "/dev/nullb0"
+allow_file_create=0
+;
+; NOTE: If fio complains about zone-size, then run:
+;
+; # zoned info /dev/nvme0n1
+;
+; The command will provide the values you need, then in the fio-script define:
+;
+; zonesize=nsect * nbytes
+;
+;zonesize=
+
+[default]
+
+[override]
+rw=${FIO_RW}
+iodepth=${FIO_IODEPTH}
+bs=${FIO_BS}
--- /dev/null
+; Using the libaio ioengine, random write to a (zoned) block device. Write
+; target zones are chosen randomly among the first 128 zones starting from
+; device offset corresponding to the 524th zone of the device (524 x 256 MB).
+; For first 3 seconds, run only random write. After that, run random write job
+; and garbage collection simulation job in parallel. The garbage collection
+; simulation job runs trim workload to reset the 128 zones randomly. Use flow
+; option to make the zone resets happen every 128 block writes by the other
+; job. This example does not specify max_open_zones. The limit of maximum
+; open zones is obtained from the target block device.
+
+[global]
+group_reporting
+zonemode=zbd
+zonesize=256M
+direct=1
+time_based
+runtime=30
+
+filename=/dev/sdb
+offset=524z
+
+[warmup]
+rw=randwrite
+bs=2M
+size=128z
+ioengine=libaio
+runtime=3
+
+[wjob]
+wait_for=warmup
+rw=randwrite
+bs=2M
+size=128z
+ioengine=libaio
+flow=128
+
+[trimjob]
+wait_for=warmup
+rw=randtrim
+bs=256M
+size=128z
+ioengine=psync
+flow=1
--- /dev/null
+; Using the psync ioengine, random write to a (zoned) block device. Write
+; target zones are chosen randomly among the first 8 zones starting from device
+; offset corresponding to the 524th zone of the device (524 x 256 MB). Simulate
+; garbage collection operation using zone_reset_threshold and
+; zone_reset_frequency options. The zone resets happen when total written data
+; bytes is beyond 70% of 8 zones, and 8 = 1 / 0.125 blocks are written. This
+; example does not specify max_open_zones. The limit of maximum open zones is
+; obtained from the target block device.
+
+[global]
+name=zbd-rand-write-gc
+group_reporting
+rw=randwrite
+zonemode=zbd
+zonesize=256M
+bs=32M
+direct=1
+time_based
+runtime=40
+
+[dev1]
+filename=/dev/sdb
+size=8z
+offset=524z
+ioengine=psync
+zone_reset_threshold=0.7
+zone_reset_frequency=0.125
-; Using the libaio ioengine, random write to a (zoned) block device,
+; Using the psync ioengine, random write to a (zoned) block device,
; writing at most 32 zones at a time. Target zones are chosen randomly
; and writes directed at the write pointer of the chosen zones
bs=512K
direct=1
numjobs=16
-time_based=1
+time_based
runtime=180
[dev1]
/* Forward declarations */
struct zoned_block_device_info;
+struct fdp_ruh_info;
/*
* The type of object we are working on
uint64_t file_offset;
uint64_t io_size;
+ struct fio_ruhs_info *ruhs_info;
+
/*
* Zoned block device information. See also zonemode=zbd.
*/
unsigned int last_write_idx;
/*
- * For use by the io engine for offset or private data storage
+ * For use by the io engine to store offset
*/
- union {
- uint64_t engine_pos;
- void *engine_data;
- };
+ uint64_t engine_pos;
+
+ /*
+ * For use by the io engine for private data storage
+ */
+ void *engine_data;
/*
* if io is protected by a semaphore, this is set
extern int __must_check generic_open_file(struct thread_data *, struct fio_file *);
extern int __must_check generic_close_file(struct thread_data *, struct fio_file *);
extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *);
+extern int __must_check generic_prepopulate_file(struct thread_data *, struct fio_file *);
#ifdef __cplusplus
}
#endif
if (!f->file_name)
continue;
- if (!strcmp(f->file_name, name)) {
- assert(f->fd != -1);
+ if (!strcmp(f->file_name, name))
return f;
- }
}
return NULL;
if (r < 0) {
int __e = errno;
- if (__e == ENOSPC) {
+ if (__e == ENOSPC || __e == EDQUOT) {
+ const char *__e_name;
if (td->o.fill_device)
break;
- log_info("fio: ENOSPC on laying out "
- "file, stopping\n");
- break;
+ if (__e == ENOSPC)
+ __e_name = "ENOSPC";
+ else
+ __e_name = "EDQUOT";
+ log_info("fio: %s on laying out "
+ "file, stopping\n", __e_name);
}
td_verror(td, errno, "write");
} else
td_verror(td, EIO, "write");
- break;
+ goto err;
}
}
if (bs > left)
bs = left;
- b = malloc(bs);
+ b = calloc(1, bs);
if (!b) {
td_verror(td, errno, "malloc");
ret = false;
goto error;
}
- memset(b, 0, bs);
if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) {
td_verror(td, errno, "lseek");
return ret;
}
+/*
+ * Generic function to prepopulate a regular file with data.
+ * Useful if you want to make sure the I/O engine has data to read.
+ *
+ * Opens f->file_name write-only (creating it when td->o.allow_create is
+ * set) and writes f->real_file_size bytes produced by fill_io_buffer(),
+ * in chunks of at most td->o.max_bs[DDIR_WRITE] bytes. If the job is
+ * terminated while writing, the file is unlinked; otherwise the file is
+ * fsync'ed when td->o.create_fsync is set.
+ *
+ * Returns 0 on success, and also when writing is refused because fio runs
+ * in read-only mode (in which case the file is not opened at all).
+ * Returns 1 on failure; if the failure happens after the open, f->fd is
+ * closed and reset to -1.
+ * Leaves f->fd open on success, caller must close.
+ */
+int generic_prepopulate_file(struct thread_data *td, struct fio_file *f)
+{
+	int flags;
+	unsigned long long left, bs;
+	char *b = NULL;
+
+	/* generic function for regular files only */
+	assert(f->filetype == FIO_TYPE_FILE);
+
+	if (read_only) {
+		log_err("fio: refusing to write a file due to read-only\n");
+		return 0;
+	}
+
+	flags = O_WRONLY;
+	if (td->o.allow_create)
+		flags |= O_CREAT;
+
+#ifdef WIN32
+	flags |= _O_BINARY;
+#endif
+
+	dprint(FD_FILE, "open file %s, flags %x\n", f->file_name, flags);
+	f->fd = open(f->file_name, flags, 0644);
+	if (f->fd < 0) {
+		/* save errno before any other call can overwrite it */
+		int err = errno;
+
+		if (err == ENOENT && !td->o.allow_create)
+			log_err("fio: file creation disallowed by "
+				"allow_file_create=0\n");
+		else
+			td_verror(td, err, "open");
+		return 1;
+	}
+
+	left = f->real_file_size;
+	bs = td->o.max_bs[DDIR_WRITE];
+	if (bs > left)
+		bs = left;
+
+	/*
+	 * malloc(0) may legitimately return NULL, so only treat a NULL
+	 * result as an allocation failure when a buffer is really needed.
+	 */
+	b = malloc(bs);
+	if (!b && bs) {
+		td_verror(td, errno, "malloc");
+		goto err;
+	}
+
+	while (left && !td->terminate) {
+		ssize_t r;
+
+		/* the final chunk may be shorter than the block size */
+		if (bs > left)
+			bs = left;
+
+		fill_io_buffer(td, b, bs, bs);
+
+		r = write(f->fd, b, bs);
+		if (r > 0) {
+			/* short writes are fine, just account what went out */
+			left -= r;
+			continue;
+		}
+
+		/*
+		 * errno is only meaningful when write() returned -1. A zero
+		 * return made no progress, report it as EIO like the file
+		 * layout code does.
+		 */
+		if (r < 0)
+			td_verror(td, errno, "write");
+		else
+			td_verror(td, EIO, "write");
+		goto err;
+	}
+
+	if (td->terminate) {
+		dprint(FD_FILE, "terminate unlink %s\n", f->file_name);
+		td_io_unlink_file(td, f);
+	} else if (td->o.create_fsync) {
+		if (fsync(f->fd) < 0) {
+			td_verror(td, errno, "fsync");
+			goto err;
+		}
+	}
+
+	free(b);
+	return 0;
+err:
+	close(f->fd);
+	f->fd = -1;
+	/* free(NULL) is a no-op, no guard needed */
+	free(b);
+	return 1;
+}
+
unsigned long long get_rand_file_size(struct thread_data *td)
{
unsigned long long ret, sized;
f_out = stderr;
}
- if (td_trim(td))
- goto skip_flags;
if (td->o.odirect)
flags |= OS_O_DIRECT;
- if (td->o.oatomic) {
- if (!FIO_O_ATOMIC) {
- td_verror(td, EINVAL, "OS does not support atomic IO");
- return 1;
- }
- flags |= OS_O_DIRECT | FIO_O_ATOMIC;
- }
- if (td->o.sync_io)
- flags |= O_SYNC;
+ flags |= td->o.sync_io;
if (td->o.create_on_open && td->o.allow_create)
flags |= O_CREAT;
-skip_flags:
if (f->filetype != FIO_TYPE_FILE)
flags |= FIO_O_NOATIME;
if (!read_only)
flags |= O_RDWR;
+ if (td->o.verify_only) {
+ flags &= ~O_RDWR;
+ flags |= O_RDONLY;
+ }
+
if (f->filetype == FIO_TYPE_FILE && td->o.allow_create)
flags |= O_CREAT;
else
from_hash = file_lookup_open(f, flags);
} else if (td_read(td)) {
- if (f->filetype == FIO_TYPE_CHAR && !read_only)
+ if (td_ioengine_flagged(td, FIO_RO_NEEDS_RW_OPEN) && !read_only)
flags |= O_RDWR;
else
flags |= O_RDONLY;
} else if (f->filetype != FIO_TYPE_FILE)
continue;
- snprintf(buf, ARRAY_SIZE(buf), "%s", f->file_name);
+ snprintf(buf, FIO_ARRAY_SIZE(buf), "%s", f->file_name);
if (stat(buf, &sb) < 0) {
if (errno != ENOENT)
continue;
fm = calloc(1, sizeof(*fm));
- snprintf(fm->__base, ARRAY_SIZE(fm->__base), "%s", buf);
+ snprintf(fm->__base, FIO_ARRAY_SIZE(fm->__base), "%s", buf);
fm->base = basename(fm->__base);
fm->key = sb.st_dev;
flist_add(&fm->list, &list);
while (!done) {
buf_pos = strrchr(buf, FIO_OS_PATH_SEPARATOR);
if (!buf_pos) {
- done = true;
offset = 0;
break;
}
if (err)
goto err_out;
+ if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ err = zbd_init_files(td);
+ if (err)
+ goto err_out;
+ }
+ zbd_recalc_options_with_zone_granularity(td);
+
if (o->read_iolog_file)
goto done;
o->size = total_size;
if (o->size < td_min_bs(td)) {
- log_err("fio: blocksize too large for data set\n");
+ log_err("fio: blocksize is larger than data set range\n");
goto err_out;
}
temp_stall_ts = 0;
}
+ if (err)
+ goto err_out;
+
+ /*
+ * Prepopulate files with data. It might be expected to read some
+ * "real" data instead of zero'ed files (if no writes to file occurred
+ * prior to a read job). Engine has to provide a way to do that.
+ */
+ if (td->io_ops->prepopulate_file) {
+ temp_stall_ts = 1;
+
+ for_each_file(td, f, i) {
+ if (output_format & FIO_OUTPUT_NORMAL) {
+ log_info("%s: Prepopulating IO file (%s)\n",
+ o->name, f->file_name);
+ }
+
+ err = td->io_ops->prepopulate_file(td, f);
+ if (err)
+ break;
+
+ err = __file_invalidate_cache(td, f, f->file_offset,
+ f->io_size);
+
+ /*
+ * Shut up static checker
+ */
+ if (f->fd != -1)
+ close(f->fd);
+
+ f->fd = -1;
+ if (err)
+ break;
+ }
+ temp_stall_ts = 0;
+ }
+
if (err)
goto err_out;
}
done:
+ if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ err = zbd_setup_files(td);
+ if (err)
+ goto err_out;
+ }
+
if (o->create_only)
td->done = 1;
td_restore_runstate(td, old_state);
- if (td->o.zone_mode == ZONE_MODE_ZBD) {
- err = zbd_setup_files(td);
+ if (td->o.dp_type != FIO_DP_NONE) {
+ err = dp_init(td);
if (err)
goto err_out;
}
+
return 0;
err_offset:
nranges = (fsize + range_size - 1ULL) / range_size;
- seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
- if (!td->o.rand_repeatable)
- seed = td->rand_seeds[4];
+ seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number *
+ td->rand_seeds[FIO_RAND_BLOCK_OFF];
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
- zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
+ zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, td->o.random_center.u.f, seed);
else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
- pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+ pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, td->o.random_center.u.f, seed);
else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
- gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, seed);
+ gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, td->o.random_center.u.f, seed);
}
static bool init_rand_distribution(struct thread_data *td)
/*
* Check if the number of blocks exceeds the randomness capability of
- * the selected generator. Tausworthe is 32-bit, the others are fullly
+ * the selected generator. Tausworthe is 32-bit, the others are fully
* 64-bit capable.
*/
static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f,
{
if (fio_file_axmap(f))
axmap_free(f->io_axmap);
+ if (f->ruhs_info)
+ sfree(f->ruhs_info);
if (!fio_file_smalloc(f)) {
free(f->file_name);
free(f);
}
zbd_close_file(f);
+ fdp_free_ruhs_info(f);
fio_file_free(f);
}
if (!org->files)
return;
- td->files = malloc(org->files_index * sizeof(f));
+ td->files = calloc(org->files_index, sizeof(f));
if (td->o.file_lock_mode != FILE_LOCK_NONE)
td->file_locks = malloc(org->files_index);
+ assert(org->files_index >= org->o.nr_files);
for_each_file(org, f, i) {
struct fio_file *__f;
defined by \fIioengine\fR. If no \fIioengine\fR is given, list all
available ioengines.
.TP
-.BI \-\-showcmd \fR=\fPjobfile
-Convert \fIjobfile\fR to a set of command\-line options.
+.BI \-\-showcmd
+Convert given \fIjobfile\fRs to a set of command\-line options.
.TP
.BI \-\-readonly
Turn on safety read\-only checks, preventing writes and trims. The \fB\-\-readonly\fR
.PD
.RE
.P
+For Zone Block Device Mode:
+.RS
+.P
+.PD 0
+z means Zone
+.P
+.PD
+.RE
+.P
With `kb_base=1024' (the default), the unit prefixes are opposite
from those specified in the SI and IEC 80000-13 standards to provide
compatibility with old scripts. For example, 4k means 4096.
.PD
.RE
.P
+`z' suffix specifies that the value is measured in zones.
+Value is recalculated once block device's zone size becomes known.
+.P
If the option accepts an upper and lower range, use a colon ':' or
minus '\-' to separate such values. See \fIirange\fR parameter type.
If the lower value specified happens to be larger than the upper value
.SS "Time related parameters"
.TP
.BI runtime \fR=\fPtime
-Tell fio to terminate processing after the specified period of time. It
-can be quite hard to determine for how long a specified job will run, so
-this parameter is handy to cap the total runtime to a given time. When
-the unit is omitted, the value is interpreted in seconds.
+Limit runtime. The test will run until it completes the configured I/O
+workload or until it has run for this specified amount of time, whichever
+occurs first. It can be quite hard to determine for how long a specified
+job will run, so this parameter is handy to cap the total runtime to a
+given time. When the unit is omitted, the value is interpreted in
+seconds.
.TP
.BI time_based
If set, fio will run for the duration of the \fBruntime\fR specified
\fBgettimeofday\fR\|(2) call. The CPU set aside for doing these time
calls will be excluded from other uses. Fio will manually clear it from the
CPU mask of other jobs.
+.TP
+.BI job_start_clock_id \fR=\fPint
+The clock_id passed to the call to \fBclock_gettime\fR used to record job_start
+in the \fBjson\fR output format. Default is 0, or CLOCK_REALTIME.
.SS "Target file/device"
.TP
.BI directory \fR=\fPstr
explicit size is specified by \fBfilesize\fR.
.RS
.P
-Each colon in the wanted path must be escaped with a '\\'
+Each colon in the wanted path must be escaped with a '\e'
character. For instance, if the path is `/dev/dsk/foo@3,0:c' then you
would use `filename=/dev/dsk/foo@3,0\\:c' and if the path is
`F:\\filename' then you would use `filename=F\\:\\filename'.
.B $jobname
The name of the worker thread or process.
.TP
+.B $clientuid
+IP of the fio process when using client/server mode.
+.TP
.B $jobnum
The incremental number of the worker thread or process.
.TP
client connecting. To disable this behavior, set this option to 0.
.TP
.BI opendir \fR=\fPstr
-Recursively open any files below directory \fIstr\fR.
+Recursively open any files below directory \fIstr\fR. This accepts only a
+single directory and unlike related options, colons appearing in the path must
+not be escaped.
.TP
.BI lockfile \fR=\fPstr
Fio defaults to not locking any files before it does I/O to them. If a file
.TP
.BI ioscheduler \fR=\fPstr
Attempt to switch the device hosting the file to the specified I/O scheduler
-before running.
+before running. If the file is a pipe, a character device file or if device
+hosting the file could not be determined, this option is ignored.
.TP
.BI create_serialize \fR=\fPbool
If true, serialize the file creation for the jobs. This may be handy to
(e.g. network, splice). Default: false.
.TP
.BI unlink \fR=\fPbool
-Unlink the job files when done. Not the default, as repeated runs of that
+Unlink (delete) the job files when done. Not the default, as repeated runs of that
job would then waste time recreating the file set again and again. Default:
false.
.TP
.BI unlink_each_loop \fR=\fPbool
-Unlink job files after each iteration or loop. Default: false.
+Unlink (delete) job files after each iteration or loop. Default: false.
.TP
.BI zonemode \fR=\fPstr
Accepted values are:
Zoned block device mode. I/O happens sequentially in each zone, even if random
I/O has been selected. Random I/O happens across all zones instead of being
restricted to a single zone.
+Trim is handled using a zone reset operation. Trim only considers non-empty
+sequential write required and sequential write preferred zones.
.RE
.RE
.TP
block device, the zone capacity is obtained from the device information and this
option is ignored.
.TP
-.BI zoneskip \fR=\fPint
+.BI zoneskip \fR=\fPint[z]
For \fBzonemode\fR=strided, the number of bytes to skip after \fBzonesize\fR
bytes of data have been transferred.
so. Default: false.
.TP
.BI max_open_zones \fR=\fPint
-When running a random write test across an entire drive many more zones will be
-open than in a typical application workload. Hence this command line option
-that allows to limit the number of open zones. The number of open zones is
-defined as the number of zones to which write commands are issued by all
-threads/processes.
+When a zone of a zoned block device is partially written (i.e. not all sectors
+of the zone have been written), the zone is in one of three
+conditions: 'implicit open', 'explicit open' or 'closed'. Zoned block devices
+may have a limit called 'max_open_zones' (same name as the parameter) on the
+total number of zones that can simultaneously be in the 'implicit open'
+or 'explicit open' conditions. Zoned block devices may have another limit
+called 'max_active_zones', on the total number of zones that can simultaneously
+be in the three conditions. The \fBmax_open_zones\fR parameter limits
+the number of zones to which write commands are issued by all fio jobs, that is,
+limits the number of zones that will be in the conditions. When the device has
+the max_open_zones limit and does not have the max_active_zones limit, the
+\fBmax_open_zones\fR parameter limits the number of zones in the two open
+conditions up to the limit. In this case, fio includes zones in the two open
+conditions to the write target zones at fio start. When the device has both the
+max_open_zones and the max_active_zones limits, the \fBmax_open_zones\fR
+parameter limits the number of zones in the three conditions up to the limit.
+In this case, fio includes zones in the three conditions to the write target
+zones at fio start.
+
+This parameter is relevant only if the \fBzonemode=zbd\fR is used. The default
+value is always equal to the max_open_zones limit of the target zoned block
+device and a value higher than this limit cannot be specified by users unless
+the option \fBignore_zone_limits\fR is specified. When \fBignore_zone_limits\fR
+is specified or the target device does not have the max_open_zones limit,
+\fBmax_open_zones\fR can specify 0 to disable any limit on the number of zones
+that can be simultaneously written to by all jobs.
.TP
.BI job_max_open_zones \fR=\fPint
-Limit on the number of simultaneously opened zones per single thread/process.
+In the same manner as \fBmax_open_zones\fR, limit the number of open zones per
+fio job, that is, the number of zones that a single job can simultaneously write
+to. A value of zero indicates no limit. Default: zero.
+.TP
+.BI ignore_zone_limits \fR=\fPbool
+If this option is used, fio will ignore the maximum number of open zones limit
+of the zoned block device in use, thus allowing the option \fBmax_open_zones\fR
+value to be larger than the device reported limit. Default: false.
.TP
.BI zone_reset_threshold \fR=\fPfloat
-A number between zero and one that indicates the ratio of logical blocks with
-data to the total number of logical blocks in the test above which zones
-should be reset periodically.
+A number between zero and one that indicates the ratio of written bytes in the
+zones with write pointers in the IO range to the size of the IO range. When
+current ratio is above this ratio, zones are reset periodically as
+\fBzone_reset_frequency\fR specifies. If there are multiple jobs when using this
+option, the IO range for all write jobs has to be the same.
.TP
.BI zone_reset_frequency \fR=\fPfloat
A number between zero and one that indicates how often a zone reset should be
OpenBSD and ZFS on Solaris don't support direct I/O. On Windows the synchronous
ioengines don't support direct I/O. Default: false.
.TP
-.BI atomic \fR=\fPbool
-If value is true, attempt to use atomic direct I/O. Atomic writes are
-guaranteed to be stable once acknowledged by the operating system. Only
-Linux supports O_ATOMIC right now.
-.TP
.BI buffered \fR=\fPbool
If value is true, use buffered I/O. This is the opposite of the
\fBdirect\fR option. Defaults to true.
.TP
.B trimwrite
Sequential trim+write sequences. Blocks will be trimmed first,
-then the same blocks will be written to.
+then the same blocks will be written to. So if `io_size=64K' is specified,
+Fio will trim a total of 64K bytes and also write 64K bytes on the same
+trimmed blocks. This behaviour will be consistent with `number_ios' or
+other Fio options limiting the total bytes or number of I/O's.
+.TP
+.B randtrimwrite
+Like
+.B trimwrite ,
+but uses random offsets rather than sequential writes.
.RE
.P
Fio defaults to read if the option is not specified. For the mixed I/O
.P
\fBsequential\fR is only useful for random I/O, where fio would normally
generate a new random offset for every I/O. If you append e.g. 8 to randread,
-you would get a new random offset for every 8 I/Os. The result would be a
-seek for only every 8 I/Os, instead of for every I/O. Use `rw=randread:8'
-to specify that. As sequential I/O is already sequential, setting
-\fBsequential\fR for that would not result in any differences. \fBidentical\fR
-behaves in a similar fashion, except it sends the same offset 8 number of
-times before generating a new offset.
+i.e. `rw=randread:8' you would get a new random offset for every 8 I/Os. The
+result would be a sequence of 8 sequential offsets with a random starting
+point. However this behavior may change if a sequential I/O reaches end of the
+file. As sequential I/O is already sequential, setting \fBsequential\fR for
+that would not result in any difference. \fBidentical\fR behaves in a similar
+fashion, except it sends the same offset 8 times before generating a
+new offset.
+.P
+.P
+Example #1:
+.RS
+.P
+.PD 0
+rw=randread:8
+.P
+rw_sequencer=sequential
+.P
+bs=4k
+.PD
+.RE
+.P
+The generated sequence of offsets will look like this:
+4k, 8k, 12k, 16k, 20k, 24k, 28k, 32k, 92k, 96k, 100k, 104k, 108k, 112k, 116k,
+120k, 48k, 52k ...
+.P
+.P
+Example #2:
+.RS
+.P
+.PD 0
+rw=randread:8
+.P
+rw_sequencer=identical
+.P
+bs=4k
+.PD
+.RE
+.P
+The generated sequence of offsets will look like this:
+4k, 4k, 4k, 4k, 4k, 4k, 4k, 4k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 92k, 48k,
+48k, 48k ...
.RE
.TP
-.BI unified_rw_reporting \fR=\fPbool
+.BI unified_rw_reporting \fR=\fPstr
Fio normally reports statistics on a per data direction basis, meaning that
-reads, writes, and trims are accounted and reported separately. If this
-option is set fio sums the results and report them as "mixed" instead.
+reads, writes, and trims are accounted and reported separately. This option
+determines whether fio reports the results normally, summed together, or as
+both options.
+Accepted values are:
+.RS
+.TP
+.B none
+Normal statistics reporting.
+.TP
+.B mixed
+Statistics are summed per data direction and reported together.
+.TP
+.B both
+Statistics are reported normally, followed by the mixed statistics.
+.TP
+.B 0
+Backward-compatible alias for \fBnone\fR.
+.TP
+.B 1
+Backward-compatible alias for \fBmixed\fR.
+.TP
+.B 2
+Alias for \fBboth\fR.
+.RE
.TP
.BI randrepeat \fR=\fPbool
-Seed the random number generator used for random I/O patterns in a
-predictable way so the pattern is repeatable across runs. Default: true.
+Seed all random number generators in a predictable way so the pattern is
+repeatable across runs. Default: true.
.TP
.BI allrandrepeat \fR=\fPbool
-Seed all random number generators in a predictable way so results are
-repeatable across runs. Default: false.
+Alias for \fBrandrepeat\fR. Default: true.
.TP
.BI randseed \fR=\fPint
Seed the random number generators based on this seed value, to be able to
.TP
.B random
Advise using FADV_RANDOM.
+.TP
+.B noreuse
+Advise using FADV_NOREUSE. This may be a no-op on older Linux
+kernels. Since Linux 6.3, it provides a hint to the LRU algorithm.
+See the \fBposix_fadvise\fR\|(2) man page.
.RE
.RE
.TP
should be associated with them.
.RE
.TP
-.BI offset \fR=\fPint
+.BI offset \fR=\fPint[%|z]
Start I/O at the provided offset in the file, given as either a fixed size in
-bytes or a percentage. If a percentage is given, the generated offset will be
+bytes, zones or a percentage. If a percentage is given, the generated offset will be
aligned to the minimum \fBblocksize\fR or to the value of \fBoffset_align\fR if
provided. Data before the given offset will not be touched. This
effectively caps the file size at `real_size \- offset'. Can be combined with
\fBsize\fR to constrain the start and end range of the I/O workload.
A percentage can be specified by a number between 1 and 100 followed by '%',
-for example, `offset=20%' to specify 20%.
+for example, `offset=20%' to specify 20%. In ZBD mode, value can be set as
+number of zones using 'z'.
.TP
.BI offset_align \fR=\fPint
If set to non-zero value, the byte offset generated by a percentage \fBoffset\fR
is aligned upwards to this value. Defaults to 0 meaning that a percentage
offset is aligned to the minimum block size.
.TP
-.BI offset_increment \fR=\fPint
+.BI offset_increment \fR=\fPint[%|z]
If this is provided, then the real offset becomes `\fBoffset\fR + \fBoffset_increment\fR
* thread_number', where the thread number is a counter that starts at 0 and
is incremented for each sub-job (i.e. when \fBnumjobs\fR option is
intended to operate on a file in parallel disjoint segments, with even
spacing between the starting points. Percentages can be used for this option.
If a percentage is given, the generated offset will be aligned to the minimum
-\fBblocksize\fR or to the value of \fBoffset_align\fR if provided.
+\fBblocksize\fR or to the value of \fBoffset_align\fR if provided. In ZBD mode, value
+can be set as number of zones using 'z'.
.TP
.BI number_ios \fR=\fPint
Fio will normally perform I/Os until it has exhausted the size of the region
.TP
.BI fdatasync \fR=\fPint
Like \fBfsync\fR but uses \fBfdatasync\fR\|(2) to only sync data and
-not metadata blocks. In Windows, FreeBSD, DragonFlyBSD or OSX there is no
+not metadata blocks. In Windows, DragonFlyBSD or OSX there is no
\fBfdatasync\fR\|(2) so this falls back to using \fBfsync\fR\|(2).
Defaults to 0, which means fio does not periodically issue and wait for a
data-only sync to complete.
limit reads or writes to a certain rate. If that is the case, then the
distribution may be skewed. Default: 50.
.TP
-.BI random_distribution \fR=\fPstr:float[,str:float][,str:float]
+.BI random_distribution \fR=\fPstr:float[:float][,str:float][,str:float]
By default, fio will use a completely uniform random distribution when asked
to perform random I/O. Sometimes it is useful to skew the distribution in
specific ways, ensuring that some parts of the data is more hot than others.
map. For the \fBnormal\fR distribution, a normal (Gaussian) deviation is
supplied as a value between 0 and 100.
.P
+The second, optional float is allowed for \fBpareto\fR, \fBzipf\fR and \fBnormal\fR
+distributions. It allows one to set the base of the distribution in a non-default place, giving
+more control over the most probable outcome. This value is in the range [0-1], which maps linearly to
+the range of possible random values.
+Defaults are: random for \fBpareto\fR and \fBzipf\fR, and 0.5 for \fBnormal\fR.
+If you wanted to use \fBzipf\fR with a `theta` of 1.2 centered on 1/4 of allowed value range,
+you would use `random_distribution=zipf:1.2:0.25`.
+.P
For a \fBzoned\fR distribution, fio supports specifying percentages of I/O
access that should fall within what range of the file or device. For
example, given a criteria of:
.RS
.RS
.P
-bsrange=1k\-4k,2k\-8k
+bsrange=1k\-4k,2k\-8k or bsrange=1k:4k,2k:8k
.RE
.RE
.TP
this option will also enable \fBrefill_buffers\fR to prevent every buffer
being identical.
.TP
+.BI dedupe_mode \fR=\fPstr
+If \fBdedupe_percentage\fR is given, then this option controls how fio
+generates the dedupe buffers.
+.RS
+.RS
+.TP
+.B repeat
+.P
+.RS
+Generate dedupe buffers by repeating previous writes
+.RE
+.TP
+.B working_set
+.P
+.RS
+Generate dedupe buffers from working set
+.RE
+.RE
+.P
+\fBrepeat\fR is the default option for fio. Dedupe buffers are generated
+by repeating previous unique write.
+
+\fBworking_set\fR is a more realistic workload.
+With \fBworking_set\fR, \fBdedupe_working_set_percentage\fR should be provided.
+Given that, fio will use the initial unique write buffers as its working set.
+Upon deciding to dedupe, fio will randomly choose a buffer from the working set.
+Note that by using \fBworking_set\fR the dedupe percentage will converge
+to the desired value over time while \fBrepeat\fR maintains the desired percentage
+throughout the job.
+.RE
+.RE
+.TP
+.BI dedupe_working_set_percentage \fR=\fPint
+If \fBdedupe_mode\fR is set to \fBworking_set\fR, then this controls
+the percentage of size of the file or device used as the buffers
+fio will choose to generate the dedupe buffers from.
+.P
+.RS
+Note that \fBsize\fR needs to be explicitly provided and only 1 file
+per job is supported
+.RE
+.TP
+.BI dedupe_global \fR=\fPbool
+This controls whether the deduplication buffers will be shared amongst
+all jobs that have this option set. The buffers are spread evenly between
+participating jobs.
+.P
+.RS
+Note that \fBdedupe_mode\fR must be set to \fBworking_set\fR for this to work.
+Can be used in combination with compression
+.TP
.BI invalidate \fR=\fPbool
Invalidate the buffer/page cache parts of the files to be used prior to
starting I/O if the platform and file type support it. Defaults to true.
This will be ignored if \fBpre_read\fR is also specified for the
same job.
.TP
-.BI sync \fR=\fPbool
-Use synchronous I/O for buffered writes. For the majority of I/O engines,
-this means using O_SYNC. Default: false.
+.BI sync \fR=\fPstr
+Whether, and what type, of synchronous I/O to use for writes. The allowed
+values are:
+.RS
+.RS
+.TP
+.B none
+Do not use synchronous IO, the default.
+.TP
+.B 0
+Same as \fBnone\fR.
+.TP
+.B sync
+Use synchronous file IO. For the majority of I/O engines,
+this means using O_SYNC.
+.TP
+.B 1
+Same as \fBsync\fR.
+.TP
+.B dsync
+Use synchronous data IO. For the majority of I/O engines,
+this means using O_DSYNC.
+.PD
+.RE
+.RE
.TP
.BI iomem \fR=\fPstr "\fR,\fP mem" \fR=\fPstr
Fio can use various types of memory as the I/O unit buffer. The allowed
\fBmmaphuge\fR to work, the system must have free huge pages allocated. This
can normally be checked and set by reading/writing
`/proc/sys/vm/nr_hugepages' on a Linux system. Fio assumes a huge page
-is 4MiB in size. So to calculate the number of huge pages you need for a
-given job file, add up the I/O depth of all jobs (normally one unless
-\fBiodepth\fR is used) and multiply by the maximum bs set. Then divide
-that number by the huge page size. You can see the size of the huge pages in
-`/proc/meminfo'. If no huge pages are allocated by having a non-zero
+is 2 or 4MiB in size depending on the platform. So to calculate the number of
+huge pages you need for a given job file, add up the I/O depth of all jobs
+(normally one unless \fBiodepth\fR is used) and multiply by the maximum bs set.
+Then divide that number by the huge page size. You can see the size of the huge
+pages in `/proc/meminfo'. If no huge pages are allocated by having a non-zero
number in `nr_hugepages', using \fBmmaphuge\fR or \fBshmhuge\fR will fail. Also
see \fBhugepage\-size\fR.
.P
\fBbs\fR used.
.TP
.BI hugepage\-size \fR=\fPint
-Defines the size of a huge page. Must at least be equal to the system
-setting, see `/proc/meminfo'. Defaults to 4MiB. Should probably
-always be a multiple of megabytes, so using `hugepage\-size=Xm' is the
-preferred way to set this to avoid setting a non-pow-2 bad value.
+Defines the size of a huge page. Must at least be equal to the system setting,
+see `/proc/meminfo' and `/sys/kernel/mm/hugepages/'. Defaults to 2 or 4MiB
+depending on the platform. Should probably always be a multiple of megabytes,
+so using `hugepage\-size=Xm' is the preferred way to set this to avoid setting
+a non-pow-2 bad value.
.TP
.BI lockmem \fR=\fPint
Pin the specified amount of memory with \fBmlock\fR\|(2). Can be used to
simulate a smaller amount of memory. The amount specified is per worker.
.SS "I/O size"
.TP
-.BI size \fR=\fPint
+.BI size \fR=\fPint[%|z]
The total size of file I/O for each thread of this job. Fio will run until
-this many bytes has been transferred, unless runtime is limited by other options
-(such as \fBruntime\fR, for instance, or increased/decreased by \fBio_size\fR).
+this many bytes has been transferred, unless runtime is altered by other means
+such as (1) \fBruntime\fR, (2) \fBio_size\fR, (3) \fBnumber_ios\fR, (4)
+gaps/holes while doing I/O's such as `rw=read:16K', or (5) sequential I/O
+reaching end of the file which is possible when \fBpercentage_random\fR is
+less than 100.
Fio will divide this size between the available files determined by options
such as \fBnrfiles\fR, \fBfilename\fR, unless \fBfilesize\fR is
specified by the job. If the result of division happens to be 0, the size is
If this option is not specified, fio will use the full size of the given
files or devices. If the files do not exist, size must be given. It is also
possible to give size as a percentage between 1 and 100. If `size=20%' is
-given, fio will use 20% of the full size of the given files or devices.
-Can be combined with \fBoffset\fR to constrain the start and end range
-that I/O will be done within.
+given, fio will use 20% of the full size of the given files or devices. In ZBD mode,
+size can be given in units of number of zones using 'z'. Can be combined with \fBoffset\fR to
+constrain the start and end range that I/O will be done within.
.TP
-.BI io_size \fR=\fPint "\fR,\fB io_limit" \fR=\fPint
+.BI io_size \fR=\fPint[%|z] "\fR,\fB io_limit" \fR=\fPint[%|z]
Normally fio operates within the region set by \fBsize\fR, which means
that the \fBsize\fR option sets both the region and size of I/O to be
performed. Sometimes that is not what you want. With this option, it is
done. The opposite is also possible \-\- if \fBsize\fR is set to 20GiB,
and \fBio_size\fR is set to 40GiB, then fio will do 40GiB of I/O within
the 0..20GiB region. Value can be set as percentage: \fBio_size\fR=N%.
-In this case \fBio_size\fR multiplies \fBsize\fR= value.
+In this case \fBio_size\fR multiplies \fBsize\fR= value. In ZBD mode, value can
+also be set as number of zones using 'z'.
.TP
.BI filesize \fR=\fPirange(int)
Individual file sizes. May be a range, in which case fio will select sizes
-for files at random within the given range and limited to \fBsize\fR in
-total (if that is given). If not given, each created file is the same size.
-This option overrides \fBsize\fR in terms of file size, which means
-this value is used as a fixed size or possible range of each file.
+for files at random within the given range. If not given, each created file
+is the same size. This option overrides \fBsize\fR in terms of file size,
+i.e. \fBsize\fR becomes merely the default for \fBio_size\fR (and
+has no effect at all if \fBio_size\fR is set explicitly).
.TP
.BI file_append \fR=\fPbool
Perform I/O after the end of the file. Normally fio will operate within the
.TP
.BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
Sets size to something really large and waits for ENOSPC (no space left on
-device) as the terminating condition. Only makes sense with sequential
+device) or EDQUOT (disk quota exceeded)
+as the terminating condition. Only makes sense with sequential
write. For a read workload, the mount point will be filled first then I/O
-started on the result. This option doesn't make sense if operating on a raw
-device node, since the size of that is already known by the file system.
-Additionally, writing beyond end-of-device will not return ENOSPC there.
+started on the result.
.SS "I/O engine"
.TP
.BI ioengine \fR=\fPstr
-Defines how the job issues I/O to the file. The following types are defined:
-.RS
+fio supports 2 kinds of performance measurement: I/O and file/directory operation.
+
+I/O engines define how the job issues I/O to the file. The following types are defined:
.RS
.TP
.B sync
.B pvsync2
Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
.TP
+.B io_uring
+Fast Linux native asynchronous I/O. Supports async IO
+for both direct and buffered IO.
+This engine defines engine specific options.
+.TP
+.B io_uring_cmd
+Fast Linux native asynchronous I/O for passthrough commands.
+This engine defines engine specific options.
+.TP
.B libaio
Linux native asynchronous I/O. Note that Linux may only support
queued behavior with non-buffered I/O (set `direct=1' or
sg engine includes engine specific options.
.TP
.B libzbc
-Synchronous I/O engine for SMR hard-disks using the \fBlibzbc\fR
-library. The target can be either an sg character device or
-a block device file. This engine supports the zonemode=zbd zone
-operations.
+Read, write, trim and ZBC/ZAC operations to a zoned block device using
+\fBlibzbc\fR library. The target can be either an SG character device or
+a block device file.
.TP
.B null
Doesn't transfer any data, just pretends to. This is mainly used to
.TP
.B cpuio
Doesn't transfer any data, but burns CPU cycles according to the
-\fBcpuload\fR and \fBcpuchunks\fR options. Setting
-\fBcpuload\fR\=85 will cause that job to do nothing but burn 85%
-of the CPU. In case of SMP machines, use `numjobs=<nr_of_cpu>'
-to get desired CPU usage, as the cpuload only loads a
-single CPU at the desired rate. A job never finishes unless there is
-at least one non-cpuio job.
+\fBcpuload\fR, \fBcpuchunks\fR and \fBcpumode\fR options.
+A job never finishes unless there is at least one non-cpuio job.
+.RS
+.P
+.PD 0
+\fBcpuload\fR\=85 will cause that job to do nothing but burn 85% of the CPU.
+In case of SMP machines, use \fBnumjobs=<nr_of_cpu>\fR\ to get desired CPU usage,
+as the cpuload only loads a single CPU at the desired rate.
+
+.P
+\fBcpumode\fR\=qsort replaces the default noop instructions loop
+with a qsort algorithm to consume more energy.
+
+.P
+.RE
.TP
.B rdma
The RDMA I/O engine supports both RDMA memory semantics
before overwriting. The \fBtrimwrite\fR mode works well for this
constraint.
.TP
-.B pmemblk
-Read and write using filesystem DAX to a file on a filesystem
-mounted with DAX on a persistent memory device through the PMDK
-libpmemblk library.
-.TP
.B dev\-dax
Read and write using device DAX to a persistent memory device (e.g.,
/dev/dax0.0) through the PMDK libpmem library.
absolute or relative. See `engines/skeleton_external.c' in the fio source for
details of writing an external I/O engine.
.TP
-.B filecreate
-Simply create the files and do no I/O to them. You still need to set
-\fBfilesize\fR so that all the accounting still occurs, but no actual I/O will be
-done other than creating the file.
-.TP
-.B filestat
-Simply do stat() and do no I/O to the file. You need to set 'filesize'
-and 'nrfiles', so that files will be created.
-This engine is to measure file lookup and meta data access.
-.TP
.B libpmem
Read and write using mmap I/O to a file on a filesystem
mounted with DAX on a persistent memory device through the PMDK
.TP
.B nbd
Synchronous read and write a Network Block Device (NBD).
+.TP
+.B libcufile
+I/O engine supporting libcufile synchronous access to nvidia-fs and a
+GPUDirect Storage-supported filesystem. This engine performs
+I/O without transferring buffers between user-space and the kernel,
+unless \fBverify\fR is set or \fBcuda_io\fR is \fBposix\fR. \fBiomem\fR must
+not be \fBcudamalloc\fR. This ioengine defines engine specific options.
+.TP
+.B dfs
+I/O engine supporting asynchronous read and write operations to the DAOS File
+System (DFS) via libdfs.
+.TP
+.B nfs
+I/O engine supporting asynchronous read and write operations to
+NFS filesystems from userspace via libnfs. This is useful for
+achieving higher concurrency and thus throughput than is possible
+via kernel NFS.
+.TP
+.B exec
+Execute 3rd party tools. Could be used to perform monitoring during a job's runtime.
+.TP
+.B xnvme
+I/O engine using the xNVMe C API, for NVMe devices. The xnvme engine provides
+flexibility to access GNU/Linux Kernel NVMe driver via libaio, IOCTLs, io_uring,
+the SPDK NVMe driver, or your own custom NVMe driver. The xnvme engine includes
+engine specific options. (See \fIhttps://xnvme.io/\fR).
+.TP
+.B libblkio
+Use the libblkio library (\fIhttps://gitlab.com/libblkio/libblkio\fR). The
+specific driver to use must be set using \fBlibblkio_driver\fR. If
+\fBmem\fR/\fBiomem\fR is not specified, memory allocation is delegated to
+libblkio (and so is guaranteed to work with the selected driver). One libblkio
+instance is used per process, so all jobs setting option \fBthread\fR will share
+a single instance (with one queue per thread) and must specify compatible
+options. Note that some drivers don't allow several instances to access the same
+device or file simultaneously, but allow it for threads.
+.TP
+.RE
+.P
+File/directory operation engines define how the job operates on files or directories.
+The following types are defined:
+.RS
+.TP
+.B filecreate
+Simply create the files and do no I/O to them. You still need to
+set \fBfilesize\fP so that all the accounting still occurs, but no
+actual I/O will be done other than creating the file.
+Example job file: filecreate-ioengine.fio.
+.TP
+.B filestat
+Simply do stat() and do no I/O to the file. You need to set \fBfilesize\fP
+and \fBnrfiles\fP, so that files will be created.
+This engine is to measure file lookup and meta data access.
+Example job file: filestat-ioengine.fio.
+.TP
+.B filedelete
+Simply delete the files by unlink() and do no I/O to them. You need to set \fBfilesize\fP
+and \fBnrfiles\fP, so that the files will be created.
+This engine is to measure file delete.
+Example job file: filedelete-ioengine.fio.
+.TP
+.B dircreate
+Simply create the directories and do no I/O to them. You still need to
+set \fBfilesize\fP so that all the accounting still occurs, but no
+actual I/O will be done other than creating the directories.
+Example job file: dircreate-ioengine.fio.
+.TP
+.B dirstat
+Simply do stat() and do no I/O to the directories. You need to set \fBfilesize\fP
+and \fBnrfiles\fP, so that directories will be created.
+This engine is to measure directory lookup and meta data access.
+Example job file: dirstat-ioengine.fio.
+.TP
+.B dirdelete
+Simply delete the directories by rmdir() and do no I/O to them. You need to set \fBfilesize\fP
+and \fBnrfiles\fP, so that the directories will be created.
+This engine is to measure directory delete.
+.TP
+.RE
+.P
+For file and directory operation engines, there is no I/O throughput, so the statistics \
+data in the report have different meanings. The meaningful output indexes are: \fBiops\fP and \fBclat\fP. \
+\fBbw\fP is meaningless. Refer to section: "Interpreting the output" for more details.
+.RE
+.P
.SS "I/O engine specific parameters"
In addition, there are some parameters which are only valid when a specific
\fBioengine\fR is in use. These are used identically to normal parameters,
with the caveat that when used on the command line, they must come after the
\fBioengine\fR that defines them is selected.
.TP
-.BI (io_uring, libaio)cmdprio_percentage \fR=\fPint
-Set the percentage of I/O that will be issued with higher priority by setting
-the priority bit. Non-read I/O is likely unaffected by ``cmdprio_percentage``.
-This option cannot be used with the `prio` or `prioclass` options. For this
-option to set the priority bit properly, NCQ priority must be supported and
-enabled and `direct=1' option must be used. fio must also be run as the root
-user.
+.BI (io_uring,libaio)cmdprio_percentage \fR=\fPint[,int]
+Set the percentage of I/O that will be issued with the highest priority.
+Default: 0. A single value applies to reads and writes. Comma-separated
+values may be specified for reads and writes. For this option to be effective,
+NCQ priority must be supported and enabled, and `direct=1' option must be
+used. fio must also be run as the root user. Unlike slat/clat/lat stats, which
+can be tracked and reported independently, per priority stats only track and
+report a single type of latency. By default, completion latency (clat) will be
+reported, if \fBlat_percentiles\fR is set, total latency (lat) will be reported.
+.TP
+.BI (io_uring,libaio)cmdprio_class \fR=\fPint[,int]
+Set the I/O priority class to use for I/Os that must be issued with a
+priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
+If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR
+is set, this defaults to the highest priority class. A single value applies
+to reads and writes. Comma-separated values may be specified for reads and
+writes. See man \fBionice\fR\|(1). See also the \fBprioclass\fR option.
+.TP
+.BI (io_uring,libaio)cmdprio_hint \fR=\fPint[,int]
+Set the I/O priority hint to use for I/Os that must be issued with a
+priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
+If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR
+is set, this defaults to 0 (no hint). A single value applies to reads and
+writes. Comma-separated values may be specified for reads and writes.
+See also the \fBpriohint\fR option.
+.TP
+.BI (io_uring,libaio)cmdprio \fR=\fPint[,int]
+Set the I/O priority value to use for I/Os that must be issued with a
+priority when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR is set.
+If not specified when \fBcmdprio_percentage\fR or \fBcmdprio_bssplit\fR
+is set, this defaults to 0. Linux limits us to a positive value between
+0 and 7, with 0 being the highest. A single value applies to reads and writes.
+Comma-separated values may be specified for reads and writes. See man
+\fBionice\fR\|(1). Refer to an appropriate manpage for other operating systems
+since the meaning of priority may differ. See also the \fBprio\fR option.
.TP
-.BI (io_uring)fixedbufs
+.BI (io_uring,libaio)cmdprio_bssplit \fR=\fPstr[,str]
+To get a finer control over I/O priority, this option allows specifying
+the percentage of IOs that must have a priority set depending on the block
+size of the IO. This option is useful only when used together with the option
+\fBbssplit\fR, that is, multiple different block sizes are used for reads and
+writes.
+.RS
+.P
+The first accepted format for this option is the same as the format of the
+\fBbssplit\fR option:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage:blocksize/percentage
+.RE
+.P
+In this case, each entry will use the priority class, priority level and
+priority hint defined by the options \fBcmdprio_class\fR, \fBcmdprio\fR
+and \fBcmdprio_hint\fR respectively.
+.P
+The second accepted format for this option is:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage/class/level:blocksize/percentage/class/level
+.RE
+.P
+In this case, the priority class and priority level are defined inside each
+entry. In comparison with the first accepted format, the second accepted format
+does not restrict all entries to have the same priority class and priority
+level.
+.P
+The third accepted format for this option is:
+.RS
+.P
+cmdprio_bssplit=blocksize/percentage/class/level/hint:...
+.RE
+.P
+This is an extension of the second accepted format that allows one to also
+specify a priority hint.
+.P
+For all formats, only the read and write data directions are supported, values
+for trim IOs are ignored. This option is mutually exclusive with the
+\fBcmdprio_percentage\fR option.
+.RE
+.TP
+.BI (io_uring,io_uring_cmd)fixedbufs
If fio is asked to do direct IO, then Linux will map pages for each IO call, and
release them when IO is done. If this option is set, the pages are pre-mapped
before IO is started. This eliminates the need to map and release for each IO.
This is more efficient, and reduces the IO latency as well.
.TP
-.BI (io_uring)hipri
+.BI (io_uring,io_uring_cmd)nonvectored \fR=\fPint
+With this option, fio will use non-vectored read/write commands, where address
+must contain the address directly. Default is -1.
+.TP
+.BI (io_uring,io_uring_cmd)force_async
+Normal operation for io_uring is to try and issue an sqe as non-blocking first,
+and if that fails, execute it in an async manner. With this option set to N,
+for every Nth request fio will ask the sqe to be issued in an async manner. Default
+is 0.
+.TP
+.BI (io_uring,io_uring_cmd,xnvme)hipri
If this option is set, fio will attempt to use polled IO completions. Normal IO
completions generate interrupts to signal the completion of IO, polled
completions do not. Hence they are require active reaping by the application.
The benefits are more efficient IO for high IOPS scenarios, and lower latencies
for low queue depth IO.
.TP
-.BI (io_uring)registerfiles
+.BI (io_uring,io_uring_cmd)registerfiles
With this option, fio registers the set of files being used with the kernel.
This avoids the overhead of managing file counts in the kernel, making the
submission and completion part more lightweight. Required for the below
sqthread_poll option.
.TP
-.BI (io_uring)sqthread_poll
+.BI (io_uring,io_uring_cmd,xnvme)sqthread_poll
Normally fio will submit IO by issuing a system call to notify the kernel of
available items in the SQ ring. If this option is set, the act of submitting IO
will be done by a polling thread in the kernel. This frees up cycles for fio, at
-the cost of using more CPU in the system.
+the cost of using more CPU in the system. As submission is just the time it
+takes to fill in the sqe entries and any syscall required to wake up the idle
+kernel thread, fio will not report submission latencies.
.TP
-.BI (io_uring)sqthread_poll_cpu
+.BI (io_uring,io_uring_cmd)sqthread_poll_cpu \fR=\fPint
When `sqthread_poll` is set, this option provides a way to define which CPU
should be used for the polling thread.
.TP
+.BI (io_uring_cmd)cmd_type \fR=\fPstr
+Specifies the type of uring passthrough command to be used. Supported
+value is nvme. Default is nvme.
+.TP
.BI (libaio)userspace_reap
Normally, with the libaio engine in use, fio will use the
\fBio_getevents\fR\|(3) system call to reap newly returned events. With
When hipri is set this determines the probability of a pvsync2 I/O being high
priority. The default is 100%.
.TP
-.BI (pvsync2,libaio,io_uring)nowait
+.BI (pvsync2,libaio,io_uring,io_uring_cmd)nowait \fR=\fPbool
By default if a request cannot be executed immediately (e.g. resource starvation,
waiting on locks) it is queued and the initiating process will be blocked until
the required resource becomes free.
For direct I/O, requests will only succeed if cache invalidation isn't required,
file blocks are fully allocated and the disk request could be issued immediately.
.TP
+.BI (io_uring_cmd,xnvme)fdp \fR=\fPbool
+Enable Flexible Data Placement mode for write commands.
+.TP
+.BI (io_uring_cmd,xnvme)dataplacement \fR=\fPstr
+Specifies the data placement directive type to use for write commands. The
+following types are supported:
+.RS
+.RS
+.TP
+.B none
+Do not use a data placement directive. This is the default.
+.TP
+.B fdp
+Use Flexible Data placement directives for write commands. This is equivalent
+to specifying \fBfdp\fR=1.
+.TP
+.B streams
+Use Streams directives for write commands.
+.TP
+.RE
+.RE
+.TP
+.BI (io_uring_cmd,xnvme)plid_select=str, fdp_pli_select \fR=\fPstr
+Defines how fio decides which placement ID to use next. The following types
+are defined:
+.RS
+.RS
+.TP
+.B random
+Choose a placement ID at random (uniform).
+.TP
+.B roundrobin
+Round robin over available placement IDs. This is the default.
+.RE
+.P
+The available placement ID (indices) are defined by the \fBplids\fR option.
+.RE
+.TP
+.BI (io_uring_cmd,xnvme)plids=str, fdp_pli \fR=\fPstr
+Select which Placement IDs (streams) or Placement ID Indices (FDP) this job is
+allowed to use for writes. For FDP by default, the job will cycle through all
+available Placement IDs, so use this to isolate these identifiers to specific
+jobs. If you want fio to use placement identifiers only at indices 0, 2 and 5,
+you would set `plids=0,2,5`. For streams this should be a
+comma-separated list of Stream IDs.
+.TP
+.BI (io_uring_cmd,xnvme)md_per_io_size \fR=\fPint
+Size in bytes for separate metadata buffer per IO. Default: 0.
+.TP
+.BI (io_uring_cmd,xnvme)pi_act \fR=\fPint
+Action to take when nvme namespace is formatted with protection information.
+If this is set to 1 and namespace is formatted with metadata size equal to
+protection information size, fio won't use separate metadata buffer or extended
+logical block. If this is set to 1 and namespace is formatted with metadata
+size greater than protection information size, fio will not generate or verify
+the protection information portion of metadata for write or read case
+respectively. If this is set to 0, fio generates protection information for
+write case and verifies for read case. Default: 1.
+
+For 16 bit CRC generation fio will use isa-l if available otherwise it will
+use the default slower generator.
+(see: https://github.com/intel/isa-l)
+.TP
+.BI (io_uring_cmd,xnvme)pi_chk \fR=\fPstr[,str][,str]
+Controls the protection information check. This can take one or more of these
+values. Default: none.
+.RS
+.RS
+.TP
+.B GUARD
+Enables protection information checking of guard field.
+.TP
+.B REFTAG
+Enables protection information checking of logical block reference tag field.
+.TP
+.B APPTAG
+Enables protection information checking of application tag field.
+.RE
+.RE
+.TP
+.BI (io_uring_cmd,xnvme)apptag \fR=\fPint
+Specifies logical block application tag value, if namespace is formatted to use
+end to end protection information. Default: 0x1234.
+.TP
+.BI (io_uring_cmd,xnvme)apptag_mask \fR=\fPint
+Specifies logical block application tag mask value, if namespace is formatted
+to use end to end protection information. Default: 0xffff.
+.TP
+.BI (io_uring_cmd)num_range \fR=\fPint
+For trim command this will be the number of ranges to trim per I/O request.
+The number of logical blocks per range is determined by the \fBbs\fR option
+which should be a multiple of logical block size. This cannot be used with
+read or write. Note that setting this option > 1, \fBlog_offset\fR will not be
+able to log all the offsets. Default: 1.
+.TP
.BI (cpuio)cpuload \fR=\fPint
Attempt to use the specified percentage of CPU cycles. This is a mandatory
option when using cpuio I/O engine.
.BI (cpuio)cpuchunks \fR=\fPint
Split the load into cycles of the given time. In microseconds.
.TP
+.BI (cpuio)cpumode \fR=\fPstr
+Specify how to stress the CPU. It can take these two values:
+.RS
+.RS
+.TP
+.B noop
+This is the default and directs the CPU to execute noop instructions.
+.TP
+.B qsort
+Replace the default noop instructions with a qsort algorithm to consume more energy.
+.RE
+.RE
+.TP
.BI (cpuio)exit_on_io_done \fR=\fPbool
Detect when I/O threads are done, then exit.
.TP
.BI (libhdfs)namenode \fR=\fPstr
The hostname or IP address of a HDFS cluster namenode to contact.
.TP
-.BI (libhdfs)port
+.BI (libhdfs)port \fR=\fPint
The listening port of the HFDS cluster namenode.
.TP
-.BI (netsplice,net)port
+.BI (netsplice,net)port \fR=\fPint
The TCP or UDP port to bind to or connect to. If this is used with
\fBnumjobs\fR to spawn multiple instances of the same job type, then
this will be the starting port number since fio will use a range of
ports.
.TP
-.BI (rdma)port
+.BI (rdma,librpma_*)port \fR=\fPint
The port to use for RDMA-CM communication. This should be the same
value on the client and the server side.
.TP
-.BI (netsplice,net, rdma)hostname \fR=\fPstr
+.BI (netsplice,net,rdma)hostname \fR=\fPstr
The hostname or IP address to use for TCP, UDP or RDMA-CM based I/O.
If the job is a TCP listener or UDP reader, the hostname is not used
and must be omitted unless it is a valid UDP multicast address.
.TP
+.BI (librpma_*)serverip \fR=\fPstr
+The IP address to be used for RDMA-CM based I/O.
+.TP
+.BI (librpma_*_server)direct_write_to_pmem \fR=\fPbool
+Set to 1 only when Direct Write to PMem from the remote host is possible. Otherwise, set to 0.
+.TP
+.BI (librpma_*_server)busy_wait_polling \fR=\fPbool
+Set to 0 to wait for completion instead of busy-wait polling completion.
+Default: 1.
+.TP
.BI (netsplice,net)interface \fR=\fPstr
The IP address of the network interface used to send or receive UDP
multicast.
.TP
.B unix
UNIX domain socket.
+.TP
+.B vsock
+VSOCK protocol.
.RE
.P
-When the protocol is TCP or UDP, the port must also be given, as well as the
-hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+When the protocol is TCP, UDP or VSOCK, the port must also be given, as well as the
+hostname if the job is a TCP or VSOCK listener or UDP reader. For unix sockets, the
normal \fBfilename\fR option should be used and the port is invalid.
+When the protocol is VSOCK, the \fBhostname\fR is the CID of the remote VM.
+
.RE
.TP
.BI (netsplice,net)listen
the full *type.id* string. If no type. prefix is given, fio will add 'client.'
by default.
.TP
+.BI (rados)conf \fR=\fPstr
+Specifies the configuration path of ceph cluster, so conf file does not
+have to be /etc/ceph/ceph.conf.
+.TP
.BI (rbd,rados)busy_poll \fR=\fPbool
Poll store instead of waiting for completion. Usually this provides better
throughput at cost of higher(up to 100%) CPU utilization.
.TP
+.BI (rados)touch_objects \fR=\fPbool
+During initialization, touch (create if they do not exist) all objects (files).
+Touching all objects affects ceph caches and likely impacts test results.
+Enabled by default.
+.TP
.BI (http)http_host \fR=\fPstr
Hostname to connect to. For S3, this could be the bucket name. Default
is \fBlocalhost\fR
.BI (http)http_s3_keyid \fR=\fPstr
The S3 key/access id.
.TP
+.BI (http)http_s3_sse_customer_key \fR=\fPstr
+The encryption customer key in SSE server side.
+.TP
+.BI (http)http_s3_sse_customer_algorithm \fR=\fPstr
+The encryption customer algorithm in SSE server side. Default is \fBAES256\fR
+.TP
+.BI (http)http_s3_storage_class \fR=\fPstr
+Which storage class to access. User-customizable settings. Default is \fBSTANDARD\fR
+.TP
.BI (http)http_swift_auth_token \fR=\fPstr
The Swift auth token. See the example configuration file on how to
retrieve this.
Specify stat system call type to measure lookup/getattr performance.
Default is \fBstat\fR for \fBstat\fR\|(2).
.TP
+.BI (sg)hipri
+If this option is set, fio will attempt to use polled IO completions. This
+will have a similar effect as (io_uring)hipri. Only SCSI READ and WRITE
+commands will have the SGV4_FLAG_HIPRI set (not UNMAP (trim) nor VERIFY).
+Older versions of the Linux sg driver that do not support hipri will simply
+ignore this flag and do normal IO. The Linux SCSI Low Level Driver (LLD)
+that "owns" the device also needs to support hipri (also known as iopoll
+and mq_poll). The MegaRAID driver is an example of a SCSI LLD.
+Default: clear (0) which does normal (interrupt-based) IO.
+.TP
.BI (sg)readfua \fR=\fPbool
With readfua option set to 1, read operations include the force
unit access (fua) flag. Default: 0.
unit access (fua) flag. Default: 0.
.TP
.BI (sg)sg_write_mode \fR=\fPstr
-Specify the type of write commands to issue. This option can take three
+Specify the type of write commands to issue. This option can take multiple
values:
.RS
.RS
.B write (default)
Write opcodes are issued as usual
.TP
+.B write_and_verify
+Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 00b. This directs the
+device to carry out a medium verification with no data comparison for the data
+that was written. The writefua option is ignored with this selection.
+.TP
.B verify
-Issue WRITE AND VERIFY commands. The BYTCHK bit is set to 0. This
-directs the device to carry out a medium verification with no data
-comparison. The writefua option is ignored with this selection.
+This option is deprecated. Use write_and_verify instead.
.TP
-.B same
+.B write_same
Issue WRITE SAME commands. This transfers a single block to the device
and writes this same block of data to a contiguous sequence of LBAs
beginning at the specified offset. fio's block size parameter
generate 8k of data for each command butonly the first 512 bytes will
be used and transferred to the device. The writefua option is ignored
with this selection.
+.TP
+.B same
+This option is deprecated. Use write_same instead.
+.TP
+.B write_same_ndob
+Issue WRITE SAME(16) commands as above but with the No Data Output
+Buffer (NDOB) bit set. No data will be transferred to the device with
+this bit set. Data written will be a pre-determined pattern such as
+all zeroes.
+.TP
+.B write_stream
+Issue WRITE STREAM(16) commands. Use the stream_id option to specify
+the stream identifier.
+.TP
+.B verify_bytchk_00
+Issue VERIFY commands with BYTCHK set to 00. This directs the device to carry
+out a medium verification with no data comparison.
+.TP
+.B verify_bytchk_01
+Issue VERIFY commands with BYTCHK set to 01. This directs the device to
+compare the data on the device with the data transferred to the device.
+.TP
+.B verify_bytchk_11
+Issue VERIFY commands with BYTCHK set to 11. This transfers a single block to
+the device and compares the contents of this block with the data on the device
+beginning at the specified offset. fio's block size parameter specifies the
+total amount of data compared with this command. However, only one block
+(sector) worth of data is transferred to the device. This is similar to the
+WRITE SAME command except that data is compared instead of written.
.RE
.RE
.TP
+.BI (sg)stream_id \fR=\fPint
+Set the stream identifier for WRITE STREAM commands. If this is set to 0 (which is not
+a valid stream identifier) fio will open a stream and then close it when done. Default
+is 0.
+.TP
.BI (nbd)uri \fR=\fPstr
Specify the NBD URI of the server to test.
The string is a standard NBD URI (see
\fInbd+unix:///?socket=/tmp/socket\fR
.TP
\fInbds://tlshost/exportname\fR
-
+.RE
+.RE
+.TP
+.BI (libcufile)gpu_dev_ids\fR=\fPstr
+Specify the GPU IDs to use with CUDA. This is a colon-separated list of int.
+GPUs are assigned to workers roundrobin. Default is 0.
+.TP
+.BI (libcufile)cuda_io\fR=\fPstr
+Specify the type of I/O to use with CUDA. This option
+takes the following values:
+.RS
+.RS
+.TP
+.B cufile (default)
+Use libcufile and nvidia-fs. This option performs I/O directly
+between a GPUDirect Storage filesystem and GPU buffers,
+avoiding use of a bounce buffer. If \fBverify\fR is set,
+cudaMemcpy is used to copy verification data between RAM and GPU(s).
+Verification data is copied from RAM to GPU before a write
+and from GPU to RAM after a read.
+\fBdirect\fR must be 1.
+.TP
+.BI posix
+Use POSIX to perform I/O with a RAM buffer, and use
+cudaMemcpy to transfer data between RAM and the GPU(s).
+Data is copied from GPU to RAM before a write and copied
+from RAM to GPU after a read. \fBverify\fR does not affect
+the use of cudaMemcpy.
+.RE
+.RE
+.TP
+.BI (dfs)pool
+Specify the label or UUID of the DAOS pool to connect to.
+.TP
+.BI (dfs)cont
+Specify the label or UUID of the DAOS container to open.
+.TP
+.BI (dfs)chunk_size
+Specify a different chunk size (in bytes) for the dfs file.
+Use DAOS container's chunk size by default.
+.TP
+.BI (dfs)object_class
+Specify a different object class for the dfs file.
+Use DAOS container's object class by default.
+.TP
+.BI (nfs)nfs_url
+URL in libnfs format, eg nfs://<server|ipv4|ipv6>/path[?arg=val[&arg=val]*]
+Refer to the libnfs README for more details.
+.TP
+.BI (exec)program\fR=\fPstr
+Specify the program to execute.
+Note the program will receive a SIGTERM when the job is reaching the time limit.
+A SIGKILL is sent once the job is over. The delay between the two signals is defined by \fBgrace_time\fR option.
+.TP
+.BI (exec)arguments\fR=\fPstr
+Specify arguments to pass to program.
+Some special variables can be expanded to pass fio's job details to the program :
+.RS
+.RS
+.TP
+.B %r
+replaced by the duration of the job in seconds
+.TP
+.BI %n
+replaced by the name of the job
+.RE
+.RE
+.TP
+.BI (exec)grace_time\fR=\fPint
+Defines the time between the SIGTERM and SIGKILL signals. Default is 1 second.
+.TP
+.BI (exec)std_redirect\fR=\fPbool
+If set, stdout and stderr streams are redirected to files named from the job name. Default is true.
+.TP
+.BI (xnvme)xnvme_async\fR=\fPstr
+Select the xnvme async command interface. This can take these values.
+.RS
+.RS
+.TP
+.B emu
+This is the default and is used to emulate asynchronous I/O by using a single thread to
+create a queue pair on top of a synchronous I/O interface using the NVMe driver
+IOCTL.
+.TP
+.BI thrpool
+Emulate an asynchronous I/O interface with a pool of userspace threads on top
+of a synchronous I/O interface using the NVMe driver IOCTL. By default four
+threads are used.
+.TP
+.BI io_uring
+Linux native asynchronous I/O interface which supports both direct and buffered
+I/O.
+.TP
+.BI libaio
+Use Linux aio for Asynchronous I/O
+.TP
+.BI posix
+Use the posix asynchronous I/O interface to perform one or more I/O operations
+asynchronously.
+.TP
+.BI vfio
+Use the user-space VFIO-based backend, implemented using libvfn instead of
+SPDK.
+.TP
+.BI nil
+Do not transfer any data; just pretend to. This is mainly used for
+introspective performance evaluation.
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_sync\fR=\fPstr
+Select the xnvme synchronous command interface. This can take these values.
+.RS
+.RS
+.TP
+.B nvme
+This is default and uses Linux NVMe Driver ioctl() for synchronous I/O.
+.TP
+.BI psync
+This supports regular as well as vectored pread() and pwrite() commands.
+.TP
+.BI block
+This is the same as psync except that it also supports zone management
+commands using Linux block layer IOCTLs.
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_admin\fR=\fPstr
+Select the xnvme admin command interface. This can take these values.
+.RS
+.RS
+.TP
+.B nvme
+This is default and uses Linux NVMe Driver ioctl() for admin commands.
+.TP
+.BI block
+Use Linux Block Layer ioctl() and sysfs for admin commands.
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_dev_nsid\fR=\fPint
+xnvme namespace identifier for userspace NVMe driver SPDK or vfio.
+.TP
+.BI (xnvme)xnvme_dev_subnqn\fR=\fPstr
+Sets the subsystem NQN for fabrics. This is for xNVMe to utilize a fabrics
+target with multiple systems.
+.TP
+.BI (xnvme)xnvme_mem\fR=\fPstr
+Select the xnvme memory backend. This can take these values.
+.RS
+.RS
+.TP
+.B posix
+This is the default posix memory backend for linux NVMe driver.
+.TP
+.BI hugepage
+Use hugepages, instead of existing posix memory backend. The memory backend
+uses hugetlbfs. This requires users to allocate hugepages, mount hugetlbfs and
+set an environment variable for XNVME_HUGETLB_PATH.
+.TP
+.BI spdk
+Uses SPDK's memory allocator.
+.TP
+.BI vfio
+Uses libvfn's memory allocator. This also specifies the use of libvfn backend
+instead of SPDK.
+.RE
+.RE
+.TP
+.BI (xnvme)xnvme_iovec
+If this option is set, xnvme will use vectored read/write commands.
+.TP
+.BI (libblkio)libblkio_driver \fR=\fPstr
+The libblkio driver to use. Different drivers access devices through different
+underlying interfaces. Available drivers depend on the libblkio version in use
+and are listed at \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR
+.TP
+.BI (libblkio)libblkio_path \fR=\fPstr
+Sets the value of the driver-specific "path" property before connecting the
+libblkio instance, which identifies the target device or file on which to
+perform I/O. Its exact semantics are driver-dependent and not all drivers may
+support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR
+.TP
+.BI (libblkio)libblkio_pre_connect_props \fR=\fPstr
+A colon-separated list of additional libblkio properties to be set after
+creating but before connecting the libblkio instance. Each property must have
+the format \fB<name>=<value>\fR. Colons can be escaped as \fB\\:\fR. These are
+set after the engine sets any other properties, so those can be overridden.
+Available properties depend on the libblkio version in use and are listed at
+\fIhttps://libblkio.gitlab.io/libblkio/blkio.html#properties\fR
+.TP
+.BI (libblkio)libblkio_num_entries \fR=\fPint
+Sets the value of the driver-specific "num-entries" property before starting the
+libblkio instance. Its exact semantics are driver-dependent and not all drivers
+may support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR
+.TP
+.BI (libblkio)libblkio_queue_size \fR=\fPint
+Sets the value of the driver-specific "queue-size" property before starting the
+libblkio instance. Its exact semantics are driver-dependent and not all drivers
+may support it; see \fIhttps://libblkio.gitlab.io/libblkio/blkio.html#drivers\fR
+.TP
+.BI (libblkio)libblkio_pre_start_props \fR=\fPstr
+A colon-separated list of additional libblkio properties to be set after
+connecting but before starting the libblkio instance. Each property must have
+the format \fB<name>=<value>\fR. Colons can be escaped as \fB\\:\fR. These are
+set after the engine sets any other properties, so those can be overridden.
+Available properties depend on the libblkio version in use and are listed at
+\fIhttps://libblkio.gitlab.io/libblkio/blkio.html#properties\fR
+.TP
+.BI (libblkio)hipri
+Use poll queues. This is incompatible with \fBlibblkio_wait_mode=eventfd\fR and
+\fBlibblkio_force_enable_completion_eventfd\fR.
+.TP
+.BI (libblkio)libblkio_vectored
+Submit vectored read and write requests.
+.TP
+.BI (libblkio)libblkio_write_zeroes_on_trim
+Submit trims as "write zeroes" requests instead of discard requests.
+.TP
+.BI (libblkio)libblkio_wait_mode \fR=\fPstr
+How to wait for completions:
+.RS
+.RS
+.TP
+.B block \fR(default)
+Use a blocking call to \fBblkioq_do_io()\fR.
+.TP
+.B eventfd
+Use a blocking call to \fBread()\fR on the completion eventfd.
+.TP
+.B loop
+Use a busy loop with a non-blocking call to \fBblkioq_do_io()\fR.
+.RE
+.RE
+.TP
+.BI (libblkio)libblkio_force_enable_completion_eventfd
+Enable the queue's completion eventfd even when unused. This may impact
+performance. The default is to enable it only if
+\fBlibblkio_wait_mode=eventfd\fR.
+.TP
+.BI (windowsaio)no_completion_thread
+Avoid using a separate thread for completion polling.
.SS "I/O depth"
.TP
.BI iodepth \fR=\fPint
problem). Note that this option cannot reliably be used with async IO engines.
.SS "I/O rate"
.TP
+.BI thinkcycles \fR=\fPint
+Stall the job for the specified number of cycles after an I/O has completed before
+issuing the next. May be used to simulate processing being done by an application.
+This is not taken into account for the time to be waited on for \fBthinktime\fR.
+Might not have any effect on some platforms; this can be checked by trying a
+high enough amount of thinkcycles.
+.TP
.BI thinktime \fR=\fPtime
Stall the job for the specified period of time after an I/O has completed before issuing the
next. May be used to simulate processing being done by an application.
When the unit is omitted, the value is interpreted in microseconds. See
-\fBthinktime_blocks\fR and \fBthinktime_spin\fR.
+\fBthinktime_blocks\fR, \fBthinktime_iotime\fR and \fBthinktime_spin\fR.
.TP
.BI thinktime_spin \fR=\fPtime
Only valid if \fBthinktime\fR is set - pretend to spend CPU time doing
queue depth setting redundant, since no more than 1 I/O will be queued
before we have to complete it and do our \fBthinktime\fR. In other words, this
setting effectively caps the queue depth if the latter is larger.
+.TP
+.BI thinktime_blocks_type \fR=\fPstr
+Only valid if \fBthinktime\fR is set - control how \fBthinktime_blocks\fR triggers.
+The default is `complete', which triggers \fBthinktime\fR when fio completes
+\fBthinktime_blocks\fR blocks. If this is set to `issue', then the trigger happens
+at the issue side.
+.TP
+.BI thinktime_iotime \fR=\fPtime
+Only valid if \fBthinktime\fR is set - control \fBthinktime\fR interval by time.
+The \fBthinktime\fR stall is repeated after IOs are executed for
+\fBthinktime_iotime\fR. For example, `\-\-thinktime_iotime=9s \-\-thinktime=1s'
+repeats a 10-second cycle with IOs for 9 seconds and a stall for 1 second. When the
+unit is omitted, \fBthinktime_iotime\fR is interpreted as a number of seconds.
+If this option is used together with \fBthinktime_blocks\fR, the \fBthinktime\fR
+stall is repeated after \fBthinktime_iotime\fR or after \fBthinktime_blocks\fR
+IOs, whichever happens first.
+
.TP
.BI rate \fR=\fPint[,int][,int]
Cap the bandwidth used by this job. The number is in bytes/sec, the normal
kind of thinktime setting was used. If this option is set, then fio will
ignore the thinktime and continue doing IO at the specified rate, instead of
entering a catch-up mode after thinktime is done.
+.TP
+.BI rate_cycle \fR=\fPint
+Average bandwidth for \fBrate_min\fR and \fBrate_iops_min\fR over this number
+of milliseconds. Defaults to 1000.
.SS "I/O latency"
.TP
.BI latency_target \fR=\fPtime
queue depth that meets \fBlatency_target\fR and exit. If true, fio will continue
running and try to meet \fBlatency_target\fR by adjusting queue depth.
.TP
-.BI max_latency \fR=\fPtime
+.BI max_latency \fR=\fPtime[,time][,time]
If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
maximum latency. When the unit is omitted, the value is interpreted in
-microseconds.
-.TP
-.BI rate_cycle \fR=\fPint
-Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number
-of milliseconds. Defaults to 1000.
+microseconds. Comma-separated values may be specified for reads, writes,
+and trims as described in \fBblocksize\fR.
.SS "I/O replay"
.TP
.BI write_iolog \fR=\fPstr
Write the issued I/O patterns to the specified file. See
\fBread_iolog\fR. Specify a separate file for each job, otherwise the
-iologs will be interspersed and the file may be corrupt.
+iologs will be interspersed and the file may be corrupt. This file will be
+opened in append mode.
.TP
.BI read_iolog \fR=\fPstr
Open an iolog with the specified filename and replay the I/O patterns it
between 0 and 7, with 0 being the highest. See man
\fBionice\fR\|(1). Refer to an appropriate manpage for other operating
systems since meaning of priority may differ. For per-command priority
-setting, see I/O engine specific `cmdprio_percentage` and `hipri_percentage`
-options.
+setting, see the I/O engine specific `cmdprio_percentage` and
+`cmdprio` options.
.TP
.BI prioclass \fR=\fPint
Set the I/O priority class. See man \fBionice\fR\|(1). For per-command
-priority setting, see I/O engine specific `cmdprio_percentage` and `hipri_percent`
-options.
+priority setting, see the I/O engine specific `cmdprio_percentage` and
+`cmdprio_class` options.
+.TP
+.BI priohint \fR=\fPint
+Set the I/O priority hint. This is only applicable to platforms that support
+I/O priority classes and to devices with features controlled through priority
+hints, e.g. block devices supporting command duration limits, or CDL. CDL is a
+way to indicate the desired maximum latency of I/Os so that the device can
+optimize its internal command scheduling according to the latency limits
+indicated by the user. For per-I/O priority hint setting, see the I/O engine
+specific \fBcmdprio_hint\fR option.
.TP
.BI cpus_allowed \fR=\fPstr
Controls the same options as \fBcpumask\fR, but accepts a textual
To avoid false verification errors, do not use the norandommap option when
verifying data with async I/O engines and I/O depths > 1. Or use the
norandommap and the lfsr random generator together to avoid writing to the
-same offset with muliple outstanding I/Os.
+same offset with multiple outstanding I/Os.
.RE
.TP
.BI verify_offset \fR=\fPint
verification pass, according to the settings in the job file used. Default
false.
.TP
+.BI experimental_verify \fR=\fPbool
+Enable experimental verification. Standard verify records I/O metadata for
+later use during the verification phase. Experimental verify instead resets the
+file after the write phase and then replays I/Os for the verification phase.
+.TP
.BI trim_percentage \fR=\fPint
Number of verify blocks to discard/trim.
.TP
.TP
.BI trim_backlog_batch \fR=\fPint
Trim this number of I/O blocks.
-.TP
-.BI experimental_verify \fR=\fPbool
-Enable experimental verification.
.SS "Steady state"
.TP
.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
.TP
.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
A rolling window of this duration will be used to judge whether steady state
-has been reached. Data will be collected once per second. The default is 0
-which disables steady state detection. When the unit is omitted, the
-value is interpreted in seconds.
+has been reached. Data will be collected every \fBss_interval\fR. The default
+is 0 which disables steady state detection. When the unit is omitted, the value
+is interpreted in seconds.
.TP
.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
Allow the job to run for the specified duration before beginning data
collection for checking the steady state job termination criterion. The
default is 0. When the unit is omitted, the value is interpreted in seconds.
+.TP
+.BI steadystate_check_interval \fR=\fPtime "\fR,\fP ss_interval" \fR=\fPtime
+The values during the rolling window will be collected with a period of this
+value. If \fBss_interval\fR is 30s and \fBss_dur\fR is 300s, 10 measurements
+will be taken. Default is 1s but that might not converge, especially for slower
+devices, so set this accordingly. When the unit is omitted, the value is
+interpreted in seconds.
.SS "Measurements and reporting"
.TP
.BI per_job_logs \fR=\fPbool
-If set, this generates bw/clat/iops log with per file private filenames. If
-not set, jobs with identical names will share the log filename. Default:
+If set to true, fio generates bw/clat/iops logs with per job unique filenames.
+If set to false, jobs with identical names will share a log filename. Note that
+when this option is set to false log files will be opened in append mode and if
+log files already exist the previous contents will not be overwritten. Default:
true.
.TP
.BI group_reporting
per-job, use \fBgroup_reporting\fR. Jobs in a file will be part of the
same reporting group, unless if separated by a \fBstonewall\fR, or by
using \fBnew_group\fR.
+.RS
+.P
+NOTE: When \fBgroup_reporting\fR is used along with \fBjson\fR output, there
+are certain per-job properties which can be different between jobs but do not
+have a natural group-level equivalent. Examples include \fBkb_base\fR,
+\fBunit_base\fR, \fBsig_figs\fR, \fBthread_number\fR, \fBpid\fR, and
+\fBjob_start\fR. For these properties, the values for the first job are
+recorded for the group.
+.RE
.TP
.BI new_group
Start a new reporting group. See: \fBgroup_reporting\fR. If not given,
\fBwrite_bw_log\fR for details about the filename format and \fBLOG
FILE FORMATS\fR for how data is structured within the file.
.TP
+.BI log_entries \fR=\fPint
+By default, fio will log an entry in the iops, latency, or bw log for
+every I/O that completes. The initial number of I/O log entries is 1024.
+When the log entries are all used, new log entries are dynamically
+allocated. This dynamic log entry allocation may negatively impact
+time-related statistics such as I/O tail latencies (e.g. 99.9th percentile
+completion latency). This option allows specifying a larger initial
+number of log entries to avoid run-time allocation of new log entries,
+resulting in more precise time-related I/O statistics.
+See \fBlog_avg_msec\fR as well. Defaults to 1024.
+.TP
.BI log_avg_msec \fR=\fPint
-By default, fio will log an entry in the iops, latency, or bw log for every
-I/O that completes. When writing to the disk log, that can quickly grow to a
-very large size. Setting this option makes fio average the each log entry
-over the specified period of time, reducing the resolution of the log. See
-\fBlog_max_value\fR as well. Defaults to 0, logging all entries.
-Also see \fBLOG FILE FORMATS\fR section.
+By default, fio will log an entry in the iops, latency, or bw log for every I/O
+that completes. When writing to the disk log, that can quickly grow to a very
+large size. Setting this option directs fio to instead record an average over
+the specified duration for each log entry, reducing the resolution of the log.
+When the job completes, fio will flush any accumulated latency log data, so the
+final log interval may not match the value specified by this option and there
+may even be duplicate timestamps. See \fBlog_window_value\fR as well. Defaults
+to 0, logging entries for each I/O. Also see \fBLOG FILE FORMATS\fR section.
.TP
.BI log_hist_msec \fR=\fPint
Same as \fBlog_avg_msec\fR, but logs entries for completion latency
in coarseness, fio outputs half as many bins. Defaults to 0, for which
histogram logs contain 1216 latency bins. See \fBLOG FILE FORMATS\fR section.
.TP
-.BI log_max_value \fR=\fPbool
-If \fBlog_avg_msec\fR is set, fio logs the average over that window. If
-you instead want to log the maximum value, set this option to 1. Defaults to
-0, meaning that averaged values are logged.
+.BI log_window_value \fR=\fPstr "\fR,\fP log_max_value" \fR=\fPstr
+If \fBlog_avg_msec\fR is set, fio by default logs the average over that window.
+This option determines whether fio logs the average, maximum or both the
+values over the window. This only affects the latency logging, as both average
+and maximum values for iops or bw log will be the same. Accepted values are:
+.RS
+.TP
+.B avg
+Log average value over the window. The default.
+.TP
+.B max
+Log maximum value in the window.
+.TP
+.B both
+Log both average and maximum value over the window.
+.TP
+.B 0
+Backward-compatible alias for \fBavg\fR.
+.TP
+.B 1
+Backward-compatible alias for \fBmax\fR.
+.RE
.TP
.BI log_offset \fR=\fPbool
If this is set, the iolog options will include the byte offset for the I/O
entry as well as the other data values. Defaults to 0 meaning that
offsets are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
.TP
+.BI log_prio \fR=\fPbool
+If this is set, the iolog options will include the I/O priority for the I/O
+entry as well as the other data values. Defaults to 0 meaning that
+I/O priorities are not present in logs. Also see \fBLOG FILE FORMATS\fR section.
+.TP
.BI log_compression \fR=\fPint
If this is set, fio will compress the I/O logs as it goes, to keep the
memory footprint lower. When a log reaches the specified size, that chunk is
parameter. The files will be stored with a `.fz' suffix.
.TP
.BI log_unix_epoch \fR=\fPbool
-If set, fio will log Unix timestamps to the log files produced by enabling
-write_type_log for each log type, instead of the default zero-based
+Backward-compatible alias for \fBlog_alternate_epoch\fR.
+.TP
+.BI log_alternate_epoch \fR=\fPbool
+If set, fio will log timestamps based on the epoch used by the clock specified
+in the \fBlog_alternate_epoch_clock_id\fR option, to the log files produced by
+enabling write_type_log for each log type, instead of the default zero-based
timestamps.
.TP
+.BI log_alternate_epoch_clock_id \fR=\fPint
+Specifies the clock_id to be used by clock_gettime to obtain the alternate
+epoch if \fBlog_alternate_epoch\fR is true. Otherwise has no effect. Default
+value is 0, or CLOCK_REALTIME.
+.TP
.BI block_error_percentiles \fR=\fPbool
If set, record errors in trim block-sized units from writes and trims and
output a histogram of how many trims it took to get to errors, and what kind
completed. If this option is used, there are two more stats that are
appended, the total error count and the first error. The error field given
in the stats is the first error that was hit during the run.
+.RS
+.P
+Note: a write error from the device may go unnoticed by fio when using buffered
+IO, as the write() (or similar) system call merely dirties the kernel pages,
+unless `sync' or `direct' is used. Device IO errors occur when the dirty data is
+actually written out to disk. If fully sync writes aren't desirable, `fsync' or
+`fdatasync' can be used as well. This is specific to writes, as reads are always
+synchronous.
+.RS
+.P
The allowed values are:
.RS
.RS
usually be equal (or very close) to 0, as the time from submit to
complete is basically just CPU time (I/O has already been done, see slat
explanation).
+
+For file and directory operation engines, \fBclat\fP denotes the time
+to complete one file or directory operation.
+.RS
+.TP
+\fBfilecreate engine\fP:\tthe time cost to create a new file
+.TP
+\fBfilestat engine\fP:\tthe time cost to look up an existing file
+.TP
+\fBfiledelete engine\fP:\tthe time cost to delete a file
+.TP
+\fBdircreate engine\fP:\tthe time cost to create a new directory
+.TP
+\fBdirstat engine\fP:\tthe time cost to look up an existing directory
+.TP
+\fBdirdelete engine\fP:\tthe time cost to delete a directory
+.TP
+.RE
.TP
.B lat
Total latency. Same names as slat and clat, this denotes the time from
when fio created the I/O unit to completion of the I/O operation.
.TP
.B bw
-Bandwidth statistics based on samples. Same names as the xlat stats,
-but also includes the number of samples taken (\fIsamples\fR) and an
-approximate percentage of total aggregate bandwidth this thread
-received in its group (\fIper\fR). This last value is only really
-useful if the threads in this group are on the same disk, since they
-are then competing for disk access.
+Bandwidth statistics based on measurements from discrete intervals. Fio
+continuously monitors bytes transferred and I/O operations completed. By default
+fio calculates bandwidth in each half-second interval (see \fBbwavgtime\fR)
+and reports descriptive statistics for the measurements here. Same names as the
+xlat stats, but also includes the number of samples taken (\fIsamples\fR) and an
+approximate percentage of total aggregate bandwidth this thread received in its
+group (\fIper\fR). This last value is only really useful if the threads in this
+group are on the same disk, since they are then competing for disk access.
+
+For file and directory operation engines, \fBbw\fR is meaningless.
.TP
.B iops
-IOPS statistics based on samples. Same names as \fBbw\fR.
+IOPS statistics based on measurements from discrete intervals.
+For details see the description for \fBbw\fR above. See
+\fBiopsavgtime\fR to control the duration of the intervals.
+Same values reported here as for \fBbw\fR except for percentage.
+
+For file and directory operation engines, \fBiops\fP is the most
+fundamental index to denote the performance.
+It means how many files or directories can be operated per second.
+.RS
+.TP
+\fBfilecreate engine\fP:\tnumber of files that can be created per second
+.TP
+\fBfilestat engine\fP:\tnumber of files that can be looked up per second
+.TP
+\fBfiledelete engine\fP:\tnumber of files that can be deleted per second
+.TP
+\fBdircreate engine\fP:\tnumber of directories that can be created per second
+.TP
+\fBdirstat engine\fP:\tnumber of directories that can be looked up per second
+.TP
+\fBdirdelete engine\fP:\tnumber of directories that can be deleted per second
+.TP
+.RE
.TP
.B lat (nsec/usec/msec)
The distribution of I/O completion latencies. This is the time from when
.P
.nf
Disk stats (read/write):
- sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+ sda: ios=16398/16511, sectors=32321/65472, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
.fi
.P
Each value is printed for both reads and writes, with reads first. The
minimal output v3, separated by semicolons:
.P
.nf
- terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth;read_iops;read_runtime_ms;read_slat_min;read_slat_max;read_slat_mean;read_slat_dev;read_clat_min;read_clat_max;read_clat_mean;read_clat_dev;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min;read_lat_max;read_lat_mean;read_lat_dev;read_bw_min;read_bw_max;read_bw_agg_pct;read_bw_mean;read_bw_dev;write_kb;write_bandwidth;write_iops;write_runtime_ms;write_slat_min;write_slat_max;write_slat_mean;write_slat_dev;write_clat_min;write_clat_max;write_clat_mean;write_clat_dev;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min;write_lat_max;write_lat_mean;write_lat_dev;write_bw_min;write_bw_max;write_bw_agg_pct;write_bw_mean;write_bw_dev;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
+ terse_version_3;fio_version;jobname;groupid;error;read_kb;read_bandwidth_kb;read_iops;read_runtime_ms;read_slat_min_us;read_slat_max_us;read_slat_mean_us;read_slat_dev_us;read_clat_min_us;read_clat_max_us;read_clat_mean_us;read_clat_dev_us;read_clat_pct01;read_clat_pct02;read_clat_pct03;read_clat_pct04;read_clat_pct05;read_clat_pct06;read_clat_pct07;read_clat_pct08;read_clat_pct09;read_clat_pct10;read_clat_pct11;read_clat_pct12;read_clat_pct13;read_clat_pct14;read_clat_pct15;read_clat_pct16;read_clat_pct17;read_clat_pct18;read_clat_pct19;read_clat_pct20;read_tlat_min_us;read_lat_max_us;read_lat_mean_us;read_lat_dev_us;read_bw_min_kb;read_bw_max_kb;read_bw_agg_pct;read_bw_mean_kb;read_bw_dev_kb;write_kb;write_bandwidth_kb;write_iops;write_runtime_ms;write_slat_min_us;write_slat_max_us;write_slat_mean_us;write_slat_dev_us;write_clat_min_us;write_clat_max_us;write_clat_mean_us;write_clat_dev_us;write_clat_pct01;write_clat_pct02;write_clat_pct03;write_clat_pct04;write_clat_pct05;write_clat_pct06;write_clat_pct07;write_clat_pct08;write_clat_pct09;write_clat_pct10;write_clat_pct11;write_clat_pct12;write_clat_pct13;write_clat_pct14;write_clat_pct15;write_clat_pct16;write_clat_pct17;write_clat_pct18;write_clat_pct19;write_clat_pct20;write_tlat_min_us;write_lat_max_us;write_lat_mean_us;write_lat_dev_us;write_bw_min_kb;write_bw_max_kb;write_bw_agg_pct;write_bw_mean_kb;write_bw_dev_kb;cpu_user;cpu_sys;cpu_csw;cpu_mjf;cpu_minf;iodepth_1;iodepth_2;iodepth_4;iodepth_8;iodepth_16;iodepth_32;iodepth_64;lat_2us;lat_4us;lat_10us;lat_20us;lat_50us;lat_100us;lat_250us;lat_500us;lat_750us;lat_1000us;lat_2ms;lat_4ms;lat_10ms;lat_20ms;lat_50ms;lat_100ms;lat_250ms;lat_500ms;lat_750ms;lat_1000ms;lat_2000ms;lat_over_2000ms;disk_name;disk_read_iops;disk_write_iops;disk_read_merges;disk_write_merges;disk_read_ticks;write_ticks;disk_queue_time;disk_util
.fi
.P
In client/server mode terse output differs from what appears when jobs are run
.TP
.B Trace file format v2
The second version of the trace file format was added in fio version 1.17. It
-allows to access more then one file per trace and has a bigger set of possible
+allows one to access more than one file per trace and has a bigger set of possible
file actions.
.RS
.P
.TP
.B wait
Wait for `offset' microseconds. Everything below 100 is discarded.
-The time is relative to the previous `wait' statement.
+The time is relative to the previous `wait' statement. Note that action `wait`
+is not allowed as of version 3, as the same behavior can be achieved using
+timestamps.
.TP
.B read
Read `length' bytes beginning from `offset'.
Trim the given file from the given `offset' for `length' bytes.
.RE
.RE
+.RE
+.TP
+.B Trace file format v3
+The third version of the trace file format was added in fio version 3.31. It
+forces each action to have a timestamp associated with it.
+.RS
+.P
+The first line of the trace file has to be:
+.RS
+.P
+"fio version 3 iolog"
+.RE
+.P
+Following this can be lines in two different formats, which are described below.
+.P
+.B
+The file management format:
+.RS
+timestamp filename action
+.P
+.RE
+.B
+The file I/O action format:
+.RS
+timestamp filename action offset length
+.P
+The `timestamp` is relative to the beginning of the run (i.e. it starts at 0). The
+`filename`, `action`, `offset` and `length` are identical to version 2, except
+that version 3 does not allow the `wait` action.
+.RE
+.RE
.SH I/O REPLAY \- MERGING TRACES
Colocation is a common practice used to get the most out of a machine.
Knowing which workloads play nicely with each other and which ones don't is
from the start of the file for that particular I/O. The logging of the offset can be
toggled with \fBlog_offset\fR.
.P
-`Command priority` is 0 for normal priority and 1 for high priority. This is controlled
-by the ioengine specific \fBcmdprio_percentage\fR.
+If \fBlog_prio\fR is not set, the entry's `Command priority` is 1 for an IO executed
+with the highest RT priority class (\fBprioclass\fR=1 or \fBcmdprio_class\fR=1) and 0
+otherwise. This is controlled by the \fBprioclass\fR option and the ioengine specific
+\fBcmdprio_percentage\fR and \fBcmdprio_class\fR options. If \fBlog_prio\fR is set, the
+entry's `Command priority` is the priority set for the IO, as a 16-bit hexadecimal
+number with the lowest 13 bits indicating the priority value (\fBprio\fR and
+\fBcmdprio\fR options) and the highest 3 bits indicating the IO priority class
+(\fBprioclass\fR and \fBcmdprio_class\fR options).
.P
Fio defaults to logging every individual I/O but when windowed logging is set
-through \fBlog_avg_msec\fR, either the average (by default) or the maximum
-(\fBlog_max_value\fR is set) `value' seen over the specified period of time
-is recorded. Each `data direction' seen within the window period will aggregate
-its values in a separate row. Further, when using windowed logging the `block
-size' and `offset' entries will always contain 0.
+through \fBlog_avg_msec\fR, either the average (by default), the maximum
+(\fBlog_window_value\fR is set to max) `value' seen over the specified period of
+time, or both the average `value' and maximum `value1' (\fBlog_window_value\fR is
+set to both) is recorded. The log file format when both the values are reported
+takes this form:
+.RS
+.P
+time (msec), value, value1, data direction, block size (bytes), offset (bytes),
+command priority
+.RE
+.P
+Each `data direction' seen within the window period will aggregate its values
+in a separate row. Further, when using windowed logging the `block size' and
+`offset' entries will always contain 0.
.SH CLIENT / SERVER
Normally fio is invoked as a stand-alone application on the machine where the
I/O workload should be generated. However, the backend and frontend of fio can
server. The `server' string follows the same format as it does on the server
side, to allow IP/hostname/socket and port strings.
.P
+Note that all job options must be defined in job files when running fio as a
+client. Any job options specified in `remote\-args' will be ignored.
+.P
Fio can connect to multiple servers this way:
.RS
.P
{
int ret = 1;
- compiletime_assert(TD_NR <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT");
-
if (initialize_fio(envp))
return 1;
#include "workqueue.h"
#include "steadystate.h"
#include "lib/nowarn_snprintf.h"
+#include "dedupe.h"
#ifdef CONFIG_SOLARISAIO
#include <sys/asynch.h>
struct fio_sem;
+#define MAX_TRIM_RANGE 256
+
+/*
+ * Range for trim command: a single (start, len) pair.
+ * NOTE(review): the units of start/len are not visible here (presumably
+ * bytes), and MAX_TRIM_RANGE presumably caps how many of these one trim
+ * may carry -- TODO confirm against the users of this struct.
+ */
+struct trim_range {
+ unsigned long long start;
+ unsigned long long len;
+};
+
/*
* offset generator types
*/
__TD_F_MMAP_KEEP,
__TD_F_DIRS_CREATED,
__TD_F_CHECK_RATE,
+ __TD_F_SYNCS,
__TD_F_LAST, /* not a real bit, keep last */
};
TD_F_MMAP_KEEP = 1U << __TD_F_MMAP_KEEP,
TD_F_DIRS_CREATED = 1U << __TD_F_DIRS_CREATED,
TD_F_CHECK_RATE = 1U << __TD_F_CHECK_RATE,
+ TD_F_SYNCS = 1U << __TD_F_SYNCS,
};
enum {
FIO_RAND_POISSON2_OFF,
FIO_RAND_POISSON3_OFF,
FIO_RAND_PRIO_CMDS,
+ FIO_RAND_DEDUPE_WORKING_SET_IX,
+ FIO_RAND_FDP_OFF,
FIO_RAND_NR_OFFS,
};
RATE_PROCESS_LINEAR = 0,
RATE_PROCESS_POISSON = 1,
+
+ THINKTIME_BLOCKS_TYPE_COMPLETE = 0,
+ THINKTIME_BLOCKS_TYPE_ISSUE = 1,
};
enum {
F_ADV_TYPE,
F_ADV_RANDOM,
F_ADV_SEQUENTIAL,
+ F_ADV_NOREUSE,
};
/*
*/
struct thread_data {
struct flist_head opt_list;
- unsigned long flags;
+ unsigned long long flags;
struct thread_options o;
void *eo;
pthread_t thread;
double pareto_h;
double gauss_dev;
};
+ double random_center;
int error;
int sig;
int done;
struct frand_state bsrange_state[DDIR_RWDIR_CNT];
struct frand_state verify_state;
+ struct frand_state verify_state_last_do_io;
struct frand_state trim_state;
struct frand_state delay_state;
+ struct frand_state fdp_state;
struct frand_state buf_state;
struct frand_state buf_state_prev;
+ struct frand_state buf_state_ret;
struct frand_state dedupe_state;
struct frand_state zone_state;
struct frand_state prio_state;
+ struct frand_state dedupe_working_set_index_state;
+ struct frand_state *dedupe_working_set_states;
+
+ unsigned long long num_unique_pages;
struct zone_split_index **zone_state_index;
- unsigned int num_open_zones;
+ unsigned int num_write_zones;
unsigned int verify_batch;
unsigned int trim_batch;
int shm_id;
+ /*
+ * Job default IO priority set with prioclass and prio options.
+ */
+ unsigned int ioprio;
+
/*
* IO engine hooks, contains everything needed to submit an io_u
* to any of the available IO engines.
* IO engine private data and dlhandle.
*/
void *io_ops_data;
- void *io_ops_dlhandle;
/*
* Queue depth of io_u's that fio MIGHT do
*/
uint64_t rate_bps[DDIR_RWDIR_CNT];
uint64_t rate_next_io_time[DDIR_RWDIR_CNT];
- unsigned long long rate_bytes[DDIR_RWDIR_CNT];
- unsigned long rate_blocks[DDIR_RWDIR_CNT];
+ unsigned long long last_rate_check_bytes[DDIR_RWDIR_CNT];
+ unsigned long last_rate_check_blocks[DDIR_RWDIR_CNT];
unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT];
- struct timespec lastrate[DDIR_RWDIR_CNT];
+ struct timespec last_rate_check_time[DDIR_RWDIR_CNT];
int64_t last_usec[DDIR_RWDIR_CNT];
struct frand_state poisson_state[DDIR_RWDIR_CNT];
* Issue side
*/
uint64_t io_issues[DDIR_RWDIR_CNT];
+ uint64_t verify_read_issues;
uint64_t io_issue_bytes[DDIR_RWDIR_CNT];
uint64_t loops;
uint64_t zone_bytes;
struct fio_sem *sem;
uint64_t bytes_done[DDIR_RWDIR_CNT];
+ uint64_t bytes_verified;
+
+ uint64_t *thinktime_blocks_counter;
+ struct timespec last_thinktime;
+ int64_t last_thinktime_blocks;
/*
* State for random io, a bitmap of blocks done vs not done
struct timespec start; /* start of this loop */
struct timespec epoch; /* time job was started */
- unsigned long long unix_epoch; /* Time job was started, unix epoch based. */
+ unsigned long long alternate_epoch; /* Time job was started, as clock_gettime(log_alternate_epoch_clock_id) */
+ unsigned long long job_start; /* Time job was started, as clock_gettime(job_start_clock_id) */
struct timespec last_issue;
long time_offset;
struct timespec ts_cache;
*/
struct flist_head io_log_list;
FILE *io_log_rfile;
+ unsigned int io_log_blktrace;
+ unsigned int io_log_blktrace_swap;
+ unsigned long long io_log_last_ttime;
+ struct timespec io_log_start_time;
unsigned int io_log_current;
unsigned int io_log_checkmark;
unsigned int io_log_highmark;
+ unsigned int io_log_version;
struct timespec io_log_highmark_time;
/*
};
+/*
+ * One chunk of the job table: an array of thread_data entries plus the
+ * bookkeeping needed to hand out slots from it and to release it again.
+ */
+struct thread_segment {
+ /* Array of JOBS_PER_SEG thread_data slots backing this segment */
+ struct thread_data *threads;
+ /* Identifier returned by shmget() for this segment's memory */
+ int shm_id;
+ /* Number of slots in 'threads' currently handed out to jobs */
+ int nr_threads;
+};
+
/*
* when should interactive ETA output be generated
*/
#define __fio_stringify_1(x) #x
#define __fio_stringify(x) __fio_stringify_1(x)
+#define REAL_MAX_JOBS 4096
+#define JOBS_PER_SEG 8
+#define REAL_MAX_SEG (REAL_MAX_JOBS / JOBS_PER_SEG)
+
extern bool exitall_on_terminate;
extern unsigned int thread_number;
extern unsigned int stat_number;
-extern int shm_id;
+extern unsigned int nr_segments;
+extern unsigned int cur_segment;
extern int groupid;
extern int output_format;
extern int append_terse_output;
extern long long trigger_timeout;
extern char *aux_path;
-extern struct thread_data *threads;
+extern struct thread_segment segments[REAL_MAX_SEG];
+
+/*
+ * Map a global job index onto its thread_data slot: the quotient by
+ * JOBS_PER_SEG selects the segment, the low bits select the entry inside
+ * it. The mask form relies on JOBS_PER_SEG being a power of two.
+ */
+static inline struct thread_data *tnumber_to_td(unsigned int tnumber)
+{
+	const unsigned int seg_nr = tnumber / JOBS_PER_SEG;
+	const unsigned int slot = tnumber & (JOBS_PER_SEG - 1);
+
+	return &segments[seg_nr].threads[slot];
+}
static inline bool is_running_backend(void)
{
!(io_u->ddir == DDIR_TRIM && !td_trim(td)));
}
-#define REAL_MAX_JOBS 4096
+/*
+ * True when the io_u is a trim and the job is configured with more than
+ * one range (o.num_range > 1).
+ */
+static inline bool multi_range_trim(struct thread_data *td, struct io_u *io_u)
+{
+	return io_u->ddir == DDIR_TRIM && td->o.num_range > 1;
+}
static inline bool should_fsync(struct thread_data *td)
{
extern char *fio_option_dup_subs(const char *);
extern void fio_options_mem_dupe(struct thread_data *);
extern void td_fill_rand_seeds(struct thread_data *);
-extern void td_fill_verify_state_seed(struct thread_data *);
extern void add_job_opts(const char **, int);
extern int ioengine_load(struct thread_data *);
extern bool parse_dryrun(void);
TD_NR,
};
-#define TD_ENG_FLAG_SHIFT 17
-#define TD_ENG_FLAG_MASK ((1U << 17) - 1)
+#define TD_ENG_FLAG_SHIFT (__TD_F_LAST)
+#define TD_ENG_FLAG_MASK ((1ULL << (__TD_F_LAST)) - 1)
+/*
+ * Refresh the ioengine portion of td->flags: the bits selected by
+ * TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT are cleared and then replaced with
+ * io_ops->flags shifted up by TD_ENG_FLAG_SHIFT. The engine flags are
+ * widened to unsigned long long before the shift.
+ */
static inline void td_set_ioengine_flags(struct thread_data *td)
{
td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) |
- (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
+ ((unsigned long long)td->io_ops->flags << TD_ENG_FLAG_SHIFT);
}
static inline bool td_ioengine_flagged(struct thread_data *td,
/*
* Iterates all threads/processes within all the defined jobs
+ * Usage:
+ * for_each_td(var_name_for_td) {
+ * << body of your loop >>
+ * Note: internally-scoped loop index available as __td_index
+ * } end_for_each()
*/
-#define for_each_td(td, i) \
- for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
+#define for_each_td(td) \
+{ \
+ int __td_index; \
+ struct thread_data *(td); \
+ for (__td_index = 0, (td) = &segments[0].threads[0];\
+ __td_index < (int) thread_number; __td_index++, (td) = tnumber_to_td(__td_index))
+#define for_each_td_index() \
+{ \
+ int __td_index; \
+ for (__td_index = 0; __td_index < (int) thread_number; __td_index++)
+#define end_for_each() }
+
#define for_each_file(td, f, i) \
if ((td)->files_index) \
for ((i) = 0, (f) = (td)->files[0]; \
return false;
}
-static inline bool __should_check_rate(struct thread_data *td)
-{
- return (td->flags & TD_F_CHECK_RATE) != 0;
-}
-
+/*
+ * Rate checking applies whenever TD_F_CHECK_RATE is set in td->flags.
+ */
static inline bool should_check_rate(struct thread_data *td)
{
- if (!__should_check_rate(td))
- return false;
-
- return ddir_rw_sum(td->bytes_done) != 0;
+ return (td->flags & TD_F_CHECK_RATE) != 0;
}
static inline unsigned long long td_max_bs(struct thread_data *td)
extern uint64_t time_since_genesis(void);
extern uint64_t mtime_since_genesis(void);
extern uint64_t utime_since_genesis(void);
+extern void cycles_spin(unsigned int);
extern uint64_t usec_spin(unsigned int);
extern uint64_t usec_sleep(struct thread_data *, unsigned long);
extern void fill_start_time(struct timespec *);
extern bool in_ramp_time(struct thread_data *);
extern void fio_time_init(void);
extern void timespec_add_msec(struct timespec *, unsigned int);
-extern void set_epoch_time(struct thread_data *, int);
+extern void set_epoch_time(struct thread_data *, clockid_t, clockid_t);
#endif
if (td->o.flow_sleep) {
io_u_quiesce(td);
usleep(td->o.flow_sleep);
+ } else if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ io_u_quiesce(td);
}
return 1;
{ "PrintFile", GTK_STOCK_PRINT, "Print", "<Control>P", NULL, G_CALLBACK(results_print) },
{ "CloseFile", GTK_STOCK_CLOSE, "Close", "<Control>W", NULL, G_CALLBACK(results_close) },
};
-static gint results_nmenu_items = ARRAY_SIZE(results_menu_items);
+static gint results_nmenu_items = FIO_ARRAY_SIZE(results_menu_items);
static const gchar *results_ui_string = " \
<ui> \
if (sum_stat_clients == 1)
return;
- sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
+ sum_thread_stats(&client_ts, &p->ts);
sum_group_stats(&client_gs, &p->rs);
client_ts.members++;
}
static struct thread_options *gfio_client_add_job(struct gfio_client *gc,
- struct thread_options_pack *top)
+ struct thread_options_pack *top, size_t top_sz)
{
struct gfio_client_options *gco;
gco = calloc(1, sizeof(*gco));
- convert_thread_options_to_cpu(&gco->o, top);
+ if (convert_thread_options_to_cpu(&gco->o, top, top_sz)) {
+ 	dprint(FD_NET, "client: failed parsing add_job command\n");
+ 	/*
+ 	 * gco has not been linked onto gc->o_list yet, so nobody else
+ 	 * can release it: free it here instead of leaking it.
+ 	 */
+ 	free(gco);
+ 	return NULL;
+ }
INIT_FLIST_HEAD(&gco->list);
flist_add_tail(&gco->list, &gc->o_list);
gc->o_list_nr = 1;
p->thread_number = le32_to_cpu(p->thread_number);
p->groupid = le32_to_cpu(p->groupid);
- o = gfio_client_add_job(gc, &p->top);
+ o = gfio_client_add_job(gc, &p->top,
+ cmd->pdu_len - offsetof(struct cmd_add_job_pdu, top));
+ if (o == NULL)
+ return;
gdk_threads_enter();
GtkListStore *model;
int i;
const char *labels[] = { "Depth", "0", "1", "2", "4", "8", "16", "32", "64", ">= 64" };
- const int nr_labels = ARRAY_SIZE(labels);
+ const int nr_labels = FIO_ARRAY_SIZE(labels);
GType types[nr_labels];
frame = gtk_frame_new("IO depths");
#define GFIO_CLAT 1
#define GFIO_SLAT 2
#define GFIO_LAT 4
-#define GFIO_HILAT 8
-#define GFIO_LOLAT 16
static void gfio_show_ddir_status(struct gfio_client *gc, GtkWidget *mbox,
struct group_run_stats *rs,
struct thread_stat *ts, int ddir)
{
const char *ddir_label[3] = { "Read", "Write", "Trim" };
- const char *hilat, *lolat;
GtkWidget *frame, *label, *box, *vbox, *main_vbox;
- unsigned long long min[5], max[5];
+ unsigned long long min[3], max[3];
unsigned long runt;
unsigned long long bw, iops;
unsigned int flags = 0;
- double mean[5], dev[5];
+ double mean[3], dev[3];
char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p;
char tmp[128];
int i2p;
flags |= GFIO_CLAT;
if (calc_lat(&ts->lat_stat[ddir], &min[2], &max[2], &mean[2], &dev[2]))
flags |= GFIO_LAT;
- if (calc_lat(&ts->clat_high_prio_stat[ddir], &min[3], &max[3], &mean[3], &dev[3])) {
- flags |= GFIO_HILAT;
- if (calc_lat(&ts->clat_low_prio_stat[ddir], &min[4], &max[4], &mean[4], &dev[4]))
- flags |= GFIO_LOLAT;
- /* we only want to print low priority statistics if other IOs were
- * submitted with the priority bit set
- */
- }
if (flags) {
frame = gtk_frame_new("Latency");
vbox = gtk_vbox_new(FALSE, 3);
gtk_container_add(GTK_CONTAINER(frame), vbox);
- if (ts->lat_percentiles) {
- hilat = "High priority total latency";
- lolat = "Low priority total latency";
- } else {
- hilat = "High priority completion latency";
- lolat = "Low priority completion latency";
- }
-
if (flags & GFIO_SLAT)
gfio_show_lat(vbox, "Submission latency", min[0], max[0], mean[0], dev[0]);
if (flags & GFIO_CLAT)
gfio_show_lat(vbox, "Completion latency", min[1], max[1], mean[1], dev[1]);
if (flags & GFIO_LAT)
gfio_show_lat(vbox, "Total latency", min[2], max[2], mean[2], dev[2]);
- if (flags & GFIO_HILAT)
- gfio_show_lat(vbox, hilat, min[3], max[3], mean[3], dev[3]);
- if (flags & GFIO_LOLAT)
- gfio_show_lat(vbox, lolat, min[4], max[4], mean[4], dev[4]);
}
if (ts->slat_percentiles && flags & GFIO_SLAT)
ts->io_u_plat[FIO_SLAT][ddir],
ts->slat_stat[ddir].samples,
"Submission");
- if (ts->clat_percentiles && flags & GFIO_CLAT) {
+ if (ts->clat_percentiles && flags & GFIO_CLAT)
gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
ts->io_u_plat[FIO_CLAT][ddir],
ts->clat_stat[ddir].samples,
"Completion");
- if (!ts->lat_percentiles) {
- if (flags & GFIO_HILAT)
- gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
- ts->io_u_plat_high_prio[ddir],
- ts->clat_high_prio_stat[ddir].samples,
- "High priority completion");
- if (flags & GFIO_LOLAT)
- gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
- ts->io_u_plat_low_prio[ddir],
- ts->clat_low_prio_stat[ddir].samples,
- "Low priority completion");
- }
- }
- if (ts->lat_percentiles && flags & GFIO_LAT) {
+ if (ts->lat_percentiles && flags & GFIO_LAT)
gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
ts->io_u_plat[FIO_LAT][ddir],
ts->lat_stat[ddir].samples,
"Total");
- if (flags & GFIO_HILAT)
- gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
- ts->io_u_plat_high_prio[ddir],
- ts->clat_high_prio_stat[ddir].samples,
- "High priority total");
- if (flags & GFIO_LOLAT)
- gfio_show_clat_percentiles(gc, main_vbox, ts, ddir,
- ts->io_u_plat_low_prio[ddir],
- ts->clat_low_prio_stat[ddir].samples,
- "Low priority total");
- }
free(io_p);
free(bw_p);
* but I'm not sure what to use outside of a simple CPU nop to relax
* it - we don't want to lose precision.
*/
- while (threads) {
+ while (nr_segments) {
fio_gtod_update();
nop;
}
max_ticks = MAX_CLOCK_SEC * cycles_per_msec * 1000ULL;
max_mult = ULLONG_MAX / max_ticks;
- dprint(FD_TIME, "\n\nmax_ticks=%llu, __builtin_clzll=%d, "
+ dprint(FD_TIME, "max_ticks=%llu, __builtin_clzll=%d, "
"max_mult=%llu\n", max_ticks,
__builtin_clzll(max_ticks), max_mult);
/*
* Find the greatest power of 2 clock ticks that is less than the
- * ticks in MAX_CLOCK_SEC_2STAGE
+ * ticks in MAX_CLOCK_SEC
*/
max_cycles_shift = max_cycles_mask = 0;
tmp = MAX_CLOCK_SEC * 1000ULL * cycles_per_msec;
uint64_t ntime_since(const struct timespec *s, const struct timespec *e)
{
- int64_t sec, nsec;
+ int64_t sec, nsec;
- sec = e->tv_sec - s->tv_sec;
- nsec = e->tv_nsec - s->tv_nsec;
- if (sec > 0 && nsec < 0) {
- sec--;
- nsec += 1000000000LL;
- }
+ sec = e->tv_sec - s->tv_sec;
+ nsec = e->tv_nsec - s->tv_nsec;
+ if (sec > 0 && nsec < 0) {
+ sec--;
+ nsec += 1000000000LL;
+ }
/*
* time warp bug on some kernels?
*/
- if (sec < 0 || (sec == 0 && nsec < 0))
- return 0;
+ if (sec < 0 || (sec == 0 && nsec < 0))
+ return 0;
- return nsec + (sec * 1000000000LL);
+ return nsec + (sec * 1000000000LL);
}
uint64_t ntime_since_now(const struct timespec *s)
seq = *t->seq;
if (seq == UINT_MAX)
break;
- __sync_synchronize();
+ tsc_barrier();
tsc = get_cpu_clock();
} while (seq != atomic32_compare_and_swap(t->seq, seq, seq + 1));
int fio_monotonic_clocktest(int debug)
{
struct clock_thread *cthreads;
- unsigned int nr_cpus = cpus_online();
+ unsigned int seen_cpus, nr_cpus = cpus_configured();
struct clock_entry *entries;
unsigned long nr_entries, tentries, failed = 0;
struct clock_entry *prev, *this;
uint32_t seq = 0;
unsigned int i;
+ os_cpu_mask_t mask;
+
+#ifdef FIO_HAVE_GET_THREAD_AFFINITY
+ fio_get_thread_affinity(mask);
+#else
+ memset(&mask, 0, sizeof(mask));
+ for (i = 0; i < nr_cpus; i++)
+ fio_cpu_set(&mask, i);
+#endif
if (debug) {
log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no");
if (debug)
log_info("cs: Testing %u CPUs\n", nr_cpus);
+ seen_cpus = 0;
for (i = 0; i < nr_cpus; i++) {
struct clock_thread *t = &cthreads[i];
+ if (!fio_cpu_isset(&mask, i))
+ continue;
t->cpu = i;
t->debug = debug;
t->seq = &seq;
t->nr_entries = nr_entries;
- t->entries = &entries[i * nr_entries];
+ t->entries = &entries[seen_cpus * nr_entries];
__fio_sem_init(&t->lock, FIO_SEM_LOCKED);
if (pthread_create(&t->thread, NULL, clock_thread_fn, t)) {
failed++;
nr_cpus = i;
break;
}
+ seen_cpus++;
}
for (i = 0; i < nr_cpus; i++) {
struct clock_thread *t = &cthreads[i];
+ if (!fio_cpu_isset(&mask, i))
+ continue;
fio_sem_up(&t->lock);
}
struct clock_thread *t = &cthreads[i];
void *ret;
+ if (!fio_cpu_isset(&mask, i))
+ continue;
pthread_join(t->thread, &ret);
if (ret)
failed++;
goto err;
}
+ tentries = nr_entries * seen_cpus;
qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp);
/* silence silly gcc */
{
struct gui_entry *ge;
- ge = malloc(sizeof(*ge));
- memset(ge, 0, sizeof(*ge));
+ ge = calloc(1, sizeof(*ge));
ge->state = GE_STATE_NEW;
ge->ui = ui;
return ge;
{ "Quit", GTK_STOCK_QUIT, NULL, "<Control>Q", NULL, G_CALLBACK(quit_clicked) },
{ "About", GTK_STOCK_ABOUT, NULL, NULL, NULL, G_CALLBACK(about_dialog) },
};
-static gint nmenu_items = ARRAY_SIZE(menu_items);
+static gint nmenu_items = FIO_ARRAY_SIZE(menu_items);
static const gchar *ui_string = " \
<ui> \
gtk_container_add(GTK_CONTAINER(bottom_align), ge->buttonbox);
gtk_box_pack_start(GTK_BOX(main_vbox), bottom_align, FALSE, FALSE, 0);
- add_buttons(ge, buttonspeclist, ARRAY_SIZE(buttonspeclist));
+ add_buttons(ge, buttonspeclist, FIO_ARRAY_SIZE(buttonspeclist));
/*
* Set up thread status progress bar
unsigned long long *p, unsigned int idx)
{
struct gopt_str_val *g;
- const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "PiB", "" };
+ const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB" };
GtkWidget *label;
int i;
#ifndef GFIO_OPTIONS_H
#define GFIO_OPTIONS_H
+#include <gtk/gtk.h>
+
void gopt_get_options_window(GtkWidget *window, struct gfio_client *gc);
void gopt_init(void);
void gopt_exit(void);
struct graph *g = i->parent;
struct graph_value *x;
- x = malloc(sizeof(*x));
- memset(x, 0, sizeof(*x));
+ x = calloc(1, sizeof(*x));
INIT_FLIST_HEAD(&x->alias);
INIT_FLIST_HEAD(&x->list);
flist_add_tail(&x->list, &i->value_list);
ydiff = fabs(yval - y);
/*
- * zero delta, or within or match critera, break
+ * zero delta, or within or match criteria, break
*/
if (ydiff < best_delta) {
best_delta = ydiff;
(C) 2002 William Lee Irwin III, IBM */
/*
- * Knuth recommends primes in approximately golden ratio to the maximum
- * integer representable by a machine word for multiplicative hashing.
- * Chuck Lever verified the effectiveness of this technique:
- * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
- *
- * These primes are chosen to be bit-sparse, that is operations on
- * them can use shifts and additions instead of multiplications for
- * machines where multiplications are slow.
- */
-
-#if BITS_PER_LONG == 32
-/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e370001UL
-#elif BITS_PER_LONG == 64
-/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
-#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
-#else
-#error Define GOLDEN_RATIO_PRIME for your wordsize.
-#endif
-
-/*
- * The above primes are actively bad for hashing, since they are
- * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
- * real problems. Besides, the "prime" part is pointless for the
- * multiplicative hash.
- *
* Although a random odd number will do, it turns out that the golden
* ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
* properties.
/* Last block: affect all 32 bits of (c) */
/* All the case statements fall through */
switch (length) {
- case 12: c += (uint32_t) k[11] << 24; fallthrough;
- case 11: c += (uint32_t) k[10] << 16; fallthrough;
- case 10: c += (uint32_t) k[9] << 8; fallthrough;
- case 9: c += k[8]; fallthrough;
- case 8: b += (uint32_t) k[7] << 24; fallthrough;
- case 7: b += (uint32_t) k[6] << 16; fallthrough;
- case 6: b += (uint32_t) k[5] << 8; fallthrough;
- case 5: b += k[4]; fallthrough;
- case 4: a += (uint32_t) k[3] << 24; fallthrough;
- case 3: a += (uint32_t) k[2] << 16; fallthrough;
- case 2: a += (uint32_t) k[1] << 8; fallthrough;
+ case 12: c += (uint32_t) k[11] << 24; fio_fallthrough;
+ case 11: c += (uint32_t) k[10] << 16; fio_fallthrough;
+ case 10: c += (uint32_t) k[9] << 8; fio_fallthrough;
+ case 9: c += k[8]; fio_fallthrough;
+ case 8: b += (uint32_t) k[7] << 24; fio_fallthrough;
+ case 7: b += (uint32_t) k[6] << 16; fio_fallthrough;
+ case 6: b += (uint32_t) k[5] << 8; fio_fallthrough;
+ case 5: b += k[4]; fio_fallthrough;
+ case 4: a += (uint32_t) k[3] << 24; fio_fallthrough;
+ case 3: a += (uint32_t) k[2] << 16; fio_fallthrough;
+ case 2: a += (uint32_t) k[1] << 8; fio_fallthrough;
case 1: a += k[0];
__jhash_final(a, b, c);
- fallthrough;
+ fio_fallthrough;
case 0: /* Nothing left to add */
break;
}
+#include <errno.h>
#include <signal.h>
+#include <stdio.h>
+#include <string.h>
#include <unistd.h>
#ifdef CONFIG_HAVE_TIMERFD_CREATE
#include <sys/timerfd.h>
#define DRD_IGNORE_VAR(x) do { } while (0)
#endif
+#ifdef WIN32
+#include "os/os-windows.h"
+#endif
+
#include "fio.h"
#include "smalloc.h"
#include "helper_thread.h"
static void block_signals(void)
{
-#ifdef HAVE_PTHREAD_SIGMASK
+#ifdef CONFIG_PTHREAD_SIGMASK
sigset_t sigmask;
+ int ret;
+
ret = pthread_sigmask(SIG_UNBLOCK, NULL, &sigmask);
assert(ret == 0);
ret = pthread_sigmask(SIG_BLOCK, &sigmask, NULL);
- assert(ret == 0);
#endif
}
return;
ret = write_to_pipe(helper_data->pipe[1], &data, sizeof(data));
- assert(ret == 1);
+ if (ret != 1) {
+ 	/* Newline-terminate the message so the assert output starts on its own line */
+ 	log_err("failed to write action into pipe, err %i:%s\n", errno, strerror(errno));
+ 	assert(0);
+ }
}
void helper_reset(void)
return;
helper_data->exit = 1;
- submit_action(A_EXIT);
pthread_join(helper_data->thread, NULL);
}
},
{
.name = "steadystate",
- .interval_ms = steadystate_enabled ? STEADYSTATE_MSEC :
+ .interval_ms = steadystate_enabled ? ss_check_interval :
0,
.func = steadystate_check,
}
};
struct timespec ts;
- int clk_tck, ret = 0;
+ long clk_tck;
+ int ret = 0;
-#ifdef _SC_CLK_TCK
- clk_tck = sysconf(_SC_CLK_TCK);
-#else
- /*
- * The timer frequence is variable on Windows. Instead of trying to
- * query it, use 64 Hz, the clock frequency lower bound. See also
- * https://carpediemsystems.co.uk/2019/07/18/windows-system-timer-granularity/.
- */
- clk_tck = 64;
-#endif
- dprint(FD_HELPERTHREAD, "clk_tck = %d\n", clk_tck);
+ os_clk_tck(&clk_tck);
+
+ dprint(FD_HELPERTHREAD, "clk_tck = %ld\n", clk_tck);
assert(clk_tck > 0);
sleep_accuracy_ms = (1000 + clk_tck - 1) / clk_tck;
block_signals();
fio_get_mono_time(&ts);
- msec_to_next_event = reset_timers(timer, ARRAY_SIZE(timer), &ts);
+ msec_to_next_event = reset_timers(timer, FIO_ARRAY_SIZE(timer), &ts);
fio_sem_up(hd->startup_sem);
if (action == A_RESET)
msec_to_next_event = reset_timers(timer,
- ARRAY_SIZE(timer), &ts);
+ FIO_ARRAY_SIZE(timer), &ts);
- for (i = 0; i < ARRAY_SIZE(timer); ++i)
+ for (i = 0; i < FIO_ARRAY_SIZE(timer); ++i)
ret = eval_timer(&timer[i], &ts, &msec_to_next_event);
if (action == A_DO_STAT)
#ifndef FIO_HELPER_THREAD_H
#define FIO_HELPER_THREAD_H
+#include <stdbool.h>
+
+struct fio_sem;
+struct sk_out;
+
extern void helper_reset(void);
extern void helper_do_stat(void);
extern bool helper_should_exit(void);
pthread_condattr_t cattr;
struct idle_prof_thread *ipt;
- ipc.nr_cpus = cpus_online();
+ ipc.nr_cpus = cpus_configured();
ipc.status = IDLE_PROF_STATUS_OK;
if (ipc.opt == IDLE_PROF_OPT_NONE)
#define FIO_RANDSEED (0xb1899bedUL)
static char **ini_file;
-static int max_jobs = FIO_MAX_JOBS;
static bool dump_cmdline;
static bool parse_only;
static bool merge_blktrace_only;
static struct thread_data def_thread;
-struct thread_data *threads = NULL;
+struct thread_segment segments[REAL_MAX_SEG];
static char **job_sections;
static int nr_job_sections;
.has_arg = optional_argument,
.val = 'S',
},
+#ifdef WIN32
+ {
+ .name = (char *) "server-internal",
+ .has_arg = required_argument,
+ .val = 'N',
+ },
+#endif
{ .name = (char *) "daemonize",
.has_arg = required_argument,
.val = 'D',
void free_threads_shm(void)
{
- if (threads) {
- void *tp = threads;
+ int i;
+
+ for (i = 0; i < nr_segments; i++) {
+ struct thread_segment *seg = &segments[i];
+
+ if (seg->threads) {
+ void *tp = seg->threads;
#ifndef CONFIG_NO_SHM
- struct shmid_ds sbuf;
+ struct shmid_ds sbuf;
- threads = NULL;
- shmdt(tp);
- shmctl(shm_id, IPC_RMID, &sbuf);
- shm_id = -1;
+ seg->threads = NULL;
+ shmdt(tp);
+ shmctl(seg->shm_id, IPC_RMID, &sbuf);
+ seg->shm_id = -1;
#else
- threads = NULL;
- free(tp);
+ seg->threads = NULL;
+ free(tp);
#endif
+ }
}
+
+ nr_segments = 0;
+ cur_segment = 0;
}
static void free_shm(void)
{
- if (threads) {
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (nr_segments) {
flow_exit();
fio_debug_jobp = NULL;
fio_warned = NULL;
fio_filelock_exit();
file_hash_exit();
scleanup();
+#endif
}
-/*
- * The thread area is shared between the main process and the job
- * threads/processes. So setup a shared memory segment that will hold
- * all the job info. We use the end of the region for keeping track of
- * open files across jobs, for file sharing.
- */
-static int setup_thread_area(void)
+static int add_thread_segment(void)
{
+ struct thread_segment *seg = &segments[nr_segments];
+ size_t size = JOBS_PER_SEG * sizeof(struct thread_data);
int i;
- if (threads)
- return 0;
-
- /*
- * 1024 is too much on some machines, scale max_jobs if
- * we get a failure that looks like too large a shm segment
- */
- do {
- size_t size = max_jobs * sizeof(struct thread_data);
+ if (nr_segments + 1 >= REAL_MAX_SEG) {
+ log_err("error: maximum number of jobs reached.\n");
+ return -1;
+ }
- size += 2 * sizeof(unsigned int);
+ size += 2 * sizeof(unsigned int);
#ifndef CONFIG_NO_SHM
- shm_id = shmget(0, size, IPC_CREAT | 0600);
- if (shm_id != -1)
- break;
- if (errno != EINVAL && errno != ENOMEM && errno != ENOSPC) {
+ seg->shm_id = shmget(0, size, IPC_CREAT | 0600);
+ if (seg->shm_id == -1) {
+ if (errno != EINVAL && errno != ENOMEM && errno != ENOSPC)
perror("shmget");
- break;
- }
+ return -1;
+ }
#else
- threads = malloc(size);
- if (threads)
- break;
+ seg->threads = malloc(size);
+ if (!seg->threads)
+ return -1;
#endif
- max_jobs >>= 1;
- } while (max_jobs);
-
#ifndef CONFIG_NO_SHM
- if (shm_id == -1)
- return 1;
-
- threads = shmat(shm_id, NULL, 0);
- if (threads == (void *) -1) {
+ seg->threads = shmat(seg->shm_id, NULL, 0);
+ if (seg->threads == (void *) -1) {
perror("shmat");
return 1;
}
if (shm_attach_to_open_removed())
- shmctl(shm_id, IPC_RMID, NULL);
+ shmctl(seg->shm_id, IPC_RMID, NULL);
#endif
- memset(threads, 0, max_jobs * sizeof(struct thread_data));
- for (i = 0; i < max_jobs; i++)
- DRD_IGNORE_VAR(threads[i]);
- fio_debug_jobp = (unsigned int *)(threads + max_jobs);
+ nr_segments++;
+
+ memset(seg->threads, 0, JOBS_PER_SEG * sizeof(struct thread_data));
+ for (i = 0; i < JOBS_PER_SEG; i++)
+ DRD_IGNORE_VAR(seg->threads[i]);
+ seg->nr_threads = 0;
+
+ /* Not first segment, we're done */
+ if (nr_segments != 1) {
+ cur_segment++;
+ return 0;
+ }
+
+ fio_debug_jobp = (unsigned int *)(seg->threads + JOBS_PER_SEG);
*fio_debug_jobp = -1;
fio_warned = fio_debug_jobp + 1;
*fio_warned = 0;
flow_init();
-
return 0;
}
+/*
+ * The thread areas are shared between the main process and the job
+ * threads/processes, and are split into chunks of JOBS_PER_SEG entries.
+ * When no chunk exists yet, or the current one has no free slot left,
+ * add a new chunk; otherwise there is nothing to do.
+ */
+static int expand_thread_area(void)
+{
+	if (!nr_segments || segments[cur_segment].nr_threads >= JOBS_PER_SEG)
+		return add_thread_segment();
+
+	return 0;
+}
+
+
static void dump_print_option(struct print_option *p)
{
const char *delim;
}
}
-static void fio_dump_options_free(struct thread_data *td)
-{
- while (!flist_empty(&td->opt_list)) {
- struct print_option *p;
-
- p = flist_first_entry(&td->opt_list, struct print_option, list);
- flist_del_init(&p->list);
- free(p->name);
- free(p->value);
- free(p);
- }
-}
-
static void copy_opt_list(struct thread_data *dst, struct thread_data *src)
{
struct flist_head *entry;
static struct thread_data *get_new_job(bool global, struct thread_data *parent,
bool preserve_eo, const char *jobname)
{
+ struct thread_segment *seg;
struct thread_data *td;
if (global)
return &def_thread;
- if (setup_thread_area()) {
+ if (expand_thread_area()) {
log_err("error: failed to setup shm segment\n");
return NULL;
}
- if (thread_number >= max_jobs) {
- log_err("error: maximum number of jobs (%d) reached.\n",
- max_jobs);
- return NULL;
- }
- td = &threads[thread_number++];
+ seg = &segments[cur_segment];
+ td = &seg->threads[seg->nr_threads++];
+ thread_number++;
*td = *parent;
INIT_FLIST_HEAD(&td->opt_list);
if (td->o.name)
free(td->o.name);
- memset(&threads[td->thread_number - 1], 0, sizeof(*td));
+ memset(td, 0, sizeof(*td));
+ segments[cur_segment].nr_threads--;
thread_number--;
}
ret |= 1;
}
+ if (td_trimwrite(td) && o->num_range > 1) {
+ log_err("fio: trimwrite cannot be used with multiple"
+ " ranges.\n");
+ ret |= 1;
+ }
+
+ if (td_trim(td) && o->num_range > 1 &&
+ !td_ioengine_flagged(td, FIO_MULTI_RANGE_TRIM)) {
+ log_err("fio: can't use multiple ranges with IO engine %s\n",
+ td->io_ops->name);
+ ret |= 1;
+ }
+
#ifndef CONFIG_PSHARED
if (!o->use_thread) {
log_info("fio: this platform does not support process shared"
ret |= 1;
}
+ if (o->zone_mode == ZONE_MODE_ZBD && !o->create_serialize) {
+ log_err("fio: --zonemode=zbd and --create_serialize=0 are not compatible.\n");
+ ret |= 1;
+ }
+
if (o->zone_mode == ZONE_MODE_STRIDED && !o->zone_size) {
log_err("fio: --zonesize must be specified when using --zonemode=strided.\n");
ret |= 1;
ret |= 1;
}
- /*
- * O_ATOMIC implies O_DIRECT
- */
- if (o->oatomic)
- o->odirect = 1;
-
/*
* If randseed is set, that overrides randrepeat
*/
if (o->disable_slat)
o->slat_percentiles = 0;
+ /* Do this only for the parent job */
+ if (!td->subjob_number) {
+ /*
+ * Fix these up to be nsec internally
+ */
+ for_each_rw_ddir(ddir)
+ o->max_latency[ddir] *= 1000ULL;
+
+ o->latency_target *= 1000ULL;
+ }
+
/*
- * Fix these up to be nsec internally
+ * Dedupe working set verifications
*/
- o->max_latency *= 1000ULL;
- o->latency_target *= 1000ULL;
+ if (o->dedupe_percentage && o->dedupe_mode == DEDUPE_MODE_WORKING_SET) {
+ if (!fio_option_is_set(o, size)) {
+ log_err("fio: pregenerated dedupe working set "
+ "requires size to be set\n");
+ ret |= 1;
+ } else if (o->nr_files != 1) {
+ log_err("fio: dedupe working set mode supported with "
+ "single file per job, but %d files "
+ "provided\n", o->nr_files);
+ ret |= 1;
+ } else if (o->dedupe_working_set_percentage + o->dedupe_percentage > 100) {
+ log_err("fio: impossible to reach expected dedupe percentage %u "
+ "since %u percentage of size is reserved to dedupe working set "
+ "(those are unique pages)\n",
+ o->dedupe_percentage, o->dedupe_working_set_percentage);
+ ret |= 1;
+ }
+ }
+
+ for_each_td(td2) {
+ if (td->o.ss_check_interval != td2->o.ss_check_interval) {
+ log_err("fio: conflicting ss_check_interval: %llu and %llu, must be globally equal\n",
+ td->o.ss_check_interval, td2->o.ss_check_interval);
+ ret |= 1;
+ }
+ } end_for_each();
+ if (td->o.ss_dur && td->o.ss_check_interval / 1000L < 1000) {
+ 	log_err("fio: ss_check_interval must be at least 1s\n");
+ 	ret |= 1;
+ }
+ /*
+  * A zero interval has already been rejected by the check above; skip
+  * the multiple-of test in that case, since the modulo would otherwise
+  * divide by zero.
+  */
+ if (td->o.ss_dur && td->o.ss_check_interval &&
+     (td->o.ss_dur % td->o.ss_check_interval != 0 ||
+      td->o.ss_dur <= td->o.ss_check_interval)) {
+ 	log_err("fio: ss_duration %lluus must be multiple of ss_check_interval %lluus\n",
+ 		td->o.ss_dur, td->o.ss_check_interval);
+ 	ret |= 1;
+ }
+
+ if (td->o.fdp) {
+ if (fio_option_is_set(&td->o, dp_type) &&
+ (td->o.dp_type == FIO_DP_STREAMS || td->o.dp_type == FIO_DP_NONE)) {
+ log_err("fio: fdp=1 is not compatible with dataplacement={streams, none}\n");
+ ret |= 1;
+ } else {
+ td->o.dp_type = FIO_DP_FDP;
+ }
+ }
return ret;
}
const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF];
if (td->o.file_service_type == FIO_FSERVICE_ZIPF) {
- zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed);
+ zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, td->random_center, seed);
zipf_disable_hash(&td->next_file_zipf);
} else if (td->o.file_service_type == FIO_FSERVICE_PARETO) {
- pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed);
+ pareto_init(&td->next_file_zipf, nranges, td->pareto_h, td->random_center, seed);
zipf_disable_hash(&td->next_file_zipf);
} else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) {
- gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed);
+ gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, td->random_center, seed);
gauss_disable_hash(&td->next_file_gauss);
}
}
-void td_fill_verify_state_seed(struct thread_data *td)
+void td_fill_rand_seeds(struct thread_data *td)
{
+ uint64_t read_seed = td->rand_seeds[FIO_RAND_BS_OFF];
+ uint64_t write_seed = td->rand_seeds[FIO_RAND_BS1_OFF];
+ uint64_t trim_seed = td->rand_seeds[FIO_RAND_BS2_OFF];
+ int i;
bool use64;
if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
else
use64 = false;
- init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF],
- use64);
-}
-
-static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
-{
- uint64_t read_seed = td->rand_seeds[FIO_RAND_BS_OFF];
- uint64_t write_seed = td->rand_seeds[FIO_RAND_BS1_OFF];
- uint64_t trim_seed = td->rand_seeds[FIO_RAND_BS2_OFF];
- int i;
-
/*
* trimwrite is special in that we need to generate the same
* offsets to get the "write after trim" effect. If we are
init_rand_seed(&td->bsrange_state[DDIR_WRITE], write_seed, use64);
init_rand_seed(&td->bsrange_state[DDIR_TRIM], trim_seed, use64);
- td_fill_verify_state_seed(td);
+ init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF],
+ use64);
init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false);
if (td->o.file_service_type == FIO_FSERVICE_RANDOM)
init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
init_rand_seed(&td->prio_state, td->rand_seeds[FIO_RAND_PRIO_CMDS], false);
-
- if (!td_random(td))
- return;
-
- if (td->o.rand_repeatable)
- td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number;
+ init_rand_seed(&td->dedupe_working_set_index_state, td->rand_seeds[FIO_RAND_DEDUPE_WORKING_SET_IX], use64);
init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64);
init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false);
}
+
+ init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64);
+ frand_copy(&td->buf_state_prev, &td->buf_state);
+
+ init_rand_seed(&td->fdp_state, td->rand_seeds[FIO_RAND_FDP_OFF], use64);
}
-void td_fill_rand_seeds(struct thread_data *td)
+static int setup_random_seeds(struct thread_data *td)
{
- bool use64;
-
- if (td->o.allrand_repeatable) {
- unsigned int i;
+ uint64_t seed;
+ unsigned int i;
- for (i = 0; i < FIO_RAND_NR_OFFS; i++)
- td->rand_seeds[i] = FIO_RANDSEED * td->thread_number
- + i;
+ if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) {
+ int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds));
+ dprint(FD_RANDOM, "using system RNG for random seeds\n");
+ if (ret)
+ return ret;
+ } else {
+ seed = td->o.rand_seed;
+ for (i = 0; i < 4; i++)
+ seed *= 0x9e370001UL;
+
+ for (i = 0; i < FIO_RAND_NR_OFFS; i++) {
+ td->rand_seeds[i] = seed * td->thread_number + i;
+ seed *= 0x9e370001UL;
+ }
}
- if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
- use64 = true;
- else
- use64 = false;
+ td_fill_rand_seeds(td);
- td_fill_rand_seeds_internal(td, use64);
+ dprint(FD_RANDOM, "FIO_RAND_NR_OFFS=%d\n", FIO_RAND_NR_OFFS);
+ for (int i = 0; i < FIO_RAND_NR_OFFS; i++)
+ dprint(FD_RANDOM, "rand_seeds[%d]=%" PRIu64 "\n", i, td->rand_seeds[i]);
- init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64);
- frand_copy(&td->buf_state_prev, &td->buf_state);
+ return 0;
}
/*
* for this name and see if they match. If they do, then
* the engine is unchanged.
*/
- dlhandle = td->io_ops_dlhandle;
+ dlhandle = td->io_ops->dlhandle;
ops = load_ioengine(td);
if (!ops)
goto fail;
- if (ops == td->io_ops && dlhandle == td->io_ops_dlhandle) {
- if (dlhandle)
- dlclose(dlhandle);
+ if (ops == td->io_ops && dlhandle == td->io_ops->dlhandle)
return 0;
- }
- if (dlhandle && dlhandle != td->io_ops_dlhandle)
+ if (dlhandle && dlhandle != td->io_ops->dlhandle)
dlclose(dlhandle);
/* Unload the old engine. */
}
}
-static int setup_random_seeds(struct thread_data *td)
-{
- uint64_t seed;
- unsigned int i;
-
- if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed)) {
- int ret = init_random_seeds(td->rand_seeds, sizeof(td->rand_seeds));
- if (!ret)
- td_fill_rand_seeds(td);
- return ret;
- }
-
- seed = td->o.rand_seed;
- for (i = 0; i < 4; i++)
- seed *= 0x9e370001UL;
-
- for (i = 0; i < FIO_RAND_NR_OFFS; i++) {
- td->rand_seeds[i] = seed * td->thread_number + i;
- seed *= 0x9e370001UL;
- }
-
- td_fill_rand_seeds(td);
- return 0;
-}
-
enum {
FPRE_NONE = 0,
FPRE_JOBNAME,
FPRE_JOBNUM,
- FPRE_FILENUM
+ FPRE_FILENUM,
+ FPRE_CLIENTUID
};
static struct fpre_keyword {
{ .keyword = "$jobname", .key = FPRE_JOBNAME, },
{ .keyword = "$jobnum", .key = FPRE_JOBNUM, },
{ .keyword = "$filenum", .key = FPRE_FILENUM, },
+ { .keyword = "$clientuid", .key = FPRE_CLIENTUID, },
{ .keyword = NULL, },
};
}
break;
}
+ case FPRE_CLIENTUID: {
+ int ret;
+ ret = snprintf(dst, dst_left, "%s", client_sockaddr_str);
+ if (ret < 0)
+ break;
+ else if (ret > dst_left) {
+ log_err("fio: truncated filename\n");
+ dst += dst_left;
+ dst_left = 0;
+ } else {
+ dst += ret;
+ dst_left -= ret;
+ }
+ break;
+ }
default:
assert(0);
break;
static int check_waitees(char *waitee)
{
- struct thread_data *td;
- int i, ret = 0;
+ int ret = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->subjob_number)
continue;
ret += !strcmp(td->o.name, waitee);
- }
+ } end_for_each();
return ret;
}
return true;
}
+/*
+ * Check that the lat_percentiles setting of 'td' agrees with every job that
+ * shares its groupid. The comparison is only made when td->o.stats is set.
+ *
+ * Returns 1 (after logging an error that names 'jobname') on the first
+ * mismatch found, 0 if no mismatch was seen.
+ */
+static int verify_per_group_options(struct thread_data *td, const char *jobname)
+{
+	for_each_td(td2) {
+		if (td->groupid == td2->groupid && td->o.stats &&
+		    td->o.lat_percentiles != td2->o.lat_percentiles) {
+			log_err("fio: lat_percentiles in job: %s differs from group\n",
+				jobname);
+			return 1;
+		}
+	} end_for_each();
+
+	return 0;
+}
+
/*
* Treat an empty log file name the same as a one not given
*/
if (fixup_options(td))
goto err;
+ if (!td->o.dedupe_global && init_dedupe_working_set_seeds(td, 0))
+ goto err;
+
/*
* Belongs to fixup_options, but o->name is not necessarily set as yet
*/
memcpy(td->ts.percentile_list, o->percentile_list, sizeof(o->percentile_list));
td->ts.sig_figs = o->sig_figs;
- for (i = 0; i < DDIR_RWDIR_CNT; i++) {
- td->ts.clat_stat[i].min_val = ULONG_MAX;
- td->ts.slat_stat[i].min_val = ULONG_MAX;
- td->ts.lat_stat[i].min_val = ULONG_MAX;
- td->ts.bw_stat[i].min_val = ULONG_MAX;
- td->ts.iops_stat[i].min_val = ULONG_MAX;
- td->ts.clat_high_prio_stat[i].min_val = ULONG_MAX;
- td->ts.clat_low_prio_stat[i].min_val = ULONG_MAX;
- }
- td->ts.sync_stat.min_val = ULONG_MAX;
- td->ddir_seq_nr = o->ddir_seq_nr;
+ init_thread_stat_min_vals(&td->ts);
+
+ /*
+ * td->ddir_seq_nr needs to be initialized to 1, NOT o->ddir_seq_nr,
+ * so that get_next_offset gets a new random offset the first time it
+ * is called, instead of keeping an initial offset of 0 for the first
+ * nr-1 calls
+ */
+ td->ddir_seq_nr = 1;
if ((o->stonewall || o->new_group) && prev_group_jobs) {
prev_group_jobs = 0;
td->groupid = groupid;
prev_group_jobs++;
+ if (td->o.group_reporting && prev_group_jobs > 1 &&
+ verify_per_group_options(td, jobname))
+ goto err;
+
if (setup_rate(td))
goto err;
.hist_coarseness = o->log_hist_coarseness,
.log_type = IO_LOG_TYPE_LAT,
.log_offset = o->log_offset,
+ .log_prio = o->log_prio,
.log_gz = o->log_gz,
.log_gz_store = o->log_gz_store,
};
else
suf = "log";
- gen_log_name(logname, sizeof(logname), "lat", pre,
- td->thread_number, suf, o->per_job_logs);
- setup_log(&td->lat_log, &p, logname);
+ if (!o->disable_lat) {
+ gen_log_name(logname, sizeof(logname), "lat", pre,
+ td->thread_number, suf, o->per_job_logs);
+ setup_log(&td->lat_log, &p, logname);
+ }
- gen_log_name(logname, sizeof(logname), "slat", pre,
- td->thread_number, suf, o->per_job_logs);
- setup_log(&td->slat_log, &p, logname);
+ if (!o->disable_slat) {
+ gen_log_name(logname, sizeof(logname), "slat", pre,
+ td->thread_number, suf, o->per_job_logs);
+ setup_log(&td->slat_log, &p, logname);
+ }
- gen_log_name(logname, sizeof(logname), "clat", pre,
- td->thread_number, suf, o->per_job_logs);
- setup_log(&td->clat_log, &p, logname);
+ if (!o->disable_clat) {
+ gen_log_name(logname, sizeof(logname), "clat", pre,
+ td->thread_number, suf, o->per_job_logs);
+ setup_log(&td->clat_log, &p, logname);
+ }
}
.hist_coarseness = o->log_hist_coarseness,
.log_type = IO_LOG_TYPE_HIST,
.log_offset = o->log_offset,
+ .log_prio = o->log_prio,
.log_gz = o->log_gz,
.log_gz_store = o->log_gz_store,
};
.hist_coarseness = o->log_hist_coarseness,
.log_type = IO_LOG_TYPE_BW,
.log_offset = o->log_offset,
+ .log_prio = o->log_prio,
.log_gz = o->log_gz,
.log_gz_store = o->log_gz_store,
};
.hist_coarseness = o->log_hist_coarseness,
.log_type = IO_LOG_TYPE_IOPS,
.log_offset = o->log_offset,
+ .log_prio = o->log_prio,
.log_gz = o->log_gz,
.log_gz_store = o->log_gz_store,
};
* it's really 256 + small bit, 280 should suffice
*/
if (!nested) {
- name = malloc(280);
- memset(name, 0, 280);
+ name = calloc(1, 280);
}
opts = NULL;
i++;
}
+ free(job_sections);
+ job_sections = NULL;
+ nr_job_sections = 0;
+
free(opts);
out:
free(string);
printf(" --minimal\t\tMinimal (terse) output\n");
printf(" --output-format=type\tOutput format (terse,json,json+,normal)\n");
printf(" --terse-version=type\tSet terse version output format"
- " (default 3, or 2 or 4)\n");
+ " (default 3, or 2 or 4 or 5)\n");
printf(" --version\t\tPrint version info and exit\n");
printf(" --help\t\tPrint this page\n");
printf(" --cpuclock-test\tPerform test/validation of CPU clock\n");
break;
ret = fio_cmd_ioengine_option_parse(td, opt, val);
+
+ if (ret) {
+ if (td) {
+ put_job(td);
+ td = NULL;
+ }
+ do_exit++;
+ exit_val = 1;
+ }
break;
}
case 'w':
warnings_fatal = 1;
break;
case 'j':
- max_jobs = atoi(optarg);
- if (!max_jobs || max_jobs > REAL_MAX_JOBS) {
- log_err("fio: invalid max jobs: %d\n", max_jobs);
- do_exit++;
- exit_val = 1;
- }
+ /* we don't track/need this anymore, ignore it */
break;
case 'S':
did_arg = true;
exit_val = 1;
#endif
break;
+#ifdef WIN32
+ case 'N':
+ did_arg = true;
+ fio_server_internal_set(optarg);
+ break;
+#endif
case 'D':
if (pid_file)
free(pid_file);
log_err("%s: unrecognized option '%s'\n", argv[0],
argv[optind - 1]);
show_closest_option(argv[optind - 1]);
- fallthrough;
+ fio_fallthrough;
default:
do_exit++;
exit_val = 1;
DDIR_WAIT,
DDIR_LAST,
DDIR_INVAL = -1,
+ DDIR_TIMEOUT = -2,
DDIR_RWDIR_CNT = 3,
DDIR_RWDIR_SYNC_CNT = 4,
"datasync", "sync_file_range",
"wait", };
- if (ddir < DDIR_LAST)
+ if (ddir >= 0 && ddir < DDIR_LAST)
return name[ddir];
return "invalid";
TD_DDIR_RANDRW = TD_DDIR_RW | TD_DDIR_RAND,
TD_DDIR_RANDTRIM = TD_DDIR_TRIM | TD_DDIR_RAND,
TD_DDIR_TRIMWRITE = TD_DDIR_TRIM | TD_DDIR_WRITE,
+ TD_DDIR_RANDTRIMWRITE = TD_DDIR_RANDTRIM | TD_DDIR_WRITE,
};
#define td_read(td) ((td)->o.td_ddir & TD_DDIR_READ)
#define file_randommap(td, f) (!(td)->o.norandommap && fio_file_axmap((f)))
#define td_trimwrite(td) (((td)->o.td_ddir & TD_DDIR_TRIMWRITE) \
== TD_DDIR_TRIMWRITE)
+#define td_randtrimwrite(td) (((td)->o.td_ddir & TD_DDIR_RANDTRIMWRITE) \
+ == TD_DDIR_RANDTRIMWRITE)
static inline int ddir_sync(enum fio_ddir ddir)
{
{
static const char *__str[] = { NULL, "read", "write", "rw", "rand",
"randread", "randwrite", "randrw",
- "trim", NULL, "trimwrite", NULL, "randtrim" };
+ "trim", NULL, "trimwrite", NULL, "randtrim",
+ NULL, "randtrimwrite" };
return __str[ddir];
}
* and invalidate the cache, if we need to.
*/
if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
- o->time_based) {
+ o->time_based && o->nr_files == 1) {
f->last_pos[ddir] = f->file_offset;
loop_cache_invalidate(td, f);
}
+ /*
+ * If we reach the end for a rw-io-size based run, reset us back to 0
+ * and invalidate the cache, if we need to.
+ */
+ if (td_rw(td) && o->io_size > o->size) {
+ if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f)) {
+ f->last_pos[ddir] = f->file_offset;
+ loop_cache_invalidate(td, f);
+ }
+ }
+
if (f->last_pos[ddir] < f->real_file_size) {
uint64_t pos;
b = offset = -1ULL;
- if (rw_seq) {
+ if (td_randtrimwrite(td) && ddir == DDIR_WRITE) {
+ /* don't mark randommap for these writes */
+ io_u_set(td, io_u, IO_U_F_BUSY_OK);
+ offset = f->last_start[DDIR_TRIM];
+ *is_random = true;
+ ret = 0;
+ } else if (rw_seq) {
if (td_random(td)) {
if (should_do_random(td, ddir)) {
ret = get_next_rand_block(td, f, ddir, &b);
return 1;
}
+ /*
+ * For randtrimwrite, we decide whether to issue a trim or a write
+ * based on whether the offsets for the most recent trim and write
+ * operations match. If they don't match that means we just issued a
+ * new trim and the next operation should be a write. If they *do*
+ * match that means we just completed a trim+write pair and the next
+ * command should be a trim.
+ *
+ * This works fine for sequential workloads but for random workloads
+ * it's possible to complete a trim+write pair and then have the next
+ * randomly generated offset match the previous offset. If that happens
+ * we need to alter the offset for the last write operation in order
+ * to ensure that we issue a write operation the next time through.
+ */
+ if (td_randtrimwrite(td) && ddir == DDIR_TRIM &&
+ f->last_start[DDIR_TRIM] == io_u->offset)
+ f->last_start[DDIR_WRITE]--;
+
io_u->verify_offset = io_u->offset;
return 0;
}
assert(ddir_rw(ddir));
+ if (td_randtrimwrite(td) && ddir == DDIR_WRITE) {
+ struct fio_file *f = io_u->file;
+
+ return f->last_pos[DDIR_TRIM] - f->last_start[DDIR_TRIM];
+ }
+
if (td->o.bs_is_seq_rand)
ddir = is_random ? DDIR_WRITE : DDIR_READ;
* check if the usec is capable of taking negative values
*/
if (now > td->o.timeout) {
- ddir = DDIR_INVAL;
+ ddir = DDIR_TIMEOUT;
return ddir;
}
usec = td->o.timeout - now;
now = utime_since_now(&td->epoch);
if ((td->o.timeout && (now > td->o.timeout)) || td->terminate)
- ddir = DDIR_INVAL;
+ ddir = DDIR_TIMEOUT;
return ddir;
}
else
ddir = DDIR_INVAL;
- td->rwmix_ddir = rate_ddir(td, ddir);
+ if (!should_check_rate(td)) {
+ /*
+ * avoid time-consuming call to utime_since_now() if rate checking
+ * isn't being used. This improves IOPS by 50%. See:
+ * https://github.com/axboe/fio/issues/1501#issuecomment-1418327049
+ */
+ td->rwmix_ddir = ddir;
+ } else
+ td->rwmix_ddir = rate_ddir(td, ddir);
return td->rwmix_ddir;
}
if (td_trimwrite(td)) {
struct fio_file *f = io_u->file;
- if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM])
+ if (f->last_start[DDIR_WRITE] == f->last_start[DDIR_TRIM])
ddir = DDIR_TRIM;
else
ddir = DDIR_WRITE;
fio_file_reset(td, f);
}
+/*
+ * Fill an io_u for a trim that carries several ranges.
+ *
+ * Up to td->o.num_range offset/length pairs are generated with
+ * get_next_offset() and get_next_buflen() and stored as consecutive
+ * struct trim_range entries in io_u->buf. Generation stops at the first
+ * failure, or at the first range that would extend past real_file_size.
+ *
+ * Returns 0 if the accumulated length is non-zero; io_u->buflen then holds
+ * the summed length of all ranges and io_u->number_trim the number of ranges
+ * stored. Returns 1 otherwise.
+ */
+static int fill_multi_range_io_u(struct thread_data *td, struct io_u *io_u)
+{
+	struct trim_range *slot = (struct trim_range *) io_u->buf;
+	struct fio_file *f = io_u->file;
+	uint64_t total_len = 0;
+	uint64_t nr_ranges;
+	bool is_random;
+
+	for (nr_ranges = 0; nr_ranges < td->o.num_range; nr_ranges++, slot++) {
+		if (get_next_offset(td, io_u, &is_random)) {
+			dprint(FD_IO, "io_u %p, failed getting offset\n",
+				io_u);
+			break;
+		}
+
+		io_u->buflen = get_next_buflen(td, io_u, is_random);
+		if (!io_u->buflen) {
+			dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
+			break;
+		}
+
+		if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
+			dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n",
+				io_u,
+				(unsigned long long) io_u->offset, io_u->buflen,
+				(unsigned long long) io_u->file->real_file_size);
+			break;
+		}
+
+		/* record this range and account for it in the running total */
+		slot->start = io_u->offset;
+		slot->len = io_u->buflen;
+		total_len += io_u->buflen;
+
+		/* remember where this direction last started and ended */
+		f->last_start[io_u->ddir] = io_u->offset;
+		f->last_pos[io_u->ddir] = io_u->offset + slot->len;
+
+		if (td_random(td) && file_randommap(td, io_u->file))
+			mark_random_map(td, io_u, io_u->offset, io_u->buflen);
+		dprint_io_u(io_u, "fill");
+	}
+
+	if (!total_len)
+		return 1;
+
+	/*
+	 * Set buffer length as overall trim length for this IO, and
+	 * tell the ioengine about the number of ranges to be trimmed.
+	 */
+	io_u->buflen = total_len;
+	io_u->number_trim = nr_ranges;
+	return 0;
+}
+
static int fill_io_u(struct thread_data *td, struct io_u *io_u)
{
bool is_random;
set_rw_ddir(td, io_u);
- if (io_u->ddir == DDIR_INVAL) {
+ if (io_u->ddir == DDIR_INVAL || io_u->ddir == DDIR_TIMEOUT) {
dprint(FD_IO, "invalid direction received ddir = %d", io_u->ddir);
return 1;
}
else if (td->o.zone_mode == ZONE_MODE_ZBD)
setup_zbd_zone_mode(td, io_u);
- /*
- * No log, let the seq/rand engine retrieve the next buflen and
- * position.
- */
- if (get_next_offset(td, io_u, &is_random)) {
- dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
- return 1;
- }
+ if (multi_range_trim(td, io_u)) {
+ if (fill_multi_range_io_u(td, io_u))
+ return 1;
+ } else {
+ /*
+ * No log, let the seq/rand engine retrieve the next buflen and
+ * position.
+ */
+ if (get_next_offset(td, io_u, &is_random)) {
+ dprint(FD_IO, "io_u %p, failed getting offset\n", io_u);
+ return 1;
+ }
- io_u->buflen = get_next_buflen(td, io_u, is_random);
- if (!io_u->buflen) {
- dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
- return 1;
+ io_u->buflen = get_next_buflen(td, io_u, is_random);
+ if (!io_u->buflen) {
+ dprint(FD_IO, "io_u %p, failed getting buflen\n", io_u);
+ return 1;
+ }
}
-
offset = io_u->offset;
+
if (td->o.zone_mode == ZONE_MODE_ZBD) {
ret = zbd_adjust_block(td, io_u);
- if (ret == io_u_eof)
+ if (ret == io_u_eof) {
+ dprint(FD_IO, "zbd_adjust_block() returned io_u_eof\n");
return 1;
+ }
}
+ if (td->o.dp_type != FIO_DP_NONE)
+ dp_fill_dspec_data(td, io_u);
+
if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
dprint(FD_IO, "io_u %p, off=0x%llx + len=0x%llx exceeds file size=0x%llx\n",
io_u,
/*
* mark entry before potentially trimming io_u
*/
- if (td_random(td) && file_randommap(td, io_u->file))
+ if (!multi_range_trim(td, io_u) && td_random(td) && file_randommap(td, io_u->file))
io_u->buflen = mark_random_map(td, io_u, offset, io_u->buflen);
out:
- dprint_io_u(io_u, "fill");
+ if (!multi_range_trim(td, io_u))
+ dprint_io_u(io_u, "fill");
io_u->verify_offset = io_u->offset;
td->zone_bytes += io_u->buflen;
return 0;
break;
case 1 ... 4:
idx = 1;
- fallthrough;
+ fio_fallthrough;
case 0:
break;
}
break;
case 2 ... 3:
idx = 1;
- fallthrough;
+ fio_fallthrough;
case 1:
break;
}
break;
case 2 ... 3:
idx = 1;
- fallthrough;
+ fio_fallthrough;
case 0 ... 1:
break;
}
break;
case 2 ... 3:
idx = 1;
- fallthrough;
+ fio_fallthrough;
case 0 ... 1:
break;
}
break;
case 2 ... 3:
idx = 1;
- fallthrough;
+ fio_fallthrough;
case 0 ... 1:
break;
}
if (f && fio_file_open(f) && !fio_file_closing(f)) {
if (td->o.file_service_type == FIO_FSERVICE_SEQ)
goto out;
- if (td->file_service_left--)
+ if (td->file_service_left) {
+ td->file_service_left--;
goto out;
+ }
}
if (td->o.file_service_type == FIO_FSERVICE_RR ||
put_file_log(td, f);
td_io_close_file(td, f);
io_u->file = NULL;
+
+ if (io_u->ddir == DDIR_TIMEOUT)
+ return 1;
+
if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)
fio_file_reset(td, f);
else {
return 0;
}
-static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
+static void lat_fatal(struct thread_data *td, struct io_u *io_u, struct io_completion_data *icd,
unsigned long long tnsec, unsigned long long max_nsec)
{
- if (!td->error)
- log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec)\n", tnsec, max_nsec);
+ if (!td->error) {
+ log_err("fio: latency of %llu nsec exceeds specified max (%llu nsec): %s %s %llu %llu\n",
+ tnsec, max_nsec,
+ io_u->file->file_name,
+ io_ddir_name(io_u->ddir),
+ io_u->offset, io_u->buflen);
+ }
td_verror(td, ETIMEDOUT, "max latency exceeded");
icd->error = ETIMEDOUT;
}
{
const bool needs_lock = td_async_processing(td);
struct io_u *io_u = NULL;
- int ret;
if (td->stop_io)
return NULL;
assert(io_u->flags & IO_U_F_FREE);
io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
IO_U_F_TRIMMED | IO_U_F_BARRIER |
- IO_U_F_VER_LIST | IO_U_F_PRIORITY);
+ IO_U_F_VER_LIST);
io_u->error = 0;
io_u->acct_ddir = -1;
io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH);
io_u->ipo = NULL;
} else if (td_async_processing(td)) {
+ int ret;
/*
* We ran out, wait for async verify threads to finish and
* return one
*/
assert(!(td->flags & TD_F_CHILD));
ret = pthread_cond_wait(&td->free_cond, &td->io_u_lock);
- assert(ret == 0);
- if (!td->error)
+ if (fio_unlikely(ret != 0)) {
+ td->error = errno;
+ } else if (!td->error)
goto again;
}
assert(fio_file_open(f));
- if (ddir_rw(io_u->ddir)) {
+ if (ddir_rw(io_u->ddir) && !multi_range_trim(td, io_u)) {
if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) {
dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
goto err_put;
io_u->buflen);
} else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) &&
!(td->flags & TD_F_COMPRESS) &&
- !(td->flags & TD_F_DO_VERIFY))
+ !(td->flags & TD_F_DO_VERIFY)) {
do_scramble = 1;
+ }
} else if (io_u->ddir == DDIR_READ) {
/*
* Reset the buf_filled parameters so next time if the
io_u->xfer_buf = io_u->buf;
io_u->xfer_buflen = io_u->buflen;
+ /*
+ * Remember the issuing context priority. The IO engine may change this.
+ */
+ io_u->ioprio = td->ioprio;
+ io_u->clat_prio_index = 0;
out:
assert(io_u->file);
if (!td_io_prep(td, io_u)) {
io_ddir_name(io_u->ddir),
io_u->offset, io_u->xfer_buflen);
+ zbd_log_err(td, io_u);
+
if (td->io_ops->errdetails) {
char *err = td->io_ops->errdetails(io_u);
unsigned long long tnsec;
tnsec = ntime_since(&io_u->start_time, &icd->time);
- add_lat_sample(td, idx, tnsec, bytes, io_u->offset, io_u_is_prio(io_u));
+ add_lat_sample(td, idx, tnsec, bytes, io_u->offset,
+ io_u->ioprio, io_u->clat_prio_index);
if (td->flags & TD_F_PROFILE_OPS) {
struct prof_io_ops *ops = &td->prof_io_ops;
icd->error = ops->io_u_lat(td, tnsec);
}
- if (td->o.max_latency && tnsec > td->o.max_latency)
- lat_fatal(td, icd, tnsec, td->o.max_latency);
- if (td->o.latency_target && tnsec > td->o.latency_target) {
- if (lat_target_failed(td))
- lat_fatal(td, icd, tnsec, td->o.latency_target);
+ if (ddir_rw(idx)) {
+ if (td->o.max_latency[idx] && tnsec > td->o.max_latency[idx])
+ lat_fatal(td, io_u, icd, tnsec, td->o.max_latency[idx]);
+ if (td->o.latency_target && tnsec > td->o.latency_target) {
+ if (lat_target_failed(td))
+ lat_fatal(td, io_u, icd, tnsec, td->o.latency_target);
+ }
}
}
if (ddir_rw(idx)) {
if (!td->o.disable_clat) {
- add_clat_sample(td, idx, llnsec, bytes, io_u->offset, io_u_is_prio(io_u));
+ add_clat_sample(td, idx, llnsec, bytes, io_u->offset,
+ io_u->ioprio, io_u->clat_prio_index);
io_u_mark_latency(td, llnsec);
}
dprint_io_u(io_u, "complete");
assert(io_u->flags & IO_U_F_FLIGHT);
- io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
+ io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK | IO_U_F_PATTERN_DONE);
/*
* Mark IO ok to verify
}
if (ddir_sync(ddir)) {
- td->last_was_sync = true;
+ if (io_u->error)
+ goto error;
if (f) {
f->first_write = -1ULL;
f->last_write = -1ULL;
return;
}
- td->last_was_sync = false;
td->last_ddir = ddir;
if (!io_u->error && ddir_rw(ddir)) {
* Make sure we notice short IO from here, and requeue them
* appropriately!
*/
- if (io_u->resid) {
+ if (bytes && io_u->resid) {
io_u->xfer_buflen = io_u->resid;
io_u->xfer_buf += bytes;
io_u->offset += bytes;
icd->error = ret;
}
} else if (io_u->error) {
+error:
icd->error = io_u->error;
io_u_log_error(td, io_u);
}
}
}
+/*
+ * Fold the per-direction byte counts of a completion batch into the thread.
+ *
+ * While the thread is in TD_VERIFYING, the read bytes are added to
+ * td->bytes_verified; for a write job in that state nothing is added to
+ * td->bytes_done. In every other case each icd->bytes_done[] entry is added
+ * to the matching td->bytes_done[] entry.
+ */
+static void io_u_update_bytes_done(struct thread_data *td,
+				   struct io_completion_data *icd)
+{
+	const bool verifying = td->runstate == TD_VERIFYING;
+	int idx;
+
+	if (verifying)
+		td->bytes_verified += icd->bytes_done[DDIR_READ];
+
+	if (verifying && td_write(td))
+		return;
+
+	idx = 0;
+	while (idx < DDIR_RWDIR_CNT) {
+		td->bytes_done[idx] += icd->bytes_done[idx];
+		idx++;
+	}
+}
+
/*
* Complete a single io_u for the sync engines.
*/
int io_u_sync_complete(struct thread_data *td, struct io_u *io_u)
{
struct io_completion_data icd;
- int ddir;
init_icd(td, &icd, 1);
io_completed(td, &io_u, &icd);
return -1;
}
- for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
- td->bytes_done[ddir] += icd.bytes_done[ddir];
+ io_u_update_bytes_done(td, &icd);
return 0;
}
{
struct io_completion_data icd;
struct timespec *tvp = NULL;
- int ret, ddir;
+ int ret;
struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts);
return -1;
}
- for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
- td->bytes_done[ddir] += icd.bytes_done[ddir];
+ io_u_update_bytes_done(td, &icd);
return ret;
}
td = td->parent;
add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen,
- io_u->offset, io_u_is_prio(io_u));
+ io_u->offset, io_u->ioprio);
}
}
static struct frand_state *get_buf_state(struct thread_data *td)
{
unsigned int v;
+ unsigned long long i;
if (!td->o.dedupe_percentage)
return &td->buf_state;
v = rand_between(&td->dedupe_state, 1, 100);
if (v <= td->o.dedupe_percentage)
- return &td->buf_state_prev;
+ switch (td->o.dedupe_mode) {
+ case DEDUPE_MODE_REPEAT:
+ /*
+ * The caller advances the returned frand_state.
+ * A copy of prev should be returned instead since
+ * a subsequent intention to generate a deduped buffer
+ * might result in generating a unique one
+ */
+ frand_copy(&td->buf_state_ret, &td->buf_state_prev);
+ return &td->buf_state_ret;
+ case DEDUPE_MODE_WORKING_SET:
+ i = rand_between(&td->dedupe_working_set_index_state, 0, td->num_unique_pages - 1);
+ frand_copy(&td->buf_state_ret, &td->dedupe_working_set_states[i]);
+ return &td->buf_state_ret;
+ default:
+ log_err("unexpected dedupe mode %u\n", td->o.dedupe_mode);
+ assert(0);
+ }
return &td->buf_state;
}
if (o->compress_percentage || o->dedupe_percentage) {
unsigned int perc = td->o.compress_percentage;
- struct frand_state *rs;
+ struct frand_state *rs = NULL;
unsigned long long left = max_bs;
unsigned long long this_write;
do {
- rs = get_buf_state(td);
+ /*
+ * Buffers are either entirely dedupe-able or not.
+ * If we choose to dedup, the buffer should undergo
+ * the same manipulation as the original write. Which
+ * means we should retrace the steps we took for compression
+ * as well.
+ */
+ if (!rs)
+ rs = get_buf_state(td);
min_write = min(min_write, left);
- if (perc) {
- this_write = min_not_zero(min_write,
- (unsigned long long) td->o.compress_chunk);
+ this_write = min_not_zero(min_write,
+ (unsigned long long) td->o.compress_chunk);
- fill_random_buf_percentage(rs, buf, perc,
- this_write, this_write,
- o->buffer_pattern,
- o->buffer_pattern_bytes);
- } else {
- fill_random_buf(rs, buf, min_write);
- this_write = min_write;
- }
+ fill_random_buf_percentage(rs, buf, perc,
+ this_write, this_write,
+ o->buffer_pattern,
+ o->buffer_pattern_bytes);
buf += this_write;
left -= this_write;
int ret;
if (io_u->ddir == DDIR_SYNC) {
+#ifdef CONFIG_FCNTL_SYNC
+ ret = fcntl(io_u->file->fd, F_FULLFSYNC);
+#else
ret = fsync(io_u->file->fd);
+#endif
} else if (io_u->ddir == DDIR_DATASYNC) {
#ifdef CONFIG_FDATASYNC
ret = fdatasync(io_u->file->fd);
return ret;
}
-int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
+int do_io_u_trim(struct thread_data *td, struct io_u *io_u)
{
#ifndef FIO_HAVE_TRIM
io_u->error = EINVAL;
struct fio_file *f = io_u->file;
int ret;
+ if (td->o.zone_mode == ZONE_MODE_ZBD) {
+ ret = zbd_do_io_u_trim(td, io_u);
+ if (ret == io_u_completed)
+ return io_u->xfer_buflen;
+ if (ret)
+ goto err;
+ }
+
ret = os_trim(f, io_u->offset, io_u->xfer_buflen);
if (!ret)
return io_u->xfer_buflen;
+err:
io_u->error = ret;
return 0;
#endif
IO_U_F_TRIMMED = 1 << 5,
IO_U_F_BARRIER = 1 << 6,
IO_U_F_VER_LIST = 1 << 7,
- IO_U_F_PRIORITY = 1 << 8,
+ IO_U_F_PATTERN_DONE = 1 << 8,
};
/*
*/
unsigned short numberio;
+ /*
+ * IO priority.
+ */
+ unsigned short ioprio;
+ unsigned short clat_prio_index;
+
+ /*
+ * number of trim ranges for this IO.
+ */
+ unsigned int number_trim;
+
/*
* Allocated/set buffer and length
*/
union {
unsigned int index;
unsigned int seen;
- void *engine_data;
};
+ void *engine_data;
union {
struct flist_head verify_list;
*/
int (*end_io)(struct thread_data *, struct io_u **);
+ uint32_t dtype;
+ uint32_t dspec;
+
union {
#ifdef CONFIG_LIBAIO
struct iocb iocb;
bool queue_full(const struct thread_data *);
int do_io_u_sync(const struct thread_data *, struct io_u *);
-int do_io_u_trim(const struct thread_data *, struct io_u *);
+int do_io_u_trim(struct thread_data *, struct io_u *);
#ifdef FIO_INC_DEBUG
static inline void dprint_io_u(struct io_u *io_u, const char *p)
td_flags_clear((td), &(io_u->flags), (val))
#define io_u_set(td, io_u, val) \
td_flags_set((td), &(io_u)->flags, (val))
-#define io_u_is_prio(io_u) \
- (io_u->flags & (unsigned int) IO_U_F_PRIORITY) != 0
#endif
#include <dlfcn.h>
#include <fcntl.h>
#include <assert.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
#include "fio.h"
#include "diskutil.h"
static FLIST_HEAD(engine_list);
+/*
+ * True when the engine carries FIO_ASYNCIO_SYNC_TRIM and this io_u is a trim.
+ */
+static inline bool async_ioengine_sync_trim(struct thread_data *td,
+					    struct io_u *io_u)
+{
+	if (!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM))
+		return false;
+
+	return io_u->ddir == DDIR_TRIM;
+}
+
static bool check_engine_ops(struct thread_data *td, struct ioengine_ops *ops)
{
if (ops->version != FIO_IOOPS_VERSION) {
char engine_path[PATH_MAX];
void *dlhandle;
- sprintf(engine_path, "%s/lib%s.so", FIO_EXT_ENG_DIR, engine);
+ sprintf(engine_path, "%s/fio-%s.so", FIO_EXT_ENG_DIR, engine);
+ dprint(FD_IO, "dlopen external %s\n", engine_path);
dlhandle = dlopen(engine_path, RTLD_LAZY);
if (!dlhandle)
log_info("Engine %s not found; Either name is invalid, was not built, or fio-engine-%s package is missing.\n",
struct ioengine_ops *ops;
void *dlhandle;
- dprint(FD_IO, "dload engine %s\n", engine_lib);
+ if (!strncmp(engine_lib, "linuxaio", 8) ||
+ !strncmp(engine_lib, "aio", 3))
+ engine_lib = "libaio";
+
+ dprint(FD_IO, "dlopen engine %s\n", engine_lib);
dlerror();
dlhandle = dlopen(engine_lib, RTLD_LAZY);
return NULL;
}
- td->io_ops_dlhandle = dlhandle;
+ ops->dlhandle = dlhandle;
return ops;
}
/*
* linux libaio has alias names, so convert to what we want
*/
- if (!strncmp(engine, "linuxaio", 8)) {
+ if (!strncmp(engine, "linuxaio", 8) || !strncmp(engine, "aio", 3)) {
dprint(FD_IO, "converting ioengine name: %s -> libaio\n",
engine);
engine = "libaio";
* so as not to break job files not using the prefix.
*/
ops = __load_ioengine(td->o.ioengine);
- if (!ops)
+
+ /* We do re-dlopen existing handles, for reference counting */
+ if (!ops || ops->dlhandle)
ops = dlopen_ioengine(td, name);
/*
*/
void free_ioengine(struct thread_data *td)
{
+ assert(td != NULL && td->io_ops != NULL);
+
dprint(FD_IO, "free ioengine %s\n", td->io_ops->name);
if (td->eo && td->io_ops->options) {
td->eo = NULL;
}
- if (td->io_ops_dlhandle) {
- dlclose(td->io_ops_dlhandle);
- td->io_ops_dlhandle = NULL;
+ if (td->io_ops->dlhandle) {
+ dprint(FD_IO, "dlclose ioengine %s\n", td->io_ops->name);
+ dlclose(td->io_ops->dlhandle);
}
td->io_ops = NULL;
* flag is now set
*/
if (td_offload_overlap(td)) {
- int res = pthread_mutex_unlock(&overlap_check);
- assert(res == 0);
+ int res;
+
+ res = pthread_mutex_unlock(&overlap_check);
+ if (fio_unlikely(res != 0)) {
+ log_err("failed to unlock overlap check mutex, err: %i:%s", errno, strerror(errno));
+ abort();
+ }
}
assert(fio_file_open(io_u->file));
io_u->resid = 0;
if (td_ioengine_flagged(td, FIO_SYNCIO) ||
- (td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) &&
- io_u->ddir == DDIR_TRIM)) {
- if (fio_fill_issue_time(td))
+ async_ioengine_sync_trim(td, io_u)) {
+ if (fio_fill_issue_time(td)) {
fio_gettime(&io_u->issue_time, NULL);
- /*
- * only used for iolog
- */
- if (td->o.read_iolog_file)
- memcpy(&td->last_issue, &io_u->issue_time,
- sizeof(io_u->issue_time));
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &io_u->issue_time,
+ sizeof(io_u->issue_time));
+ }
}
if (!td->io_ops->commit) {
io_u_mark_submit(td, 1);
io_u_mark_complete(td, 1);
- zbd_put_io_u(td, io_u);
}
if (ret == FIO_Q_COMPLETED) {
io_u_mark_depth(td, 1);
td->ts.total_io_u[io_u->ddir]++;
}
+
+ td->last_was_sync = ddir_sync(io_u->ddir);
} else if (ret == FIO_Q_QUEUED) {
td->io_u_queued++;
if (td->io_u_queued >= td->o.iodepth_batch)
td_io_commit(td);
+
+ td->last_was_sync = ddir_sync(io_u->ddir);
}
if (!td_ioengine_flagged(td, FIO_SYNCIO) &&
- (!td_ioengine_flagged(td, FIO_ASYNCIO_SYNC_TRIM) ||
- io_u->ddir != DDIR_TRIM)) {
- if (fio_fill_issue_time(td))
+ !async_ioengine_sync_trim(td, io_u)) {
+ if (fio_fill_issue_time(td) &&
+ !td_ioengine_flagged(td, FIO_ASYNCIO_SETS_ISSUE_TIME)) {
fio_gettime(&io_u->issue_time, NULL);
- /*
- * only used for iolog
- */
- if (td->o.read_iolog_file)
- memcpy(&td->last_issue, &io_u->issue_time,
- sizeof(io_u->issue_time));
+ /*
+ * only used for iolog
+ */
+ if (td->o.read_iolog_file)
+ memcpy(&td->last_issue, &io_u->issue_time,
+ sizeof(io_u->issue_time));
+ }
}
return ret;
flags = POSIX_FADV_RANDOM;
else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL)
flags = POSIX_FADV_SEQUENTIAL;
+#ifdef POSIX_FADV_NOREUSE
+ else if (td->o.fadvise_hint == F_ADV_NOREUSE)
+ flags = POSIX_FADV_NOREUSE;
+#endif
else {
log_err("fio: unknown fadvise type %d\n",
td->o.fadvise_hint);
if (fio_option_is_set(&td->o, write_hint) &&
(f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
uint64_t hint = td->o.write_hint;
- int cmd;
+ int res;
/*
- * For direct IO, we just need/want to set the hint on
- * the file descriptor. For buffered IO, we need to set
- * it on the inode.
+ * For direct IO, set the hint on the file descriptor if that is
+ * supported. Otherwise set it on the inode. For buffered IO, we
+ * need to set it on the inode.
*/
- if (td->o.odirect)
- cmd = F_SET_FILE_RW_HINT;
- else
- cmd = F_SET_RW_HINT;
-
- if (fcntl(f->fd, cmd, &hint) < 0) {
+ if (td->o.odirect) {
+ res = fcntl(f->fd, F_SET_FILE_RW_HINT, &hint);
+ if (res < 0)
+ res = fcntl(f->fd, F_SET_RW_HINT, &hint);
+ } else {
+ res = fcntl(f->fd, F_SET_RW_HINT, &hint);
+ }
+ if (res < 0) {
td_verror(td, errno, "fcntl write hint");
goto err;
}
return td->io_ops->get_file_size(td, f);
}
+#ifdef CONFIG_DYNAMIC_ENGINES
+/*
+ * Load all dynamic engines in FIO_EXT_ENG_DIR for enghelp command.
+ *
+ * Every directory entry other than "." and ".." is handed to
+ * dlopen_ioengine(); its return value is ignored. If the directory cannot
+ * be opened nothing is loaded. An entry whose full path does not fit in
+ * PATH_MAX is skipped rather than written past the end of the buffer.
+ */
+static void
+fio_load_dynamic_engines(struct thread_data *td)
+{
+	DIR *dirhandle = NULL;
+	struct dirent *dirent = NULL;
+	char engine_path[PATH_MAX];
+
+	dirhandle = opendir(FIO_EXT_ENG_DIR);
+	if (!dirhandle)
+		return;
+
+	while ((dirent = readdir(dirhandle)) != NULL) {
+		int len;
+
+		if (!strcmp(dirent->d_name, ".") ||
+		    !strcmp(dirent->d_name, ".."))
+			continue;
+
+		/* bounded format: snprintf reports the untruncated length */
+		len = snprintf(engine_path, sizeof(engine_path), "%s/%s",
+			       FIO_EXT_ENG_DIR, dirent->d_name);
+		if (len < 0 || (size_t) len >= sizeof(engine_path)) {
+			dprint(FD_IO, "engine path too long, skipping %s\n",
+				dirent->d_name);
+			continue;
+		}
+
+		dlopen_ioengine(td, engine_path);
+	}
+
+	closedir(dirhandle);
+}
+#else
+#define fio_load_dynamic_engines(td) do { } while (0)
+#endif
+
int fio_show_ioengine_help(const char *engine)
{
struct flist_head *entry;
char *sep;
int ret = 1;
+ memset(&td, 0, sizeof(struct thread_data));
+
if (!engine || !*engine) {
log_info("Available IO engines:\n");
+ fio_load_dynamic_engines(&td);
flist_for_each(entry, &engine_list) {
io_ops = flist_entry(entry, struct ioengine_ops, list);
log_info("\t%s\n", io_ops->name);
sep++;
}
- memset(&td, 0, sizeof(struct thread_data));
td.o.ioengine = (char *)engine;
- io_ops = load_ioengine(&td);
+ td.io_ops = load_ioengine(&td);
- if (!io_ops) {
+ if (!td.io_ops) {
log_info("IO engine %s not found\n", engine);
return 1;
}
- if (io_ops->options)
- ret = show_cmd_help(io_ops->options, sep);
+ if (td.io_ops->options)
+ ret = show_cmd_help(td.io_ops->options, sep);
else
- log_info("IO engine %s has no options\n", io_ops->name);
+ log_info("IO engine %s has no options\n", td.io_ops->name);
free_ioengine(&td);
return ret;
#include "flist.h"
#include "io_u.h"
#include "zbd_types.h"
+#include "dataplacement.h"
-#define FIO_IOOPS_VERSION 26
+#define FIO_IOOPS_VERSION 34
#ifndef CONFIG_DYNAMIC_ENGINES
#define FIO_STATIC static
const char *name;
int version;
int flags;
+ void *dlhandle;
int (*setup)(struct thread_data *);
int (*init)(struct thread_data *);
int (*post_init)(struct thread_data *);
int (*invalidate)(struct thread_data *, struct fio_file *);
int (*unlink_file)(struct thread_data *, struct fio_file *);
int (*get_file_size)(struct thread_data *, struct fio_file *);
+ int (*prepopulate_file)(struct thread_data *, struct fio_file *);
void (*terminate)(struct thread_data *);
int (*iomem_alloc)(struct thread_data *, size_t);
void (*iomem_free)(struct thread_data *);
uint64_t, struct zbd_zone *, unsigned int);
int (*reset_wp)(struct thread_data *, struct fio_file *,
uint64_t, uint64_t);
+ int (*get_max_open_zones)(struct thread_data *, struct fio_file *,
+ unsigned int *);
+ int (*get_max_active_zones)(struct thread_data *, struct fio_file *,
+ unsigned int *);
+ int (*finish_zone)(struct thread_data *, struct fio_file *,
+ uint64_t, uint64_t);
+ int (*fdp_fetch_ruhs)(struct thread_data *, struct fio_file *,
+ struct fio_ruhs_info *);
int option_struct_size;
struct fio_option *options;
};
+/*
+ * Bit positions of the ioengine flags. Each FIO_* value in
+ * enum fio_ioengine_flags is defined as 1 << the matching __FIO_* index,
+ * so the two enums must be kept in step.
+ */
+enum {
+ __FIO_SYNCIO = 0, /* io engine has synchronous ->queue */
+ __FIO_RAWIO, /* some sort of direct/raw io */
+ __FIO_DISKLESSIO, /* no disk involved */
+ __FIO_NOEXTEND, /* engine can't extend file */
+ __FIO_NODISKUTIL, /* diskutil can't handle filename */
+ __FIO_UNIDIR, /* engine is uni-directional */
+ __FIO_NOIO, /* thread does only pseudo IO */
+ __FIO_PIPEIO, /* input/output not seekable */
+ __FIO_BARRIER, /* engine supports barriers */
+ __FIO_MEMALIGN, /* engine wants aligned memory */
+ __FIO_BIT_BASED, /* engine uses a bit base (e.g. uses Kbit as opposed to
+ KB) */
+ __FIO_FAKEIO, /* engine pretends to do IO */
+ __FIO_NOSTATS, /* don't do IO stats */
+ __FIO_NOFILEHASH, /* doesn't hash the files for lookup later. */
+ __FIO_ASYNCIO_SYNC_TRIM, /* io engine has async ->queue except for trim */
+ __FIO_NO_OFFLOAD, /* no async offload */
+ __FIO_ASYNCIO_SETS_ISSUE_TIME, /* async ioengine with commit function that sets
+ issue_time */
+ __FIO_SKIPPABLE_IOMEM_ALLOC, /* skip iomem_alloc & iomem_free if job sets mem/iomem */
+ __FIO_RO_NEEDS_RW_OPEN, /* open files in rw mode even if we have a read job; only
+ affects ioengines using generic_open_file */
+ __FIO_MULTI_RANGE_TRIM, /* ioengine supports trim with more than one range */
+ __FIO_IOENGINE_F_LAST, /* not a real bit; used to count number of bits */
+};
+
enum fio_ioengine_flags {
- FIO_SYNCIO = 1 << 0, /* io engine has synchronous ->queue */
- FIO_RAWIO = 1 << 1, /* some sort of direct/raw io */
- FIO_DISKLESSIO = 1 << 2, /* no disk involved */
- FIO_NOEXTEND = 1 << 3, /* engine can't extend file */
- FIO_NODISKUTIL = 1 << 4, /* diskutil can't handle filename */
- FIO_UNIDIR = 1 << 5, /* engine is uni-directional */
- FIO_NOIO = 1 << 6, /* thread does only pseudo IO */
- FIO_PIPEIO = 1 << 7, /* input/output no seekable */
- FIO_BARRIER = 1 << 8, /* engine supports barriers */
- FIO_MEMALIGN = 1 << 9, /* engine wants aligned memory */
- FIO_BIT_BASED = 1 << 10, /* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
- FIO_FAKEIO = 1 << 11, /* engine pretends to do IO */
- FIO_NOSTATS = 1 << 12, /* don't do IO stats */
- FIO_NOFILEHASH = 1 << 13, /* doesn't hash the files for lookup later. */
- FIO_ASYNCIO_SYNC_TRIM
- = 1 << 14, /* io engine has async ->queue except for trim */
- FIO_NO_OFFLOAD = 1 << 15, /* no async offload */
+ FIO_SYNCIO = 1 << __FIO_SYNCIO,
+ FIO_RAWIO = 1 << __FIO_RAWIO,
+ FIO_DISKLESSIO = 1 << __FIO_DISKLESSIO,
+ FIO_NOEXTEND = 1 << __FIO_NOEXTEND,
+ FIO_NODISKUTIL = 1 << __FIO_NODISKUTIL,
+ FIO_UNIDIR = 1 << __FIO_UNIDIR,
+ FIO_NOIO = 1 << __FIO_NOIO,
+ FIO_PIPEIO = 1 << __FIO_PIPEIO,
+ FIO_BARRIER = 1 << __FIO_BARRIER,
+ FIO_MEMALIGN = 1 << __FIO_MEMALIGN,
+ FIO_BIT_BASED = 1 << __FIO_BIT_BASED,
+ FIO_FAKEIO = 1 << __FIO_FAKEIO,
+ FIO_NOSTATS = 1 << __FIO_NOSTATS,
+ FIO_NOFILEHASH = 1 << __FIO_NOFILEHASH,
+ FIO_ASYNCIO_SYNC_TRIM = 1 << __FIO_ASYNCIO_SYNC_TRIM,
+ FIO_NO_OFFLOAD = 1 << __FIO_NO_OFFLOAD,
+ FIO_ASYNCIO_SETS_ISSUE_TIME = 1 << __FIO_ASYNCIO_SETS_ISSUE_TIME,
+ FIO_SKIPPABLE_IOMEM_ALLOC = 1 << __FIO_SKIPPABLE_IOMEM_ALLOC,
+ FIO_RO_NEEDS_RW_OPEN = 1 << __FIO_RO_NEEDS_RW_OPEN,
+ FIO_MULTI_RANGE_TRIM = 1 << __FIO_MULTI_RANGE_TRIM,
};
/*
static int iolog_flush(struct io_log *log);
static const char iolog_ver2[] = "fio version 2 iolog";
+static const char iolog_ver3[] = "fio version 3 iolog";
void queue_io_piece(struct thread_data *td, struct io_piece *ipo)
{
void log_io_u(const struct thread_data *td, const struct io_u *io_u)
{
+ struct timespec now;
+
if (!td->o.write_iolog_file)
return;
- fprintf(td->iolog_f, "%s %s %llu %llu\n", io_u->file->file_name,
- io_ddir_name(io_u->ddir),
- io_u->offset, io_u->buflen);
+ fio_gettime(&now, NULL);
+ fprintf(td->iolog_f, "%llu %s %s %llu %llu\n",
+ (unsigned long long) utime_since_now(&td->io_log_start_time),
+ io_u->file->file_name, io_ddir_name(io_u->ddir), io_u->offset,
+ io_u->buflen);
+
}
void log_file(struct thread_data *td, struct fio_file *f,
enum file_log_act what)
{
const char *act[] = { "add", "open", "close" };
+ struct timespec now;
assert(what < 3);
if (!td->iolog_f)
return;
- fprintf(td->iolog_f, "%s %s\n", f->file_name, act[what]);
+ fio_gettime(&now, NULL);
+ fprintf(td->iolog_f, "%llu %s %s\n",
+ (unsigned long long) utime_since_now(&td->io_log_start_time),
+ f->file_name, act[what]);
}
static void iolog_delay(struct thread_data *td, unsigned long delay)
{
uint64_t usec = utime_since_now(&td->last_issue);
unsigned long orig_delay = delay;
- uint64_t this_delay;
struct timespec ts;
+ int ret = 0;
if (delay < td->time_offset) {
td->time_offset = 0;
delay -= usec;
fio_gettime(&ts, NULL);
- while (delay && !td->terminate) {
- this_delay = delay;
- if (this_delay > 500000)
- this_delay = 500000;
- usec_sleep(td, this_delay);
- delay -= this_delay;
+ while (delay && !td->terminate) {
+ ret = io_u_queued_complete(td, 0);
+ if (ret < 0)
+ td_verror(td, -ret, "io_u_queued_complete");
+ if (td->flags & TD_F_REGROW_LOGS)
+ regrow_logs(td);
+ if (utime_since_now(&ts) > delay)
+ break;
}
usec = utime_since_now(&ts);
f = td->files[ipo->fileno];
+ if (ipo->delay)
+ iolog_delay(td, ipo->delay);
+ if (fio_fill_issue_time(td))
+ fio_gettime(&td->last_issue, NULL);
switch (ipo->file_action) {
case FIO_LOG_OPEN_FILE:
if (td->o.replay_redirect && fio_file_open(f)) {
case FIO_LOG_UNLINK_FILE:
td_io_unlink_file(td, f);
break;
+ case FIO_LOG_ADD_FILE:
+ /*
+ * Nothing to do
+ */
+ break;
default:
log_err("fio: bad file action %d\n", ipo->file_action);
break;
return 1;
}
-static bool read_iolog2(struct thread_data *td);
+static bool read_iolog(struct thread_data *td);
+
+/*
+ * Compute the replay delay for an iolog entry stamped with @time, measured
+ * against the previously seen timestamp in td->io_log_last_ttime.
+ *
+ * Returns 0 when no previous timestamp has been recorded, when the no_stall
+ * option is set, or when @time lies before the previous timestamp.
+ * Otherwise returns the elapsed time, multiplied by
+ * 100 / replay_time_scale unless replay_time_scale is exactly 100.
+ */
+unsigned long long delay_since_ttime(const struct thread_data *td,
+				     unsigned long long time)
+{
+	const unsigned long long prev = td->io_log_last_ttime;
+	double elapsed, factor;
+
+	if (prev == 0)
+		return 0;
+	if (td->o.no_stall)
+		return 0;
+	if (time < prev)
+		return 0;
+
+	/* A scale of 100 means "unscaled": skip the floating point math */
+	if (td->o.replay_time_scale == 100)
+		return time - prev;
+
+	factor = (double) 100.0 / (double) td->o.replay_time_scale;
+	elapsed = time - prev;
+	return elapsed * factor;
+}
int read_iolog_get(struct thread_data *td, struct io_u *io_u)
{
while (!flist_empty(&td->io_log_list)) {
int ret;
+
if (td->o.read_iolog_chunked) {
if (td->io_log_checkmark == td->io_log_current) {
- if (!read_iolog2(td))
- return 1;
+ if (td->io_log_blktrace) {
+ if (!read_blktrace(td))
+ return 1;
+ } else {
+ if (!read_iolog(td))
+ return 1;
+ }
}
td->io_log_current--;
}
td->iolog_buf = NULL;
}
-static int64_t iolog_items_to_fetch(struct thread_data *td)
+int64_t iolog_items_to_fetch(struct thread_data *td)
{
struct timespec now;
uint64_t elapsed;
return items_to_fetch;
}
+/*
+ * Classify a parsed iolog line by the number of fields sscanf() matched
+ * (@_r) for the log version stored in @_td->io_log_version:
+ *   io_act   - an I/O line:    5 fields for version 3, 4 for version 2
+ *   file_act - a file action:  3 fields for version 3, 2 for version 2
+ *
+ * The macros use their own @_r argument rather than silently picking up a
+ * variable named 'r' from the calling scope.
+ */
+#define io_act(_td, _r) (((_td)->io_log_version == 3 && (_r) == 5) || \
+	((_td)->io_log_version == 2 && (_r) == 4))
+#define file_act(_td, _r) (((_td)->io_log_version == 3 && (_r) == 3) || \
+	((_td)->io_log_version == 2 && (_r) == 2))
+
/*
- * Read version 2 iolog data. It is enhanced to include per-file logging,
+ * Read version 2 and 3 iolog data. It is enhanced to include per-file logging,
* syncs, etc.
*/
-static bool read_iolog2(struct thread_data *td)
+static bool read_iolog(struct thread_data *td)
{
unsigned long long offset;
unsigned int bytes;
- int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
+ unsigned long long delay = 0;
+ int reads, writes, trims, waits, fileno = 0, file_action = 0; /* stupid gcc */
char *rfname, *fname, *act;
char *str, *p;
enum fio_ddir rw;
bool realloc = false;
int64_t items_to_fetch = 0;
+ int syncs;
if (td->o.read_iolog_chunked) {
items_to_fetch = iolog_items_to_fetch(td);
rfname = fname = malloc(256+16);
act = malloc(256+16);
- reads = writes = waits = 0;
+ syncs = reads = writes = trims = waits = 0;
while ((p = fgets(str, 4096, td->io_log_rfile)) != NULL) {
struct io_piece *ipo;
int r;
+ unsigned long long ttime;
- r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset,
- &bytes);
+ if (td->io_log_version == 3) {
+ r = sscanf(p, "%llu %256s %256s %llu %u", &ttime, rfname, act,
+ &offset, &bytes);
+ delay = delay_since_ttime(td, ttime);
+ td->io_log_last_ttime = ttime;
+ /*
+ * "wait" is not allowed with version 3
+ */
+ if (!strcmp(act, "wait")) {
+ log_err("iolog: ignoring wait command with"
+ " version 3 for file %s\n", fname);
+ continue;
+ }
+ } else /* version 2 */
+ r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset, &bytes);
if (td->o.replay_redirect)
fname = td->o.replay_redirect;
- if (r == 4) {
+ if (io_act(td, r)) {
/*
* Check action first
*/
if (!strcmp(act, "wait"))
rw = DDIR_WAIT;
- else if (!strcmp(act, "read"))
+ else if (!strcmp(act, "read")) {
+ if (td->o.replay_skip & (1u << DDIR_READ))
+ continue;
rw = DDIR_READ;
- else if (!strcmp(act, "write"))
+ } else if (!strcmp(act, "write")) {
+ if (td->o.replay_skip & (1u << DDIR_WRITE))
+ continue;
rw = DDIR_WRITE;
- else if (!strcmp(act, "sync"))
+ } else if (!strcmp(act, "sync")) {
+ if (td->o.replay_skip & (1u << DDIR_SYNC))
+ continue;
rw = DDIR_SYNC;
- else if (!strcmp(act, "datasync"))
+ } else if (!strcmp(act, "datasync"))
rw = DDIR_DATASYNC;
- else if (!strcmp(act, "trim"))
+ else if (!strcmp(act, "trim")) {
+ if (td->o.replay_skip & (1u << DDIR_TRIM))
+ continue;
rw = DDIR_TRIM;
- else {
+ } else {
log_err("fio: bad iolog file action: %s\n",
act);
continue;
}
fileno = get_fileno(td, fname);
- } else if (r == 2) {
+ } else if (file_act(td, r)) {
rw = DDIR_INVAL;
if (!strcmp(act, "add")) {
if (td->o.replay_redirect &&
fileno = add_file(td, fname, td->subjob_number, 1);
file_action = FIO_LOG_ADD_FILE;
}
- continue;
} else if (!strcmp(act, "open")) {
fileno = get_fileno(td, fname);
file_action = FIO_LOG_OPEN_FILE;
continue;
}
} else {
- log_err("bad iolog2: %s\n", p);
+ log_err("bad iolog%d: %s\n", td->io_log_version, p);
continue;
}
if (read_only)
continue;
writes++;
+ } else if (rw == DDIR_TRIM) {
+ /*
+ * Don't add a trim for ro mode
+ */
+ if (read_only)
+ continue;
+ trims++;
} else if (rw == DDIR_WAIT) {
if (td->o.no_stall)
continue;
waits++;
} else if (rw == DDIR_INVAL) {
- } else if (!ddir_sync(rw)) {
+ } else if (ddir_sync(rw)) {
+ syncs++;
+ } else {
log_err("bad ddir: %d\n", rw);
continue;
}
ipo = calloc(1, sizeof(*ipo));
init_ipo(ipo);
ipo->ddir = rw;
+ if (td->io_log_version == 3)
+ ipo->delay = delay;
if (rw == DDIR_WAIT) {
ipo->delay = offset;
} else {
" read-only\n", td->o.name, writes);
writes = 0;
}
+ if (syncs)
+ td->flags |= TD_F_SYNCS;
if (td->o.read_iolog_chunked) {
if (td->io_log_current == 0) {
{
io_u_quiesce(td);
free_io_mem(td);
- init_io_u_buffers(td);
+ if (init_io_u_buffers(td))
+ return false;
}
return true;
}
- if (!reads && !writes && !waits)
+ if (!reads && !writes && !waits && !trims)
return false;
- else if (reads && !writes)
- td->o.td_ddir = TD_DDIR_READ;
- else if (!reads && writes)
- td->o.td_ddir = TD_DDIR_WRITE;
- else
- td->o.td_ddir = TD_DDIR_RW;
+
+ td->o.td_ddir = 0;
+ if (reads)
+ td->o.td_ddir |= TD_DDIR_READ;
+ if (writes)
+ td->o.td_ddir |= TD_DDIR_WRITE;
+ if (trims)
+ td->o.td_ddir |= TD_DDIR_TRIM;
return true;
}
/*
* open iolog, check version, and call appropriate parser
*/
-static bool init_iolog_read(struct thread_data *td)
+static bool init_iolog_read(struct thread_data *td, char *fname)
{
- char buffer[256], *p, *fname;
+ char buffer[256], *p;
FILE *f = NULL;
- fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
dprint(FD_IO, "iolog: name=%s\n", fname);
if (is_socket(fname)) {
} else
f = fopen(fname, "r");
- free(fname);
-
if (!f) {
perror("fopen read iolog");
return false;
}
/*
- * version 2 of the iolog stores a specific string as the
+ * versions 2 and 3 of the iolog store a specific string as the
* first line, check for that
*/
- if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2))) {
- free_release_files(td);
- td->io_log_rfile = f;
- return read_iolog2(td);
+ if (!strncmp(iolog_ver2, buffer, strlen(iolog_ver2)))
+ td->io_log_version = 2;
+ else if (!strncmp(iolog_ver3, buffer, strlen(iolog_ver3)))
+ td->io_log_version = 3;
+ else {
+ log_err("fio: iolog version 1 is no longer supported\n");
+ fclose(f);
+ return false;
}
- log_err("fio: iolog version 1 is no longer supported\n");
- fclose(f);
- return false;
+ free_release_files(td);
+ td->io_log_rfile = f;
+ return read_iolog(td);
}
/*
td->iolog_f = f;
td->iolog_buf = malloc(8192);
setvbuf(f, td->iolog_buf, _IOFBF, 8192);
+ fio_gettime(&td->io_log_start_time, NULL);
/*
* write our version line
*/
- if (fprintf(f, "%s\n", iolog_ver2) < 0) {
+ if (fprintf(f, "%s\n", iolog_ver3) < 0) {
perror("iolog init\n");
return false;
}
if (td->o.read_iolog_file) {
int need_swap;
+ char * fname = get_name_by_idx(td->o.read_iolog_file, td->subjob_number);
/*
* Check if it's a blktrace file and load that if possible.
* Otherwise assume it's a normal log file and load that.
*/
- if (is_blktrace(td->o.read_iolog_file, &need_swap))
- ret = load_blktrace(td, td->o.read_iolog_file, need_swap);
- else
- ret = init_iolog_read(td);
+ if (is_blktrace(fname, &need_swap)) {
+ td->io_log_blktrace = 1;
+ ret = init_blktrace_read(td, fname, need_swap);
+ } else {
+ td->io_log_blktrace = 0;
+ ret = init_iolog_read(td, fname);
+ }
+ free(fname);
} else if (td->o.write_iolog_file)
ret = init_iolog_write(td);
else
if (!ret)
td_verror(td, EINVAL, "failed initializing iolog");
+ init_disk_util(td);
+
return ret;
}
INIT_FLIST_HEAD(&l->io_logs);
l->log_type = p->log_type;
l->log_offset = p->log_offset;
+ l->log_prio = p->log_prio;
l->log_gz = p->log_gz;
l->log_gz_store = p->log_gz_store;
l->avg_msec = p->avg_msec;
if (l->log_offset)
l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
+ if (l->log_prio)
+ l->log_ddir_mask |= LOG_PRIO_SAMPLE_BIT;
+ /*
+ * The bandwidth-log option generates agg-read_bw.log,
+ * agg-write_bw.log and agg-trim_bw.log for which l->td is NULL.
+ * Check if l->td is valid before dereferencing it.
+ */
+ if (l->td && l->td->o.log_max == IO_LOG_SAMPLE_BOTH)
+ l->log_ddir_mask |= LOG_AVG_MAX_SAMPLE_BIT;
INIT_FLIST_HEAD(&l->chunk_list);
void flush_samples(FILE *f, void *samples, uint64_t sample_size)
{
struct io_sample *s;
- int log_offset;
+ int log_offset, log_prio, log_avg_max;
uint64_t i, nr_samples;
+ unsigned int prio_val;
+ const char *fmt;
if (!sample_size)
return;
s = __get_sample(samples, 0, 0);
log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+ log_prio = (s->__ddir & LOG_PRIO_SAMPLE_BIT) != 0;
+ log_avg_max = (s->__ddir & LOG_AVG_MAX_SAMPLE_BIT) != 0;
+
+ if (log_offset) {
+ if (log_prio) {
+ if (log_avg_max)
+ fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %llu, 0x%04x\n";
+ else
+ fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %llu, 0x%04x\n";
+ } else {
+ if (log_avg_max)
+ fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %llu, %u\n";
+ else
+ fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %llu, %u\n";
+ }
+ } else {
+ if (log_prio) {
+ if (log_avg_max)
+ fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, 0x%04x\n";
+ else
+ fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, 0x%04x\n";
+ } else {
+ if (log_avg_max)
+ fmt = "%" PRIu64 ", %" PRId64 ", %" PRId64 ", %u, %llu, %u\n";
+ else
+ fmt = "%" PRIu64 ", %" PRId64 ", %u, %llu, %u\n";
+ }
+ }
nr_samples = sample_size / __log_entry_sz(log_offset);
for (i = 0; i < nr_samples; i++) {
s = __get_sample(samples, log_offset, i);
+ if (log_prio)
+ prio_val = s->priority;
+ else
+ prio_val = ioprio_value_is_class_rt(s->priority);
+
if (!log_offset) {
- fprintf(f, "%lu, %" PRId64 ", %u, %llu, %u\n",
- (unsigned long) s->time,
- s->data.val,
- io_sample_ddir(s), (unsigned long long) s->bs, s->priority_bit);
+ if (log_avg_max)
+ fprintf(f, fmt,
+ s->time,
+ s->data.val.val0,
+ s->data.val.val1,
+ io_sample_ddir(s), (unsigned long long) s->bs,
+ prio_val);
+ else
+ fprintf(f, fmt,
+ s->time,
+ s->data.val.val0,
+ io_sample_ddir(s), (unsigned long long) s->bs,
+ prio_val);
} else {
struct io_sample_offset *so = (void *) s;
- fprintf(f, "%lu, %" PRId64 ", %u, %llu, %llu, %u\n",
- (unsigned long) s->time,
- s->data.val,
+ if (log_avg_max)
+ fprintf(f, fmt,
+ s->time,
+ s->data.val.val0,
+ s->data.val.val1,
+ io_sample_ddir(s), (unsigned long long) s->bs,
+ (unsigned long long) so->offset,
+ prio_val);
+ else
+ fprintf(f, fmt,
+ s->time,
+ s->data.val.val0,
io_sample_ddir(s), (unsigned long long) s->bs,
- (unsigned long long) so->offset, s->priority_bit);
+ (unsigned long long) so->offset,
+ prio_val);
}
}
}
void *buf;
FILE *f;
- f = fopen(file, "r");
+ f = fopen(file, "rb");
if (!f) {
perror("fopen");
return 1;
void *buf;
FILE *f;
+ /*
+ * If log_gz_store is true, we are writing a binary file.
+ * Set the mode appropriately (on all platforms) to avoid issues
+ * on windows (line-ending conversions, etc.)
+ */
if (!do_append)
- f = fopen(log->filename, "w");
+ if (log->log_gz_store)
+ f = fopen(log->filename, "wb");
+ else
+ f = fopen(log->filename, "w");
else
- f = fopen(log->filename, "a");
+ if (log->log_gz_store)
+ f = fopen(log->filename, "ab");
+ else
+ f = fopen(log->filename, "a");
if (!f) {
perror("fopen log");
return;
* Queue work item to compress the existing log entries. We reset the
* current log to a small size, and reference the existing log in the
* data that we queue for compression. Once compression has been done,
- * this old log is freed. If called with finish == true, will not return
- * until the log compression has completed, and will flush all previous
- * logs too
+ * this old log is freed. Will not return until the log compression
+ * has completed, and will flush all previous logs too
*/
static int iolog_flush(struct io_log *log)
{
struct iolog_flush_data *data;
+ workqueue_flush(&log->td->log_compress_wq);
data = malloc(sizeof(*data));
if (!data)
return 1;
void fio_writeout_logs(bool unit_logs)
{
- struct thread_data *td;
- int i;
-
- for_each_td(td, i)
+ for_each_td(td) {
td_writeout_logs(td, unit_logs);
+ } end_for_each();
}
struct flist_head list;
};
+/*
+ * Values the log_max option is compared against. When it equals
+ * IO_LOG_SAMPLE_BOTH the log setup code sets LOG_AVG_MAX_SAMPLE_BIT in the
+ * log's ddir mask, and flush_samples() then writes two values per sample.
+ * NOTE(review): AVG and MAX presumably select logging only the average or
+ * only the maximum - confirm against the option definition.
+ */
+enum {
+ IO_LOG_SAMPLE_AVG = 0,
+ IO_LOG_SAMPLE_MAX,
+ IO_LOG_SAMPLE_BOTH,
+};
+
+/*
+ * Pair of values carried by one log sample. sample_val() initializes only
+ * val0; flush_samples() prints val1 in addition to val0 when the sample
+ * has LOG_AVG_MAX_SAMPLE_BIT set.
+ */
+struct io_sample_value {
+ uint64_t val0;
+ uint64_t val1;
+};
union io_sample_data {
- uint64_t val;
+ struct io_sample_value val;
struct io_u_plat_entry *plat_entry;
};
-#define sample_val(value) ((union io_sample_data) { .val = value })
+#define sample_val(value) ((union io_sample_data) { .val.val0 = value })
#define sample_plat(plat) ((union io_sample_data) { .plat_entry = plat })
/*
uint64_t time;
union io_sample_data data;
uint32_t __ddir;
- uint8_t priority_bit;
+ uint16_t priority;
uint64_t bs;
};
*/
unsigned int log_offset;
+ /*
+ * Log I/O priorities
+ */
+ unsigned int log_prio;
+
/*
* Max size of log entries before a chunk is compressed
*/
* If the upper bit is set, then we have the offset as well
*/
#define LOG_OFFSET_SAMPLE_BIT 0x80000000U
-#define io_sample_ddir(io) ((io)->__ddir & ~LOG_OFFSET_SAMPLE_BIT)
+/*
+ * If the bit following the upper bit is set, then we have the priority
+ */
+#define LOG_PRIO_SAMPLE_BIT 0x40000000U
+/*
+ * If the bit following the priority sample bit is set, we report both avg and max
+ */
+#define LOG_AVG_MAX_SAMPLE_BIT 0x20000000U
+
+#define LOG_SAMPLE_BITS (LOG_OFFSET_SAMPLE_BIT | LOG_PRIO_SAMPLE_BIT |\
+ LOG_AVG_MAX_SAMPLE_BIT)
+#define io_sample_ddir(io) ((io)->__ddir & ~LOG_SAMPLE_BITS)
static inline void io_sample_set_ddir(struct io_log *log,
struct io_sample *io,
struct io_logs *iolog_cur_log(struct io_log *);
uint64_t iolog_nr_samples(struct io_log *);
void regrow_logs(struct thread_data *);
+void regrow_agg_logs(void);
static inline struct io_sample *get_sample(struct io_log *iolog,
struct io_logs *cur_log,
unsigned long len;
unsigned int flags;
enum fio_ddir ddir;
- union {
- unsigned long delay;
- unsigned int file_action;
- };
+ unsigned long delay;
+ unsigned int file_action;
};
/*
extern void queue_io_piece(struct thread_data *, struct io_piece *);
extern void prune_io_piece_log(struct thread_data *);
extern void write_iolog_close(struct thread_data *);
+int64_t iolog_items_to_fetch(struct thread_data *td);
extern int iolog_compress_init(struct thread_data *, struct sk_out *);
extern void iolog_compress_exit(struct thread_data *);
extern size_t log_chunk_sizes(struct io_log *);
extern int init_io_u_buffers(struct thread_data *);
+extern unsigned long long delay_since_ttime(const struct thread_data *,
+ unsigned long long);
#ifdef CONFIG_ZLIB
extern int iolog_file_inflate(const char *);
int hist_coarseness;
int log_type;
int log_offset;
+ int log_prio;
int log_gz;
int log_gz_store;
int log_compress;
struct json_value arg = {
.type = JSON_TYPE_STRING,
};
+ union {
+ const char *a;
+ char *b;
+ } string;
- arg.string = strdup(val ? : "");
+ string.a = val ? val : "";
+ arg.string = string.b;
return json_object_add_value_type(obj, name, &arg);
}
r -= 2;
}
if (!(x & 0x80000000u)) {
- x <<= 1;
r -= 1;
}
return r;
if (!gs->disable_hash)
sum = __hash_u64(sum);
- return sum % gs->nranges;
+ return (sum + gs->rand_off) % gs->nranges;
}
void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
- unsigned int seed)
+ double center, unsigned int seed)
{
memset(gs, 0, sizeof(*gs));
init_rand_seed(&gs->r, seed, 0);
if (gs->stddev > nranges / 2)
gs->stddev = nranges / 2;
}
+ if (center == -1)
+ gs->rand_off = 0;
+ else
+ gs->rand_off = nranges * (center - 0.5);
}
void gauss_disable_hash(struct gauss_state *gs)
struct frand_state r;
uint64_t nranges;
unsigned int stddev;
+ unsigned int rand_off;
bool disable_hash;
};
void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
- unsigned int seed);
+ double center, unsigned int seed);
unsigned long long gauss_next(struct gauss_state *gs);
void gauss_disable_hash(struct gauss_state *gs);
*/
switch (spin) {
case 15: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 14: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 13: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 12: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 11: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 10: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 9: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 8: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 7: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 6: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 5: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 4: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 3: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 2: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 1: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
case 0: __LFSR_NEXT(fl, fl->last_val);
- fallthrough;
+ fio_fallthrough;
default: break;
}
}
#include "../oslib/asprintf.h"
#include "num2str.h"
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-
/**
* num2str() - Cheesy number->string conversion, complete with carry rounding error.
* @num: quantity (e.g., number of blocks, bytes or bits)
char *buf;
compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes");
- assert(units < ARRAY_SIZE(unitstr));
+ assert(units < FIO_ARRAY_SIZE(unitstr));
if (pow2)
unitprefix = iecstr;
* Divide by K/Ki until string length of num <= maxlen.
*/
modulo = -1U;
- while (post_index < ARRAY_SIZE(sistr)) {
+ while (post_index < FIO_ARRAY_SIZE(sistr)) {
sprintf(tmp, "%llu", (unsigned long long) num);
if (strlen(tmp) <= maxlen)
break;
post_index++;
}
- if (post_index >= ARRAY_SIZE(sistr))
+ if (post_index >= FIO_ARRAY_SIZE(sistr))
post_index = 0;
/*
sprintf(tmp, "%.*f", (int)(maxlen - strlen(tmp) - 1),
(double)modulo / (double)thousand);
+ if (tmp[0] == '1')
+ num++;
+
if (asprintf(&buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2],
unitprefix[post_index], unitstr[units]) < 0)
buf = NULL;
const char *end;
char *file;
int fd;
- ssize_t count;
+ ssize_t rc, count = 0;
if (!out_len)
goto err_out;
if (file == NULL)
goto err_out;
+#ifdef _WIN32
+ fd = open(file, O_RDONLY | O_BINARY);
+#else
fd = open(file, O_RDONLY);
+#endif
if (fd < 0)
goto err_free_out;
- count = read(fd, out, out_len);
- if (count == -1)
- goto err_free_close_out;
+ if (out) {
+ while (1) {
+ rc = read(fd, out, out_len - count);
+ if (rc == 0)
+ break;
+ if (rc == -1)
+ goto err_free_close_out;
+
+ count += rc;
+ out += rc;
+ }
+ } else {
+ count = lseek(fd, 0, SEEK_END);
+ if (count == -1)
+ goto err_free_close_out;
+ if (count >= out_len)
+ count = out_len;
+ }
*filled = count;
close(fd);
if (end - beg > out_len)
return NULL;
- memcpy(out, beg, end - beg);
+ if (out)
+ memcpy(out, beg, end - beg);
*filled = end - beg;
/* Catch up quote */
i = 0;
if (!lval) {
num = 0;
- out[i] = 0x00;
+ if (out)
+ out[i] = 0x00;
i = 1;
} else {
val = (unsigned int)lval;
for (; val && out_len; out_len--, i++, val >>= 8)
- out[i] = val & 0xff;
+ if (out)
+ out[i] = val & 0xff;
if (val)
return NULL;
}
const char *fmt;
fmt = (num & 1 ? "%1hhx" : "%2hhx");
- sscanf(beg, fmt, &out[i]);
+ if (out)
+ sscanf(beg, fmt, &out[i]);
if (num & 1) {
num++;
beg--;
* This function tries to find formats, e.g.:
* %o - offset of the block
*
- * In case of successfull parsing it fills the format param
+ * In case of successful parsing it fills the format param
* with proper offset and the size of the expected value, which
* should be pasted into buffer using the format 'func' callback.
*
if (f->desc->len > out_len)
return NULL;
- memset(out, '\0', f->desc->len);
+ if (out)
+ memset(out, '\0', f->desc->len);
*filled = f->desc->len;
return in + len;
* numbers and pattern formats.
* @in - string input
* @in_len - size of the input string
- * @out - output buffer where parsed result will be put
+ * @out - output buffer where parsed result will be put, may be NULL
+ * in which case this function just calculates the required
+ * length of the buffer
* @out_len - lengths of the output buffer
* @fmt_desc - array of pattern format descriptors [input]
* @fmt - array of pattern formats [output]
* @fmt_sz - pointer where the size of pattern formats array stored [input],
- * after successfull parsing this pointer will contain the number
+ * after successful parsing this pointer will contain the number
* of parsed formats if any [output].
*
* strings:
* NOTE: there is no way to escape quote, so "123\"abc" does not work.
*
* numbers:
- * hexidecimal - sequence of hex bytes starting from 0x or 0X prefix,
+ * hexadecimal - sequence of hex bytes starting from 0x or 0X prefix,
* e.g. 0xff12ceff1100ff
* decimal - decimal number in range [INT_MIN, INT_MAX]
*
*
* Returns number of bytes filled or err < 0 in case of failure.
*/
-int parse_and_fill_pattern(const char *in, unsigned int in_len,
- char *out, unsigned int out_len,
- const struct pattern_fmt_desc *fmt_desc,
- struct pattern_fmt *fmt,
- unsigned int *fmt_sz_out)
+static int parse_and_fill_pattern(const char *in, unsigned int in_len,
+ char *out, unsigned int out_len,
+ const struct pattern_fmt_desc *fmt_desc,
+ struct pattern_fmt *fmt,
+ unsigned int *fmt_sz_out)
{
const char *beg, *end, *out_beg = out;
unsigned int total = 0, fmt_rem = 0;
- if (!in || !in_len || !out || !out_len)
+ if (!in || !in_len || !out_len)
return -EINVAL;
if (fmt_sz_out)
fmt_rem = *fmt_sz_out;
assert(filled);
assert(filled <= out_len);
out_len -= filled;
- out += filled;
total += filled;
+ if (out)
+ out += filled;
} while (in_len);
return total;
}
+/**
+ * parse_and_fill_pattern_alloc() - Parses combined input, which consists of
+ *				    strings, numbers and pattern formats and
+ *				    allocates a buffer for the result.
+ *
+ * @in - string input
+ * @in_len - size of the input string
+ * @out - pointer to the output buffer pointer, this will be set to the newly
+ *        allocated pattern buffer which must be freed by the caller
+ * @fmt_desc - array of pattern format descriptors [input]
+ * @fmt - array of pattern formats [output]
+ * @fmt_sz - pointer where the size of pattern formats array stored [input],
+ *           after successful parsing this pointer will contain the number
+ *           of parsed formats if any [output].
+ *
+ * See documentation on parse_and_fill_pattern() above for a description
+ * of the functionality.
+ *
+ * The input is parsed twice: once with a NULL output buffer to learn the
+ * required length (bounded by MAX_PATTERN_SIZE), and once more to fill the
+ * freshly allocated buffer.
+ *
+ * Returns number of bytes filled or err < 0 in case of failure; -ENOMEM is
+ * returned if the pattern buffer cannot be allocated. If the sizing pass
+ * fails, *out is left untouched; if the allocation or the fill pass fails,
+ * *out is NULL on return.
+ */
+int parse_and_fill_pattern_alloc(const char *in, unsigned int in_len,
+		char **out, const struct pattern_fmt_desc *fmt_desc,
+		struct pattern_fmt *fmt, unsigned int *fmt_sz_out)
+{
+	int count;
+
+	/* Sizing pass: a NULL buffer only calculates the required length */
+	count = parse_and_fill_pattern(in, in_len, NULL, MAX_PATTERN_SIZE,
+				       fmt_desc, fmt, fmt_sz_out);
+	if (count < 0)
+		return count;
+
+	*out = malloc(count);
+	/*
+	 * A NULL buffer must not reach the fill pass: it would be treated
+	 * as another sizing request and report success with *out == NULL.
+	 */
+	if (!*out)
+		return -ENOMEM;
+
+	count = parse_and_fill_pattern(in, in_len, *out, count, fmt_desc,
+				       fmt, fmt_sz_out);
+	if (count < 0) {
+		free(*out);
+		*out = NULL;
+	}
+
+	return count;
+}
+
/**
* dup_pattern() - Duplicates part of the pattern all over the buffer.
*
#ifndef FIO_PARSE_PATTERN_H
#define FIO_PARSE_PATTERN_H
+/*
+ * The pattern is dynamically allocated, but that doesn't mean there
+ * are no limits. The network protocol has a limit of
+ * FIO_SERVER_MAX_CMD_MB and potentially two patterns must fit in there.
+ * There's also a need to verify the incoming data from the network and
+ * this provides a sensible check.
+ *
+ * 128MiB is an arbitrary limit that meets these criteria. The patterns
+ * tend to be truncated at the IO size anyway and IO sizes that large
+ * aren't terribly practical.
+ */
+#define MAX_PATTERN_SIZE (128 << 20)
+
/**
* Pattern format description. The input for 'parse_pattern'.
* Describes format with its name and callback, which should
const struct pattern_fmt_desc *desc;
};
-int parse_and_fill_pattern(const char *in, unsigned int in_len,
- char *out, unsigned int out_len,
- const struct pattern_fmt_desc *fmt_desc,
- struct pattern_fmt *fmt,
- unsigned int *fmt_sz_out);
+int parse_and_fill_pattern_alloc(const char *in, unsigned int in_len,
+ char **out, const struct pattern_fmt_desc *fmt_desc,
+ struct pattern_fmt *fmt, unsigned int *fmt_sz_out);
int paste_format_inplace(char *pattern, unsigned int pattern_len,
struct pattern_fmt *fmt, unsigned int fmt_sz,
#include "../compiler/compiler.h"
#include "prio_tree.h"
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
-
/*
* A clever mix of heap and radix trees forms a radix priority search tree (PST)
* which is useful for storing intervals, e.g, we can consider a vma as a closed
{
unsigned int i;
- for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
+ for (i = 0; i < FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1; i++)
index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
- index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
+ index_bits_to_maxindex[FIO_ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL;
}
/*
__rand32(state);
}
-static void __init_rand64(struct taus258_state *state, uint64_t seed)
+void __init_rand64(struct taus258_state *state, uint64_t seed)
{
int cranks = 6;
__init_rand64(&state->state64, seed);
}
+/*
+ * Fill @buf with @len bytes derived from @seed: each full 64-bit word gets
+ * the current seed, which is then advanced with __hash_u64(). Any trailing
+ * bytes (len not a multiple of 8) are copied from the next seed value.
+ * NOTE(review): stores go through a uint64_t pointer, so @buf is presumably
+ * expected to be suitably aligned - confirm with callers.
+ */
+void __fill_random_buf_small(void *buf, unsigned int len, uint64_t seed)
+{
+	uint64_t *words = buf;
+	const unsigned int nr_words = len / sizeof(*words);
+	const unsigned int tail = len % sizeof(*words);
+	unsigned int i;
+
+	for (i = 0; i < nr_words; i++) {
+		words[i] = seed;
+		seed = __hash_u64(seed);
+	}
+
+	/* Partial last word: copy only the bytes that still fit */
+	if (fio_unlikely(tail))
+		__builtin_memcpy(&words[nr_words], &seed, tail);
+}
+
void __fill_random_buf(void *buf, unsigned int len, uint64_t seed)
{
- void *ptr = buf;
+ static uint64_t prime[] = {1, 2, 3, 5, 7, 11, 13, 17,
+ 19, 23, 29, 31, 37, 41, 43, 47};
+ uint64_t *b, *e, s[CONFIG_SEED_BUCKETS];
+ unsigned int rest;
+ int p;
- while (len) {
- int this_len;
-
- if (len >= sizeof(int64_t)) {
- *((int64_t *) ptr) = seed;
- this_len = sizeof(int64_t);
- } else if (len >= sizeof(int32_t)) {
- *((int32_t *) ptr) = seed;
- this_len = sizeof(int32_t);
- } else if (len >= sizeof(int16_t)) {
- *((int16_t *) ptr) = seed;
- this_len = sizeof(int16_t);
- } else {
- *((int8_t *) ptr) = seed;
- this_len = sizeof(int8_t);
+ /*
+ * Calculate the number of 64-bit words, rounded down to a multiple of
+ * the seed bucket count, that the bucketed loop below can fill.
+ */
+ rest = (len / sizeof(*b) / CONFIG_SEED_BUCKETS) * CONFIG_SEED_BUCKETS;
+
+ b = buf;
+ e = b + rest;
+
+ rest = len - (rest * sizeof(*b));
+
+ for (p = 0; p < CONFIG_SEED_BUCKETS; p++)
+ s[p] = seed * prime[p];
+
+ for (; b != e; b += CONFIG_SEED_BUCKETS) {
+ for (p = 0; p < CONFIG_SEED_BUCKETS; ++p) {
+ b[p] = s[p];
+ s[p] = __hash_u64(s[p]);
}
- ptr += this_len;
- len -= this_len;
- seed *= GOLDEN_RATIO_PRIME;
- seed >>= 3;
}
+
+ __fill_random_buf_small(b, rest, s[0]);
}
uint64_t fill_random_buf(struct frand_state *fs, void *buf,
unsigned int len)
{
- uint64_t r = __rand(fs);
-
- if (sizeof(int) != sizeof(long *))
- r *= (unsigned long) __rand(fs);
+ uint64_t r = __get_next_seed(fs);
__fill_random_buf(buf, len, r);
return r;
unsigned int segment, unsigned int len,
char *pattern, unsigned int pbytes)
{
- uint64_t r = __rand(fs);
-
- if (sizeof(int) != sizeof(long *))
- r *= (unsigned long) __rand(fs);
+ uint64_t r = __get_next_seed(fs);
__fill_random_buf_percentage(r, buf, percentage, segment, len,
pattern, pbytes);
return start + rand32_upto(state, end - start);
}
+/*
+ * Draw the next 64-bit seed from 'fs'. One __rand() value is always
+ * consumed; when an int and a pointer differ in size, the seed is
+ * additionally multiplied by a second __rand() value.
+ */
+static inline uint64_t __get_next_seed(struct frand_state *fs)
+{
+	uint64_t seed = __rand(fs);
+
+	if (sizeof(int) == sizeof(long *))
+		return seed;
+
+	return seed * (unsigned long) __rand(fs);
+}
+
extern void init_rand(struct frand_state *, bool);
extern void init_rand_seed(struct frand_state *, uint64_t seed, bool);
+void __init_rand64(struct taus258_state *state, uint64_t seed);
extern void __fill_random_buf(void *buf, unsigned int len, uint64_t seed);
extern uint64_t fill_random_buf(struct frand_state *, void *buf, unsigned int len);
extern void __fill_random_buf_percentage(uint64_t, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
#include "../arch/arch.h"
struct seqlock {
+#ifdef __cplusplus
+ std::atomic<unsigned int> sequence;
+#else
volatile unsigned int sequence;
+#endif
};
static inline void seqlock_init(struct seqlock *s)
}
static void shared_rand_init(struct zipf_state *zs, uint64_t nranges,
- unsigned int seed)
+ double center, unsigned int seed)
{
memset(zs, 0, sizeof(*zs));
zs->nranges = nranges;
init_rand_seed(&zs->rand, seed, 0);
zs->rand_off = __rand(&zs->rand);
+ if (center != -1)
+ zs->rand_off = nranges * center;
}
void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta,
- unsigned int seed)
+ double center, unsigned int seed)
{
- shared_rand_init(zs, nranges, seed);
+ shared_rand_init(zs, nranges, center, seed);
zs->theta = theta;
zs->zeta2 = pow(1.0, zs->theta) + pow(0.5, zs->theta);
}
void pareto_init(struct zipf_state *zs, uint64_t nranges, double h,
- unsigned int seed)
+ double center, unsigned int seed)
{
- shared_rand_init(zs, nranges, seed);
+ shared_rand_init(zs, nranges, center, seed);
zs->pareto_pow = log(h) / log(1.0 - h);
}
bool disable_hash;
};
-void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta, unsigned int seed);
+void zipf_init(struct zipf_state *zs, uint64_t nranges, double theta,
+ double center, unsigned int seed);
uint64_t zipf_next(struct zipf_state *zs);
-void pareto_init(struct zipf_state *zs, uint64_t nranges, double h, unsigned int seed);
+void pareto_init(struct zipf_state *zs, uint64_t nranges, double h,
+ double center, unsigned int seed);
uint64_t pareto_next(struct zipf_state *zs);
void zipf_disable_hash(struct zipf_state *zs);
"hppa",
"mips",
"aarch64",
+ "loongarch64",
+ "riscv64",
"generic"
};
td->this_io_bytes[ddir] = 0;
td->stat_io_blocks[ddir] = 0;
td->this_io_blocks[ddir] = 0;
- td->rate_bytes[ddir] = 0;
- td->rate_blocks[ddir] = 0;
+ td->last_rate_check_bytes[ddir] = 0;
+ td->last_rate_check_blocks[ddir] = 0;
td->bytes_done[ddir] = 0;
td->rate_io_issue_bytes[ddir] = 0;
td->rate_next_io_time[ddir] = 0;
td->last_usec[ddir] = 0;
}
+ td->bytes_verified = 0;
}
td->zone_bytes = 0;
/*
* reset file done count if we are to start over
*/
- if (td->o.time_based || td->o.loops || td->o.do_verify)
+ if (td->o.time_based || td->loops > 1 || td->o.do_verify)
td->nr_done_files = 0;
}
void reset_all_stats(struct thread_data *td)
{
+ unsigned long long b;
int i;
reset_io_counters(td, 1);
+ b = ddir_rw_sum(td->thinktime_blocks_counter);
+ td->last_thinktime_blocks -= b;
+
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
td->io_bytes[i] = 0;
td->io_blocks[i] = 0;
td->io_issues[i] = 0;
td->ts.total_io_u[i] = 0;
td->ts.runtime[i] = 0;
- td->rwmix_issues = 0;
}
- set_epoch_time(td, td->o.log_unix_epoch);
+ set_epoch_time(td, td->o.log_alternate_epoch_clock_id, td->o.job_start_clock_id);
memcpy(&td->start, &td->epoch, sizeof(td->epoch));
memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
+ td->last_thinktime = td->epoch;
+
lat_target_reset(td);
clear_rusage_stat(td);
helper_reset();
void reset_fio_state(void)
{
+ int i;
+
groupid = 0;
thread_number = 0;
+ cur_segment = 0;
+ for (i = 0; i < nr_segments; i++)
+ segments[i].nr_threads = 0;
stat_number = 0;
done_secs = 0;
}
void fio_terminate_threads(unsigned int group_id, unsigned int terminate)
{
- struct thread_data *td;
pid_t pid = getpid();
- int i;
dprint(FD_PROCESS, "terminate group_id=%d\n", group_id);
- for_each_td(td, i) {
+ for_each_td(td) {
if ((terminate == TERMINATE_GROUP && group_id == TERMINATE_ALL) ||
(terminate == TERMINATE_GROUP && group_id == td->groupid) ||
(terminate == TERMINATE_STONEWALL && td->runstate >= TD_RUNNING) ||
ops->terminate(td);
}
}
- }
+ } end_for_each();
}
int fio_running_or_pending_io_threads(void)
{
- struct thread_data *td;
- int i;
int nr_io_threads = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO))
continue;
nr_io_threads++;
if (td->runstate < TD_EXITED)
return 1;
- }
+ } end_for_each();
if (!nr_io_threads)
return -1; /* we only had cpuio threads to begin with */
compiletime_assert((offsetof(struct jobs_eta, m_rate) % 8) == 0, "m_rate");
compiletime_assert(__TD_F_LAST <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT");
+ compiletime_assert((__TD_F_LAST + __FIO_IOENGINE_F_LAST) <= 8*sizeof(((struct thread_data *)0)->flags), "td->flags");
compiletime_assert(BSSPLIT_MAX <= ZONESPLIT_MAX, "bsssplit/zone max");
err = endian_check();
+#include "log.h"
+
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
}
/* add prefix for the specified type in front of the valist */
+#ifdef FIO_INC_DEBUG
void log_prevalist(int type, const char *fmt, va_list args)
{
char *buf1, *buf2;
free(buf1);
if (len < 0)
return;
- len = log_info_buf(buf2, len);
+ log_info_buf(buf2, len);
free(buf2);
}
+#endif
ssize_t log_info(const char *format, ...)
{
total_mem = td->orig_buffer_size;
- if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
+ if (td->o.odirect || td->o.mem_align ||
td_ioengine_flagged(td, FIO_MEMALIGN)) {
total_mem += page_mask;
if (td->o.mem_align && td->o.mem_align > page_size)
dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem);
/*
- * If the IO engine has hooks to allocate/free memory, use those. But
- * error out if the user explicitly asked for something else.
+ * If the IO engine has hooks to allocate/free memory and the user
+ * doesn't explicitly ask for something else, use those. But fail if the
+ * user asks for something else with an engine that doesn't allow that.
*/
- if (td->io_ops->iomem_alloc) {
- if (fio_option_is_set(&td->o, mem_type)) {
- log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n");
- ret = 1;
- } else
- ret = td->io_ops->iomem_alloc(td, total_mem);
- } else if (td->o.mem_type == MEM_MALLOC)
+ if (td->io_ops->iomem_alloc && fio_option_is_set(&td->o, mem_type) &&
+ !td_ioengine_flagged(td, FIO_SKIPPABLE_IOMEM_ALLOC)) {
+ log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n");
+ ret = 1;
+ } else if (td->io_ops->iomem_alloc &&
+ !fio_option_is_set(&td->o, mem_type))
+ ret = td->io_ops->iomem_alloc(td, total_mem);
+ else if (td->o.mem_type == MEM_MALLOC)
ret = alloc_mem_malloc(td, total_mem);
else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE)
ret = alloc_mem_shm(td, total_mem);
unsigned int total_mem;
total_mem = td->orig_buffer_size;
- if (td->o.odirect || td->o.oatomic)
+ if (td->o.odirect)
total_mem += page_mask;
- if (td->io_ops->iomem_alloc) {
+ if (td->io_ops->iomem_alloc && !fio_option_is_set(&td->o, mem_type)) {
if (td->io_ops->iomem_free)
td->io_ops->iomem_free(td);
} else if (td->o.mem_type == MEM_MALLOC)
.name = "RDMA I/O engine", /* rdma */
.mask = FIO_OPT_G_RDMA,
},
+ {
+ .name = "librpma I/O engines", /* librpma_apm && librpma_gpspm */
+ .mask = FIO_OPT_G_LIBRPMA,
+ },
{
.name = "libaio I/O engine", /* libaio */
.mask = FIO_OPT_G_LIBAIO,
.name = "NBD I/O engine", /* NBD */
.mask = FIO_OPT_G_NBD,
},
+ {
+ .name = "libcufile I/O engine", /* libcufile */
+ .mask = FIO_OPT_G_LIBCUFILE,
+ },
+ {
+ .name = "DAOS File System (dfs) I/O engine", /* dfs */
+ .mask = FIO_OPT_G_DFS,
+ },
+ {
+ .name = "NFS I/O engine", /* nfs */
+ .mask = FIO_OPT_G_NFS,
+ },
{
.name = NULL,
},
__FIO_OPT_G_E4DEFRAG,
__FIO_OPT_G_NETIO,
__FIO_OPT_G_RDMA,
+ __FIO_OPT_G_LIBRPMA,
__FIO_OPT_G_LIBAIO,
__FIO_OPT_G_ACT,
__FIO_OPT_G_LATPROF,
__FIO_OPT_G_IOURING,
__FIO_OPT_G_FILESTAT,
__FIO_OPT_G_NR,
+ __FIO_OPT_G_LIBCUFILE,
+ __FIO_OPT_G_DFS,
+ __FIO_OPT_G_NFS,
+ __FIO_OPT_G_WINDOWSAIO,
+ __FIO_OPT_G_XNVME,
+ __FIO_OPT_G_LIBBLKIO,
FIO_OPT_G_RATE = (1ULL << __FIO_OPT_G_RATE),
FIO_OPT_G_ZONE = (1ULL << __FIO_OPT_G_ZONE),
FIO_OPT_G_E4DEFRAG = (1ULL << __FIO_OPT_G_E4DEFRAG),
FIO_OPT_G_NETIO = (1ULL << __FIO_OPT_G_NETIO),
FIO_OPT_G_RDMA = (1ULL << __FIO_OPT_G_RDMA),
+ FIO_OPT_G_LIBRPMA = (1ULL << __FIO_OPT_G_LIBRPMA),
FIO_OPT_G_LIBAIO = (1ULL << __FIO_OPT_G_LIBAIO),
FIO_OPT_G_ACT = (1ULL << __FIO_OPT_G_ACT),
FIO_OPT_G_LATPROF = (1ULL << __FIO_OPT_G_LATPROF),
FIO_OPT_G_INVALID = (1ULL << __FIO_OPT_G_NR),
FIO_OPT_G_ISCSI = (1ULL << __FIO_OPT_G_ISCSI),
FIO_OPT_G_NBD = (1ULL << __FIO_OPT_G_NBD),
+ FIO_OPT_G_NFS = (1ULL << __FIO_OPT_G_NFS),
FIO_OPT_G_IOURING = (1ULL << __FIO_OPT_G_IOURING),
FIO_OPT_G_FILESTAT = (1ULL << __FIO_OPT_G_FILESTAT),
+ FIO_OPT_G_LIBCUFILE = (1ULL << __FIO_OPT_G_LIBCUFILE),
+ FIO_OPT_G_DFS = (1ULL << __FIO_OPT_G_DFS),
+ FIO_OPT_G_WINDOWSAIO = (1ULL << __FIO_OPT_G_WINDOWSAIO),
+ FIO_OPT_G_XNVME = (1ULL << __FIO_OPT_G_XNVME),
+ FIO_OPT_G_LIBBLKIO = (1ULL << __FIO_OPT_G_LIBBLKIO),
};
extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
#include <ctype.h>
#include <string.h>
#include <assert.h>
+#include <fcntl.h>
#include <sys/stat.h>
#include <netinet/in.h>
static const struct pattern_fmt_desc fmt_desc[] = {
{
.fmt = "%o",
- .len = FIELD_SIZE(struct io_u *, offset),
+ .len = FIO_FIELD_SIZE(struct io_u *, offset),
.paste = paste_blockoff
},
{ }
return strdup(p);
}
+/*
+ * Parse a distribution postfix of the form "<val>" or "<val>:<center>".
+ *
+ * The input is duplicated so that it can be split at the first ':'. When a
+ * center part is present it is converted first and stored in '*center';
+ * '*val' is only converted if that succeeded. '*center' is left untouched
+ * when the string contains no ':'.
+ *
+ * Returns true if every part that was present converted successfully,
+ * false on conversion or allocation failure.
+ */
+static bool split_parse_distr(const char *str, double *val, double *center)
+{
+	char *copy, *sep;
+	bool ok = true;
+
+	copy = strdup(str);
+	if (!copy)
+		return false;
+
+	sep = strstr(copy, ":");
+	if (sep) {
+		/* Terminate the value part and step onto the center part. */
+		*sep++ = '\0';
+		ok = str_to_float(sep, center, 0);
+	}
+
+	if (ok)
+		ok = str_to_float(copy, val, 0);
+
+	free(copy);
+	return ok;
+}
+
static int bs_cmp(const void *p1, const void *p2)
{
const struct bssplit *bsp1 = p1;
return (int) bsp1->perc - (int) bsp2->perc;
}
-struct split {
- unsigned int nr;
- unsigned long long val1[ZONESPLIT_MAX];
- unsigned long long val2[ZONESPLIT_MAX];
-};
-
-static int split_parse_ddir(struct thread_options *o, struct split *split,
+int split_parse_ddir(struct thread_options *o, struct split *split,
char *str, bool absolute, unsigned int max_splits)
{
unsigned long long perc;
return 0;
}
-static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str,
- bool data)
+static int bssplit_ddir(struct thread_options *o, void *eo,
+ enum fio_ddir ddir, char *str, bool data)
{
unsigned int i, perc, perc_missing;
unsigned long long max_bs, min_bs;
return 0;
}
-typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *, bool);
-
-static int str_split_parse(struct thread_data *td, char *str,
- split_parse_fn *fn, bool data)
+int str_split_parse(struct thread_data *td, char *str,
+ split_parse_fn *fn, void *eo, bool data)
{
char *odir, *ddir;
int ret = 0;
if (odir) {
ddir = strchr(odir + 1, ',');
if (ddir) {
- ret = fn(&td->o, DDIR_TRIM, ddir + 1, data);
+ ret = fn(&td->o, eo, DDIR_TRIM, ddir + 1, data);
if (!ret)
*ddir = '\0';
} else {
char *op;
op = strdup(odir + 1);
- ret = fn(&td->o, DDIR_TRIM, op, data);
+ ret = fn(&td->o, eo, DDIR_TRIM, op, data);
free(op);
}
if (!ret)
- ret = fn(&td->o, DDIR_WRITE, odir + 1, data);
+ ret = fn(&td->o, eo, DDIR_WRITE, odir + 1, data);
if (!ret) {
*odir = '\0';
- ret = fn(&td->o, DDIR_READ, str, data);
+ ret = fn(&td->o, eo, DDIR_READ, str, data);
}
} else {
char *op;
op = strdup(str);
- ret = fn(&td->o, DDIR_WRITE, op, data);
+ ret = fn(&td->o, eo, DDIR_WRITE, op, data);
free(op);
if (!ret) {
op = strdup(str);
- ret = fn(&td->o, DDIR_TRIM, op, data);
+ ret = fn(&td->o, eo, DDIR_TRIM, op, data);
free(op);
}
if (!ret)
- ret = fn(&td->o, DDIR_READ, str, data);
+ ret = fn(&td->o, eo, DDIR_READ, str, data);
}
return ret;
}
+/*
+ * qsort() comparator for arrays of uint16_t placement IDs; orders them
+ * ascending. Both values are widened to int before subtracting, so the
+ * difference cannot wrap.
+ */
+static int fio_fdp_cmp(const void *p1, const void *p2)
+{
+	const int lhs = *(const uint16_t *) p1;
+	const int rhs = *(const uint16_t *) p2;
+
+	return lhs - rhs;
+}
+
+/*
+ * Option callback: parse a comma-separated list of placement IDs from
+ * 'input' into td->o.dp_ids, sort the stored IDs in ascending order and
+ * record how many were stored in td->o.dp_nr_ids.
+ *
+ * Each token is converted with strtoull() in base 0 and must not exceed
+ * 0xFFFF. At most FIO_MAX_DP_IDS entries are stored; parsing stops once
+ * that many have been accepted.
+ *
+ * Returns 0 on success, 1 if the input could not be duplicated or an ID
+ * is out of range.
+ */
+static int str_fdp_pli_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	char *str, *p, *v;
+	int i = 0;
+
+	/* 'p' keeps the allocation start; 'str' is advanced while parsing. */
+	p = str = strdup(input);
+	if (!p)
+		return 1;
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	while ((v = strsep(&str, ",")) != NULL && i < FIO_MAX_DP_IDS) {
+		unsigned long long id = strtoull(v, NULL, 0);
+
+		if (id > 0xFFFF) {
+			log_err("Placement IDs cannot exceed 0xFFFF\n");
+			free(p);
+			return 1;
+		}
+		td->o.dp_ids[i++] = id;
+	}
+	free(p);
+
+	qsort(td->o.dp_ids, i, sizeof(*td->o.dp_ids), fio_fdp_cmp);
+	td->o.dp_nr_ids = i;
+
+	return 0;
+}
+
static int str_bssplit_cb(void *data, const char *input)
{
struct thread_data *td = cb_data_to_td(data);
strip_blank_front(&str);
strip_blank_end(str);
- ret = str_split_parse(td, str, bssplit_ddir, false);
+ ret = str_split_parse(td, str, bssplit_ddir, NULL, false);
if (parse_dryrun()) {
int i;
return ret;
}
+/*
+ * Parse one cmdprio_bssplit entry from 'str' into 'entry'.
+ *
+ * Accepted entry formats:
+ *   bs/                       - perc is 0, prio is -1
+ *   bs/perc                   - prio is -1
+ *   bs/perc/class/level       - hint is 0
+ *   bs/perc/class/level/hint
+ *
+ * perc is capped at 100; class, level and hint are capped at their
+ * IOPRIO_MAX_* limits before being combined with ioprio_value().
+ *
+ * Returns 0 on success, 1 if the entry is malformed or the block size
+ * cannot be converted.
+ */
+static int parse_cmdprio_bssplit_entry(struct thread_options *o,
+				       struct split_prio *entry, char *str)
+{
+	unsigned int perc = 0, class, level, hint;
+	char *bs_str = NULL;
+	long long bs_val;
+	int matches;
+	int conv_failed;
+
+	/* %m makes sscanf() allocate bs_str; it must be freed below. */
+	matches = sscanf(str, "%m[^/]/%u/%u/%u/%u",
+			 &bs_str, &perc, &class, &level, &hint);
+	if (matches < 1) {
+		log_err("fio: invalid cmdprio_bssplit format\n");
+		return 1;
+	}
+
+	conv_failed = str_to_decimal(bs_str, &bs_val, 1, o, 0, 0);
+	free(bs_str);
+	if (conv_failed) {
+		log_err("fio: split conversion failed\n");
+		return 1;
+	}
+
+	entry->bs = bs_val;
+	entry->perc = min(perc, 100u);
+	entry->prio = -1;
+
+	/* "bs/" and "bs/perc": no priority given, keep prio at -1. */
+	if (matches == 1 || matches == 2)
+		return 0;
+
+	/* A class without a level (three fields) is not a valid entry. */
+	if (matches != 4 && matches != 5) {
+		log_err("fio: invalid cmdprio_bssplit format\n");
+		return 1;
+	}
+
+	class = min(class, (unsigned int) IOPRIO_MAX_PRIO_CLASS);
+	level = min(level, (unsigned int) IOPRIO_MAX_PRIO);
+	if (matches == 5)
+		hint = min(hint, (unsigned int) IOPRIO_MAX_PRIO_HINT);
+	else
+		hint = 0;
+	entry->prio = ioprio_value(class, level, hint);
+
+	return 0;
+}
+
+/*
+ * qsort() comparator ordering split_prio entries by ascending block size.
+ * Returns -1 if the first entry sorts before the second, 1 if it sorts
+ * after it, and 0 if both have the same block size.
+ */
+static int fio_split_prio_cmp(const void *p1, const void *p2)
+{
+	const struct split_prio *lhs = p1;
+	const struct split_prio *rhs = p2;
+
+	return (lhs->bs > rhs->bs) - (lhs->bs < rhs->bs);
+}
+
+/*
+ * Parse a ':'-separated list of cmdprio_bssplit entries from 'str'.
+ *
+ * The string is tokenized twice: once on a private copy to count the
+ * entries so the result array can be sized, and once on 'str' itself
+ * (which strsep() modifies) to parse them. Tokenizing stops at the first
+ * empty token. Entries whose percentage is zero are dropped, and the
+ * remaining ones are sorted by block size.
+ *
+ * On success returns 0 and stores the allocated array in '*entries' and
+ * its length in '*nr_entries'. Returns 1 on allocation failure, when more
+ * than BSSPLIT_MAX entries are given, or when an entry fails to parse; the
+ * output parameters are not written in that case.
+ */
+int split_parse_prio_ddir(struct thread_options *o, struct split_prio **entries,
+			  int *nr_entries, char *str)
+{
+	struct split_prio *list;
+	unsigned int count = 0, used = 0;
+	char *scratch, *cursor, *tok;
+
+	/* First pass: count tokens on a throwaway copy. */
+	scratch = strdup(str);
+	if (!scratch)
+		return 1;
+
+	for (cursor = scratch; (tok = strsep(&cursor, ":")) != NULL; ) {
+		if (*tok == '\0')
+			break;
+		count++;
+	}
+	free(scratch);
+
+	if (count > BSSPLIT_MAX) {
+		log_err("fio: too many cmdprio_bssplit entries\n");
+		return 1;
+	}
+
+	list = calloc(count, sizeof(*list));
+	if (!list)
+		return 1;
+
+	/* Second pass: parse each token into the next free slot. */
+	while ((tok = strsep(&str, ":")) != NULL && *tok != '\0') {
+		struct split_prio *slot = &list[used];
+
+		if (parse_cmdprio_bssplit_entry(o, slot, tok)) {
+			log_err("fio: failed to parse cmdprio_bssplit entry\n");
+			free(list);
+			return 1;
+		}
+
+		/* skip zero perc entries, they provide no useful information */
+		if (slot->perc)
+			used++;
+	}
+
+	qsort(list, used, sizeof(*list), fio_split_prio_cmp);
+
+	*entries = list;
+	*nr_entries = used;
+
+	return 0;
+}
+
static int str2error(char *str)
{
const char *err[] = { "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO",
if (!nr)
return 0;
- if (td_random(td))
- o->ddir_seq_nr = atoi(nr);
- else {
+ if (td_random(td)) {
+ long long val;
+
+ if (str_to_decimal(nr, &val, 1, o, 0, 0)) {
+ log_err("fio: randrw postfix parsing failed\n");
+ free(nr);
+ return 1;
+ }
+ if ((val <= 0) || (val > UINT_MAX)) {
+ log_err("fio: randrw postfix parsing out of range\n");
+ free(nr);
+ return 1;
+ }
+ o->ddir_seq_nr = (unsigned int) val;
+ } else {
long long val;
if (str_to_decimal(nr, &val, 1, o, 0, 0)) {
return 0;
}
-static int str_rwmix_read_cb(void *data, unsigned long long *val)
+static int str_rwmix_read_cb(void *data, long long *val)
{
struct thread_data *td = cb_data_to_td(data);
return 0;
}
-static int str_rwmix_write_cb(void *data, unsigned long long *val)
+static int str_rwmix_write_cb(void *data, long long *val)
{
struct thread_data *td = cb_data_to_td(data);
int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index)
{
unsigned int i, index, cpus_in_mask;
- const long max_cpu = cpus_online();
+ const long max_cpu = cpus_configured();
cpus_in_mask = fio_cpu_count(mask);
if (!cpus_in_mask)
return 1;
}
- max_cpu = cpus_online();
+ max_cpu = cpus_configured();
for (i = 0; i < sizeof(int) * 8; i++) {
if ((1 << i) & *val) {
strip_blank_front(&str);
strip_blank_end(str);
- max_cpu = cpus_online();
+ max_cpu = cpus_configured();
while ((cpu = strsep(&str, ",")) != NULL) {
char *str2, *cpu2;
{
struct thread_data *td = cb_data_to_td(data);
double val;
+ double center = -1;
bool done = false;
char *nr;
return 0;
nr = get_opt_postfix(str);
- if (nr && !str_to_float(nr, &val, 0)) {
+ if (nr && !split_parse_distr(nr, &val, ¢er)) {
log_err("fio: file service type random postfix parsing failed\n");
free(nr);
return 1;
free(nr);
+ if (center != -1 && (center < 0.00 || center > 1.00)) {
+ log_err("fio: distribution center out of range (0 <= center <= 1.0)\n");
+ return 1;
+ }
+ td->random_center = center;
+
switch (td->o.file_service_type) {
case FIO_FSERVICE_ZIPF:
if (val == 1.00) {
}
#endif
-static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir,
- char *str, bool absolute)
+static int zone_split_ddir(struct thread_options *o, void *eo,
+ enum fio_ddir ddir, char *str, bool absolute)
{
unsigned int i, perc, perc_missing, sperc, sperc_missing;
struct split split;
}
str += strlen(pre);
- ret = str_split_parse(td, str, zone_split_ddir, absolute);
+ ret = str_split_parse(td, str, zone_split_ddir, NULL, absolute);
free(p);
{
struct thread_data *td = cb_data_to_td(data);
double val;
+ double center = -1;
char *nr;
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
return 0;
nr = get_opt_postfix(str);
- if (nr && !str_to_float(nr, &val, 0)) {
+ if (nr && !split_parse_distr(nr, &val, ¢er)) {
log_err("fio: random postfix parsing failed\n");
free(nr);
return 1;
free(nr);
+ if (center != -1 && (center < 0.00 || center > 1.00)) {
+ log_err("fio: distribution center out of range (0 <= center <= 1.0)\n");
+ return 1;
+ }
+ td->o.random_center.u.f = center;
+
if (td->o.random_distribution == FIO_RAND_DIST_ZIPF) {
if (val == 1.00) {
log_err("fio: zipf theta must different than 1.0\n");
}
/*
- * Returns the directory at the index, indexes > entires will be
+ * Returns the directory at the index, indexes > entries will be
* assigned via modulo division of the index
*/
int set_name_idx(char *target, size_t tlen, char *input, int index,
int ret;
/* FIXME: for now buffer pattern does not support formats */
- ret = parse_and_fill_pattern(input, strlen(input), td->o.buffer_pattern,
- MAX_PATTERN_SIZE, NULL, NULL, NULL);
+ ret = parse_and_fill_pattern_alloc(input, strlen(input),
+ &td->o.buffer_pattern, NULL, NULL, NULL);
if (ret < 0)
return 1;
struct thread_data *td = cb_data_to_td(data);
int ret;
- td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt);
- ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern,
- MAX_PATTERN_SIZE, fmt_desc,
- td->o.verify_fmt, &td->o.verify_fmt_sz);
+ td->o.verify_fmt_sz = FIO_ARRAY_SIZE(td->o.verify_fmt);
+ ret = parse_and_fill_pattern_alloc(input, strlen(input),
+ &td->o.verify_pattern, fmt_desc, td->o.verify_fmt,
+ &td->o.verify_fmt_sz);
if (ret < 0)
return 1;
int val = *il;
/*
- * Only modfiy options if gtod_reduce==1
+ * Only modify options if gtod_reduce==1
* Otherwise leave settings alone.
*/
if (val) {
return 0;
}
-static int str_offset_cb(void *data, unsigned long long *__val)
+static int str_offset_cb(void *data, long long *__val)
{
struct thread_data *td = cb_data_to_td(data);
unsigned long long v = *__val;
if (parse_is_percent(v)) {
td->o.start_offset = 0;
td->o.start_offset_percent = -1ULL - v;
+ td->o.start_offset_nz = 0;
dprint(FD_PARSE, "SET start_offset_percent %d\n",
td->o.start_offset_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.start_offset = 0;
+ td->o.start_offset_percent = 0;
+ td->o.start_offset_nz = v - ZONE_BASE_VAL;
} else
td->o.start_offset = v;
return 0;
}
-static int str_offset_increment_cb(void *data, unsigned long long *__val)
+static int str_offset_increment_cb(void *data, long long *__val)
{
struct thread_data *td = cb_data_to_td(data);
unsigned long long v = *__val;
if (parse_is_percent(v)) {
td->o.offset_increment = 0;
td->o.offset_increment_percent = -1ULL - v;
+ td->o.offset_increment_nz = 0;
dprint(FD_PARSE, "SET offset_increment_percent %d\n",
td->o.offset_increment_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.offset_increment = 0;
+ td->o.offset_increment_percent = 0;
+ td->o.offset_increment_nz = v - ZONE_BASE_VAL;
} else
td->o.offset_increment = v;
return 0;
}
-static int str_size_cb(void *data, unsigned long long *__val)
+static int str_size_cb(void *data, long long *__val)
{
struct thread_data *td = cb_data_to_td(data);
unsigned long long v = *__val;
td->o.size_percent = -1ULL - v;
dprint(FD_PARSE, "SET size_percent %d\n",
td->o.size_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.size = 0;
+ td->o.size_percent = 0;
+ td->o.size_nz = v - ZONE_BASE_VAL;
} else
td->o.size = v;
}
dprint(FD_PARSE, "SET io_size_percent %d\n",
td->o.io_size_percent);
+ } else if (parse_is_zone(v)) {
+ td->o.io_size = 0;
+ td->o.io_size_percent = 0;
+ td->o.io_size_nz = v - ZONE_BASE_VAL;
} else
td->o.io_size = v;
return 0;
}
+/*
+ * Option callback for "zoneskip". A value that parse_is_zone() recognizes
+ * is stored as a zone count in zone_skip_nz (after subtracting
+ * ZONE_BASE_VAL) with zone_skip cleared; any other value is stored
+ * directly in zone_skip. Always returns 0.
+ */
+static int str_zoneskip_cb(void *data, long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (!parse_is_zone(v)) {
+		td->o.zone_skip = v;
+		return 0;
+	}
+
+	td->o.zone_skip = 0;
+	td->o.zone_skip_nz = v - ZONE_BASE_VAL;
+
+	return 0;
+}
+
static int str_write_bw_log_cb(void *data, const char *str)
{
struct thread_data *td = cb_data_to_td(data);
.lname = "Filename(s)",
.type = FIO_OPT_STR_STORE,
.off1 = offsetof(struct thread_options, filename),
+ .maxlen = PATH_MAX,
.cb = str_filename_cb,
.prio = -1, /* must come after "directory" */
.help = "File(s) to use for the workload",
.oval = TD_DDIR_TRIMWRITE,
.help = "Trim and write mix, trims preceding writes"
},
+ { .ival = "randtrimwrite",
+ .oval = TD_DDIR_RANDTRIMWRITE,
+ .help = "Randomly trim and write mix, trims preceding writes"
+ },
},
},
{
.help = "RDMA IO engine",
},
#endif
+#ifdef CONFIG_LIBRPMA_APM
+ { .ival = "librpma_apm",
+ .help = "librpma IO engine in APM mode",
+ },
+#endif
+#ifdef CONFIG_LIBRPMA_GPSPM
+ { .ival = "librpma_gpspm",
+ .help = "librpma IO engine in GPSPM mode",
+ },
+#endif
#ifdef CONFIG_LINUX_EXT4_MOVE_EXTENT
{ .ival = "e4defrag",
.help = "ext4 defrag engine",
.help = "Hadoop Distributed Filesystem (HDFS) engine"
},
#endif
-#ifdef CONFIG_PMEMBLK
- { .ival = "pmemblk",
- .help = "PMDK libpmemblk based IO engine",
- },
-
-#endif
#ifdef CONFIG_IME
{ .ival = "ime_psync",
.help = "DDN's IME synchronous IO engine",
{ .ival = "nbd",
.help = "Network Block Device (NBD) IO engine"
},
+#ifdef CONFIG_DFS
+ { .ival = "dfs",
+ .help = "DAOS File System (dfs) IO engine",
+ },
+#endif
+#ifdef CONFIG_LIBNFS
+ { .ival = "nfs",
+ .help = "NFS IO engine",
+ },
+#endif
+#ifdef CONFIG_LIBXNVME
+ { .ival = "xnvme",
+ .help = "XNVME IO engine",
+ },
+#endif
},
},
{
{
.name = "size",
.lname = "Size",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_size_cb,
.off1 = offsetof(struct thread_options, size),
.help = "Total size of device or files",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
.name = "io_size",
.alias = "io_limit",
.lname = "IO Size",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_io_size_cb,
.off1 = offsetof(struct thread_options, io_size),
.help = "Total size of I/O to be performed",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
.name = "offset",
.lname = "IO offset",
.alias = "fileoffset",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_offset_cb,
.off1 = offsetof(struct thread_options, start_offset),
.help = "Start IO from this offset",
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
{
.name = "offset_increment",
.lname = "IO offset increment",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
.cb = str_offset_increment_cb,
.off1 = offsetof(struct thread_options, offset_increment),
.help = "What is the increment from one offset to the next",
.parent = "offset",
.hide = 1,
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "num_range",
+ .lname = "Number of ranges",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, num_range),
+ .maxval = MAX_TRIM_RANGE,
+ .help = "Number of ranges for trim command",
+ .def = "1",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "bs",
.lname = "Block size",
},
{
.name = "randrepeat",
+ .alias = "allrandrepeat",
.lname = "Random repeatable",
.type = FIO_OPT_BOOL,
.off1 = offsetof(struct thread_options, rand_repeatable),
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_RANDOM,
},
- {
- .name = "allrandrepeat",
- .lname = "All Random Repeat",
- .type = FIO_OPT_BOOL,
- .off1 = offsetof(struct thread_options, allrand_repeatable),
- .help = "Use repeatable random numbers for everything",
- .def = "0",
- .category = FIO_OPT_C_IO,
- .group = FIO_OPT_G_RANDOM,
- },
{
.name = "nrfiles",
.lname = "Number of files",
.oval = F_ADV_SEQUENTIAL,
.help = "Advise using FADV_SEQUENTIAL",
},
+#ifdef POSIX_FADV_NOREUSE
+ { .ival = "noreuse",
+ .oval = F_ADV_NOREUSE,
+ .help = "Advise using FADV_NOREUSE",
+ },
+#endif
},
.help = "Use fadvise() to advise the kernel on IO pattern",
.def = "1",
{
.name = "zoneskip",
.lname = "Zone skip",
- .type = FIO_OPT_STR_VAL,
+ .type = FIO_OPT_STR_VAL_ZONE,
+ .cb = str_zoneskip_cb,
.off1 = offsetof(struct thread_options, zone_skip),
.help = "Space between IO zones",
.def = "0",
- .interval = 1024 * 1024,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_ZONE,
},
.lname = "Per device/file maximum number of open zones",
.type = FIO_OPT_INT,
.off1 = offsetof(struct thread_options, max_open_zones),
- .maxval = ZBD_MAX_OPEN_ZONES,
+ .maxval = ZBD_MAX_WRITE_ZONES,
.help = "Limit on the number of simultaneously opened sequential write zones with zonemode=zbd",
.def = "0",
.category = FIO_OPT_C_IO,
.lname = "Job maximum number of open zones",
.type = FIO_OPT_INT,
.off1 = offsetof(struct thread_options, job_max_open_zones),
- .maxval = ZBD_MAX_OPEN_ZONES,
+ .maxval = ZBD_MAX_WRITE_ZONES,
.help = "Limit on the number of simultaneously opened sequential write zones with zonemode=zbd by one thread/process",
.def = "0",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "ignore_zone_limits",
+ .lname = "Ignore zone resource limits",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, ignore_zone_limits),
+ .def = "0",
+ .help = "Ignore the zone resource limits (max open/active zones) reported by the device",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "zone_reset_threshold",
.lname = "Zone reset threshold",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_ZONE,
},
+ {
+ .name = "fdp",
+ .lname = "Flexible data placement",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, fdp),
+ .help = "Use Data placement directive (FDP)",
+ .def = "0",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "dataplacement",
+ .alias = "data_placement",
+ .lname = "Data Placement interface",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, dp_type),
+ .help = "Data Placement interface to use",
+ .def = "none",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ .posval = {
+ { .ival = "none",
+ .oval = FIO_DP_NONE,
+ .help = "Do not specify a data placement interface",
+ },
+ { .ival = "fdp",
+ .oval = FIO_DP_FDP,
+ .help = "Use Flexible Data Placement interface",
+ },
+ { .ival = "streams",
+ .oval = FIO_DP_STREAMS,
+ .help = "Use Streams interface",
+ },
+ },
+ },
+ {
+ .name = "plid_select",
+ .alias = "fdp_pli_select",
+ .lname = "Data Placement ID selection strategy",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, dp_id_select),
+ .help = "Strategy for selecting next Data Placement ID",
+ .def = "roundrobin",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ .posval = {
+ { .ival = "random",
+ .oval = FIO_DP_RANDOM,
+ .help = "Choose a Placement ID at random (uniform)",
+ },
+ { .ival = "roundrobin",
+ .oval = FIO_DP_RR,
+ .help = "Round robin select Placement IDs",
+ },
+ },
+ },
+ {
+ .name = "plids",
+ .alias = "fdp_pli",
+ .lname = "Stream IDs/Data Placement ID indices",
+ .type = FIO_OPT_STR,
+ .cb = str_fdp_pli_cb,
+ .off1 = offsetof(struct thread_options, dp_ids),
+ .help = "Sets which Data Placement ids to use (defaults to all for FDP)",
+ .hide = 1,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "lockmem",
.lname = "Lock memory",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_CRED,
},
+ {
+ .name = "priohint",
+ .lname = "I/O nice priority hint",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, ioprio_hint),
+ .help = "Set job IO priority hint",
+ .minval = IOPRIO_MIN_PRIO_HINT,
+ .maxval = IOPRIO_MAX_PRIO_HINT,
+ .interval = 1,
+ .category = FIO_OPT_C_GENERAL,
+ .group = FIO_OPT_G_CRED,
+ },
#else
{
.name = "prioclass",
.type = FIO_OPT_UNSUPPORTED,
.help = "Your platform does not support IO priority classes",
},
+ {
+ .name = "priohint",
+ .lname = "I/O nice priority hint",
+ .type = FIO_OPT_UNSUPPORTED,
+ .help = "Your platform does not support IO priority hints",
+ },
#endif
{
.name = "thinktime",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_THINKTIME,
},
+ {
+ .name = "thinkcycles",
+ .lname = "Think cycles",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, thinkcycles),
+ .help = "Spin for a constant amount of cycles between requests",
+ .def = "0",
+ .parent = "thinktime",
+ .hide = 1,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_THINKTIME,
+ },
{
.name = "thinktime_blocks",
.lname = "Thinktime blocks",
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_THINKTIME,
},
+ {
+ .name = "thinktime_blocks_type",
+ .lname = "Thinktime blocks type",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, thinktime_blocks_type),
+ .help = "How thinktime_blocks takes effect",
+ .def = "complete",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_THINKTIME,
+ .posval = {
+ { .ival = "complete",
+ .oval = THINKTIME_BLOCKS_TYPE_COMPLETE,
+ .help = "thinktime_blocks takes effect at the completion side",
+ },
+ {
+ .ival = "issue",
+ .oval = THINKTIME_BLOCKS_TYPE_ISSUE,
+ .help = "thinktime_blocks takes effect at the issue side",
+ },
+ },
+ .parent = "thinktime",
+ },
+ {
+ .name = "thinktime_iotime",
+ .lname = "Thinktime interval",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, thinktime_iotime),
+ .help = "IO time interval between 'thinktime'",
+ .def = "0",
+ .parent = "thinktime",
+ .hide = 1,
+ .is_seconds = 1,
+ .is_time = 1,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_THINKTIME,
+ },
{
.name = "rate",
.lname = "I/O rate",
{
.name = "max_latency",
.lname = "Max Latency (usec)",
- .type = FIO_OPT_STR_VAL_TIME,
- .off1 = offsetof(struct thread_options, max_latency),
+ .type = FIO_OPT_ULL,
+ .off1 = offsetof(struct thread_options, max_latency[DDIR_READ]),
+ .off2 = offsetof(struct thread_options, max_latency[DDIR_WRITE]),
+ .off3 = offsetof(struct thread_options, max_latency[DDIR_TRIM]),
.help = "Maximum tolerated IO latency (usec)",
.is_time = 1,
.category = FIO_OPT_C_IO,
{
.name = "sync",
.lname = "Synchronous I/O",
- .type = FIO_OPT_BOOL,
+ .type = FIO_OPT_STR,
.off1 = offsetof(struct thread_options, sync_io),
- .help = "Use O_SYNC for buffered writes",
- .def = "0",
- .parent = "buffered",
+ .help = "Use synchronous write IO",
+ .def = "none",
.hide = 1,
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_IO_TYPE,
+ .posval = {
+ { .ival = "none",
+ .oval = 0,
+ },
+ { .ival = "0",
+ .oval = 0,
+ },
+ { .ival = "sync",
+ .oval = O_SYNC,
+ },
+ { .ival = "1",
+ .oval = O_SYNC,
+ },
+#ifdef O_DSYNC
+ { .ival = "dsync",
+ .oval = O_DSYNC,
+ },
+#endif
+ },
},
#ifdef FIO_HAVE_WRITE_HINT
{
.category = FIO_OPT_C_LOG,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "log_entries",
+ .lname = "Log entries",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, log_entries),
+ .help = "Initial number of entries in a job IO log",
+ .def = __fio_stringify(DEF_LOG_ENTRIES),
+ .minval = DEF_LOG_ENTRIES,
+ .maxval = MAX_LOG_ENTRIES,
+ .category = FIO_OPT_C_LOG,
+ .group = FIO_OPT_G_INVALID,
+ },
{
.name = "log_avg_msec",
.lname = "Log averaging (msec)",
.group = FIO_OPT_G_INVALID,
},
{
- .name = "log_max_value",
- .lname = "Log maximum instead of average",
- .type = FIO_OPT_BOOL,
+ .name = "log_window_value",
+ .alias = "log_max_value",
+ .lname = "Log maximum, average or both values",
+ .type = FIO_OPT_STR,
.off1 = offsetof(struct thread_options, log_max),
- .help = "Log max sample in a window instead of average",
- .def = "0",
+ .help = "Log max, average or both sample in a window",
+ .def = "avg",
.category = FIO_OPT_C_LOG,
.group = FIO_OPT_G_INVALID,
+ .posval = {
+ { .ival = "avg",
+ .oval = IO_LOG_SAMPLE_AVG,
+ .help = "Log average value over the window",
+ },
+ { .ival = "max",
+ .oval = IO_LOG_SAMPLE_MAX,
+ .help = "Log maximum value in the window",
+ },
+ { .ival = "both",
+ .oval = IO_LOG_SAMPLE_BOTH,
+ .help = "Log both average and maximum values over the window"
+ },
+ /* Compatibility with former boolean values */
+ { .ival = "0",
+ .oval = IO_LOG_SAMPLE_AVG,
+ .help = "Alias for 'avg'",
+ },
+ { .ival = "1",
+ .oval = IO_LOG_SAMPLE_MAX,
+ .help = "Alias for 'max'",
+ },
+ },
},
{
.name = "log_offset",
.category = FIO_OPT_C_LOG,
.group = FIO_OPT_G_INVALID,
},
+ {
+ .name = "log_prio",
+ .lname = "Log priority of IO",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, log_prio),
+ .help = "Include priority value of IO for each log entry",
+ .def = "0",
+ .category = FIO_OPT_C_LOG,
+ .group = FIO_OPT_G_INVALID,
+ },
#ifdef CONFIG_ZLIB
{
.name = "log_compression",
},
#endif
{
- .name = "log_unix_epoch",
- .lname = "Log epoch unix",
+ .name = "log_alternate_epoch",
+ .alias = "log_unix_epoch",
+ .lname = "Log epoch alternate",
.type = FIO_OPT_BOOL,
- .off1 = offsetof(struct thread_options, log_unix_epoch),
- .help = "Use Unix time in log files",
+ .off1 = offsetof(struct thread_options, log_alternate_epoch),
+ .help = "Use alternate epoch time in log files. Uses the same epoch as that is used by clock_gettime with specified log_alternate_epoch_clock_id.",
+ .category = FIO_OPT_C_LOG,
+ .group = FIO_OPT_G_INVALID,
+ },
+ {
+ .name = "log_alternate_epoch_clock_id",
+ .lname = "Log alternate epoch clock_id",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, log_alternate_epoch_clock_id),
+ .help = "If log_alternate_epoch is true, this option specifies the clock_id from clock_gettime whose epoch should be used. If log_alternate_epoch is false, this option has no effect. Default value is 0, or CLOCK_REALTIME",
.category = FIO_OPT_C_LOG,
.group = FIO_OPT_G_INVALID,
},
.category = FIO_OPT_C_IO,
.group = FIO_OPT_G_IO_BUF,
},
+ {
+ .name = "dedupe_global",
+ .lname = "Global deduplication",
+ .type = FIO_OPT_BOOL,
+ .off1 = offsetof(struct thread_options, dedupe_global),
+ .help = "Share deduplication buffers across jobs",
+ .def = "0",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_IO_BUF,
+ },
+ {
+ .name = "dedupe_mode",
+ .lname = "Dedupe mode",
+ .help = "Mode for the deduplication buffer generation",
+ .type = FIO_OPT_STR,
+ .off1 = offsetof(struct thread_options, dedupe_mode),
+ .parent = "dedupe_percentage",
+ .def = "repeat",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_IO_BUF,
+ .posval = {
+ { .ival = "repeat",
+ .oval = DEDUPE_MODE_REPEAT,
+ .help = "repeat previous page",
+ },
+ { .ival = "working_set",
+ .oval = DEDUPE_MODE_WORKING_SET,
+ .help = "choose a page randomly from limited working set defined in dedupe_working_set_percentage",
+ },
+ },
+ },
+ {
+ .name = "dedupe_working_set_percentage",
+ .lname = "Dedupe working set percentage",
+ .help = "Dedupe working set size in percentages from file or device size used to generate dedupe patterns from",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, dedupe_working_set_percentage),
+ .parent = "dedupe_percentage",
+ .def = "5",
+ .maxval = 100,
+ .minval = 0,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_IO_BUF,
+ },
{
.name = "clat_percentiles",
.lname = "Completion latency percentiles",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_CLOCK,
},
+ /*
+  * "job_start_clock_id": integer option stored in
+  * thread_options.job_start_clock_id (see the help text for how the
+  * value is used).
+  */
+ {
+ .name = "job_start_clock_id",
+ .lname = "Job start clock_id",
+ .type = FIO_OPT_INT,
+ .off1 = offsetof(struct thread_options, job_start_clock_id),
+ .help = "The clock_id passed to the call to clock_gettime used to record job_start in the json output format. Default is 0, or CLOCK_REALTIME",
+ /*
+  * NOTE(review): gtod_cpu_verify is named after the gtod_cpu option;
+  * confirm this verify hook is really intended for a clock_id option
+  * and is not a copy-paste leftover.
+  */
+ .verify = gtod_cpu_verify,
+ .category = FIO_OPT_C_GENERAL,
+ .group = FIO_OPT_G_CLOCK,
+ },
{
.name = "unified_rw_reporting",
.lname = "Unified RW Reporting",
- .type = FIO_OPT_BOOL,
+ .type = FIO_OPT_STR,
.off1 = offsetof(struct thread_options, unified_rw_rep),
.help = "Unify reporting across data direction",
- .def = "0",
+ .def = "none",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_INVALID,
+ .posval = {
+ { .ival = "none",
+ .oval = UNIFIED_SPLIT,
+ .help = "Normal statistics reporting",
+ },
+ { .ival = "mixed",
+ .oval = UNIFIED_MIXED,
+ .help = "Statistics are summed per data direction and reported together",
+ },
+ { .ival = "both",
+ .oval = UNIFIED_BOTH,
+ .help = "Statistics are reported normally, followed by the mixed statistics"
+ },
+ /* Compatibility with former boolean values */
+ { .ival = "0",
+ .oval = UNIFIED_SPLIT,
+ .help = "Alias for 'none'",
+ },
+ { .ival = "1",
+ .oval = UNIFIED_MIXED,
+ .help = "Alias for 'mixed'",
+ },
+ { .ival = "2",
+ .oval = UNIFIED_BOTH,
+ .help = "Alias for 'both'",
+ },
+ },
},
{
.name = "continue_on_error",
.category = FIO_OPT_C_GENERAL,
.group = FIO_OPT_G_RUNTIME,
},
+ {
+ .name = "steadystate_check_interval",
+ .lname = "Steady state check interval",
+ .alias = "ss_interval",
+ .parent = "steadystate",
+ .type = FIO_OPT_STR_VAL_TIME,
+ .off1 = offsetof(struct thread_options, ss_check_interval),
+ .help = "Polling interval for the steady state check (too low means steadystate will not converge)",
+ .def = "1",
+ .is_seconds = 1,
+ .is_time = 1,
+ .category = FIO_OPT_C_GENERAL,
+ .group = FIO_OPT_G_RUNTIME,
+ },
{
.name = NULL,
},
sprintf(buf, "%llu", mb_memory);
fio_keywords[1].replace = strdup(buf);
- l = cpus_online();
+ l = cpus_configured();
sprintf(buf, "%lu", l);
fio_keywords[2].replace = strdup(buf);
}
struct fio_keyword *kw = &fio_keywords[i];
while ((s = strstr(opt, kw->word)) != NULL) {
- char *new = malloc(strlen(opt) + 1);
+ char *new = calloc(strlen(opt) + 1, 1);
char *o_org = opt;
int olen = s - opt;
int len;
* If there's more in the original string, copy that
* in too
*/
- opt += strlen(kw->word) + olen;
+ opt += olen + strlen(kw->word);
+ /* keeps final zero thanks to calloc */
if (strlen(opt))
- memcpy(new + olen + len, opt, opt - o_org - 1);
+ memcpy(new + olen + len, opt, strlen(opt));
/*
* replace opt and free the old opt
}
}
+/*
+ * Drain td->opt_list: unlink each print_option from the list and free
+ * its name, its value and the entry itself, until the list is empty.
+ */
+void fio_dump_options_free(struct thread_data *td)
+{
+	struct flist_head *head = &td->opt_list;
+	struct print_option *entry;
+
+	for (;;) {
+		if (flist_empty(head))
+			break;
+
+		entry = flist_first_entry(head, struct print_option, list);
+		/* unlink first so the next iteration sees the following entry */
+		flist_del_init(&entry->list);
+		free(entry->name);
+		free(entry->value);
+		free(entry);
+	}
+}
+
struct fio_option *fio_option_find(const char *name)
{
return find_option(fio_options, name);
void del_opt_posval(const char *, const char *);
struct thread_data;
void fio_options_free(struct thread_data *);
+void fio_dump_options_free(struct thread_data *);
char *get_next_str(char **ptr);
int get_max_str_idx(char *input);
char* get_name_by_idx(char *input, int index);
#include <linux/fs.h>
#include <linux/types.h>
-#ifdef __cplusplus
-extern "C" {
-#endif
-
/*
* IO submission data structure (Submission Queue Entry)
*/
union {
__u64 off; /* offset into file */
__u64 addr2;
+ __u32 cmd_op;
};
union {
__u64 addr; /* pointer to buffer or iovecs */
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
+ __u32 rename_flags;
+ __u32 unlink_flags;
+ __u32 hardlink_flags;
+ __u32 uring_cmd_flags;
};
__u64 user_data; /* data to be passed back at completion time */
+ /* pack this to avoid bogus arm OABI complaints */
+ union {
+ /* index into fixed buffers, if used */
+ __u16 buf_index;
+ /* for grouped buffer selection */
+ __u16 buf_group;
+ } __attribute__((packed));
+ /* personality to use, if used */
+ __u16 personality;
+ union {
+ __s32 splice_fd_in;
+ __u32 file_index;
+ };
union {
struct {
- /* pack this to avoid bogus arm OABI complaints */
- union {
- /* index into fixed buffers, if used */
- __u16 buf_index;
- /* for grouped buffer selection */
- __u16 buf_group;
- } __attribute__((packed));
- /* personality to use, if used */
- __u16 personality;
- __s32 splice_fd_in;
+ __u64 addr3;
+ __u64 __pad2[1];
};
- __u64 __pad2[3];
+ /*
+ * If the ring is initialized with IORING_SETUP_SQE128, then
+ * this field is used for 80 bytes of arbitrary command data
+ */
+ __u8 cmd[0];
};
};
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
IOSQE_BUFFER_SELECT_BIT,
+ IOSQE_CQE_SKIP_SUCCESS_BIT,
};
/*
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* select buffer from sqe->buf_group */
#define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT)
+/* don't post CQE if request succeeded */
+#define IOSQE_CQE_SKIP_SUCCESS (1U << IOSQE_CQE_SKIP_SUCCESS_BIT)
/*
* io_uring_setup() flags
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
+#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
+#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN (1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Only valid together with
+ * COOP_TASKRUN or DEFER_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
+
+#define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */
+#define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */
+
+/*
+ * Only one task is allowed to submit requests
+ */
+#define IORING_SETUP_SINGLE_ISSUER (1U << 12)
+
+/*
+ * Defer running task work to get events.
+ * Rather than running bits of task work whenever the task transitions
+ * try to do it just before it is needed.
+ */
+#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
enum {
IORING_OP_NOP,
IORING_OP_PROVIDE_BUFFERS,
IORING_OP_REMOVE_BUFFERS,
IORING_OP_TEE,
+ IORING_OP_SHUTDOWN,
+ IORING_OP_RENAMEAT,
+ IORING_OP_UNLINKAT,
+ IORING_OP_MKDIRAT,
+ IORING_OP_SYMLINKAT,
+ IORING_OP_LINKAT,
+ IORING_OP_MSG_RING,
+ IORING_OP_FSETXATTR,
+ IORING_OP_SETXATTR,
+ IORING_OP_FGETXATTR,
+ IORING_OP_GETXATTR,
+ IORING_OP_SOCKET,
+ IORING_OP_URING_CMD,
+
/* this goes last, obviously */
IORING_OP_LAST,
};
+/*
+ * sqe->uring_cmd_flags
+ * IORING_URING_CMD_FIXED use registered buffer; pass this flag
+ * along with setting sqe->buf_index.
+ */
+#define IORING_URING_CMD_FIXED (1U << 0)
+
/*
* sqe->fsync_flags
*/
/*
* sqe->timeout_flags
*/
-#define IORING_TIMEOUT_ABS (1U << 0)
-
+#define IORING_TIMEOUT_ABS (1U << 0)
+#define IORING_TIMEOUT_UPDATE (1U << 1)
+#define IORING_TIMEOUT_BOOTTIME (1U << 2)
+#define IORING_TIMEOUT_REALTIME (1U << 3)
+#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
+#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
+#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
+#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
* sqe->splice_flags
* extends splice(2) flags
*/
#define SPLICE_F_FD_IN_FIXED (1U << 31) /* the last bit of __u32 */
+/*
+ * POLL_ADD flags. Note that since sqe->poll_events is the flag space, the
+ * command flags for POLL_ADD are stored in sqe->len.
+ *
+ * IORING_POLL_ADD_MULTI Multishot poll. Sets IORING_CQE_F_MORE if
+ * the poll handler will continue to report
+ * CQEs on behalf of the same SQE.
+ *
+ * IORING_POLL_UPDATE Update existing poll request, matching
+ * sqe->addr as the old user_data field.
+ */
+#define IORING_POLL_ADD_MULTI (1U << 0)
+#define IORING_POLL_UPDATE_EVENTS (1U << 1)
+#define IORING_POLL_UPDATE_USER_DATA (1U << 2)
+
/*
* IO completion data structure (Completion Queue Entry)
*/
__u64 user_data; /* sqe->data submission passed back */
__s32 res; /* result code for this event */
__u32 flags;
+
+ /*
+ * If the ring is initialized with IORING_SETUP_CQE32, then this field
+ * contains 16-bytes of padding, doubling the size of the CQE.
+ */
+ __u64 big_cqe[];
};
/*
* cqe->flags
*
* IORING_CQE_F_BUFFER If set, the upper 16 bits are the buffer ID
+ * IORING_CQE_F_MORE If set, parent SQE will generate more CQE entries
*/
#define IORING_CQE_F_BUFFER (1U << 0)
+#define IORING_CQE_F_MORE (1U << 1)
enum {
IORING_CQE_BUFFER_SHIFT = 16,
/*
* io_uring_enter(2) flags
*/
-#define IORING_ENTER_GETEVENTS (1U << 0)
-#define IORING_ENTER_SQ_WAKEUP (1U << 1)
+#define IORING_ENTER_GETEVENTS (1U << 0)
+#define IORING_ENTER_SQ_WAKEUP (1U << 1)
+#define IORING_ENTER_SQ_WAIT (1U << 2)
+#define IORING_ENTER_EXT_ARG (1U << 3)
+#define IORING_ENTER_REGISTERED_RING (1U << 4)
/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
#define IORING_FEAT_FAST_POLL (1U << 5)
#define IORING_FEAT_POLL_32BITS (1U << 6)
+#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
+#define IORING_FEAT_EXT_ARG (1U << 8)
+#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
+#define IORING_FEAT_RSRC_TAGS (1U << 10)
+#define IORING_FEAT_CQE_SKIP (1U << 11)
/*
* io_uring_register(2) opcodes and arguments
*/
-#define IORING_REGISTER_BUFFERS 0
-#define IORING_UNREGISTER_BUFFERS 1
-#define IORING_REGISTER_FILES 2
-#define IORING_UNREGISTER_FILES 3
-#define IORING_REGISTER_EVENTFD 4
-#define IORING_UNREGISTER_EVENTFD 5
-#define IORING_REGISTER_FILES_UPDATE 6
-#define IORING_REGISTER_EVENTFD_ASYNC 7
-#define IORING_REGISTER_PROBE 8
-#define IORING_REGISTER_PERSONALITY 9
-#define IORING_UNREGISTER_PERSONALITY 10
+enum {
+ IORING_REGISTER_BUFFERS = 0,
+ IORING_UNREGISTER_BUFFERS = 1,
+ IORING_REGISTER_FILES = 2,
+ IORING_UNREGISTER_FILES = 3,
+ IORING_REGISTER_EVENTFD = 4,
+ IORING_UNREGISTER_EVENTFD = 5,
+ IORING_REGISTER_FILES_UPDATE = 6,
+ IORING_REGISTER_EVENTFD_ASYNC = 7,
+ IORING_REGISTER_PROBE = 8,
+ IORING_REGISTER_PERSONALITY = 9,
+ IORING_UNREGISTER_PERSONALITY = 10,
+ IORING_REGISTER_RESTRICTIONS = 11,
+ IORING_REGISTER_ENABLE_RINGS = 12,
+
+ /* extended with tagging */
+ IORING_REGISTER_FILES2 = 13,
+ IORING_REGISTER_FILES_UPDATE2 = 14,
+ IORING_REGISTER_BUFFERS2 = 15,
+ IORING_REGISTER_BUFFERS_UPDATE = 16,
+
+ /* set/clear io-wq thread affinities */
+ IORING_REGISTER_IOWQ_AFF = 17,
+ IORING_UNREGISTER_IOWQ_AFF = 18,
+
+ /* set/get max number of io-wq workers */
+ IORING_REGISTER_IOWQ_MAX_WORKERS = 19,
+
+ /* register/unregister io_uring fd with the ring */
+ IORING_REGISTER_RING_FDS = 20,
+ IORING_UNREGISTER_RING_FDS = 21,
+
+ /* this goes last */
+ IORING_REGISTER_LAST
+};
+/* io-wq worker categories */
+enum {
+ IO_WQ_BOUND,
+ IO_WQ_UNBOUND,
+};
+
+/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
__u32 offset;
__u32 resv;
__aligned_u64 /* __s32 * */ fds;
};
+struct io_uring_rsrc_register {
+ __u32 nr;
+ __u32 resv;
+ __u64 resv2;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+};
+
+struct io_uring_rsrc_update {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 data;
+};
+
+struct io_uring_rsrc_update2 {
+ __u32 offset;
+ __u32 resv;
+ __aligned_u64 data;
+ __aligned_u64 tags;
+ __u32 nr;
+ __u32 resv2;
+};
+
+/* Skip updating fd indexes set to this value in the fd table */
+#define IORING_REGISTER_FILES_SKIP (-2)
+
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
struct io_uring_probe_op ops[0];
};
-#ifdef __cplusplus
-}
-#endif
+struct io_uring_restriction {
+ __u16 opcode;
+ union {
+ __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
+ __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */
+ __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */
+ };
+ __u8 resv;
+ __u32 resv2[3];
+};
+
+/*
+ * io_uring_restriction->opcode values
+ */
+enum {
+ /* Allow an io_uring_register(2) opcode */
+ IORING_RESTRICTION_REGISTER_OP = 0,
+
+ /* Allow an sqe opcode */
+ IORING_RESTRICTION_SQE_OP = 1,
+
+ /* Allow sqe flags */
+ IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2,
+
+ /* Require sqe flags (these flags must be set on each submission) */
+ IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3,
+
+ IORING_RESTRICTION_LAST
+};
+
+/*
+ * NOTE(review): presumably the extended argument block handed to
+ * io_uring_enter(2) when IORING_ENTER_EXT_ARG is set -- confirm against
+ * the kernel UAPI header. The layout must not be changed.
+ */
+struct io_uring_getevents_arg {
+ __u64 sigmask;
+ __u32 sigmask_sz;
+ __u32 pad;
+ __u64 ts;
+};
#endif
#define FIO_USE_GENERIC_SWAP
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_invalidate_cache(struct fio_file *f)
{
return ENOTSUP;
+++ /dev/null
-#ifndef FIO_OS_ANDROID_H
-#define FIO_OS_ANDROID_H
-
-#define FIO_OS os_android
-
-#include <sys/ioctl.h>
-#include <sys/mman.h>
-#include <sys/uio.h>
-#include <sys/syscall.h>
-#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <sched.h>
-#include <linux/unistd.h>
-#include <linux/major.h>
-#include <asm/byteorder.h>
-
-#include "./os-linux-syscall.h"
-#include "../file.h"
-
-#ifndef __has_builtin // Optional of course.
- #define __has_builtin(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-#define FIO_HAVE_DISK_UTIL
-#define FIO_HAVE_IOSCHED_SWITCH
-#define FIO_HAVE_IOPRIO
-#define FIO_HAVE_IOPRIO_CLASS
-#define FIO_HAVE_ODIRECT
-#define FIO_HAVE_HUGETLB
-#define FIO_HAVE_BLKTRACE
-#define FIO_HAVE_CL_SIZE
-#define FIO_HAVE_CGROUPS
-#define FIO_HAVE_FS_STAT
-#define FIO_HAVE_TRIM
-#define FIO_HAVE_GETTID
-#define FIO_USE_GENERIC_INIT_RANDOM_STATE
-#define FIO_HAVE_E4_ENG
-#define FIO_HAVE_BYTEORDER_FUNCS
-#define FIO_HAVE_MMAP_HUGE
-#define FIO_NO_HAVE_SHM_H
-
-#define OS_MAP_ANON MAP_ANONYMOUS
-
-#ifndef POSIX_MADV_DONTNEED
-#define posix_madvise madvise
-#define POSIX_MADV_DONTNEED MADV_DONTNEED
-#define POSIX_MADV_SEQUENTIAL MADV_SEQUENTIAL
-#define POSIX_MADV_RANDOM MADV_RANDOM
-#endif
-
-#ifdef MADV_REMOVE
-#define FIO_MADV_FREE MADV_REMOVE
-#endif
-#ifndef MAP_HUGETLB
-#define MAP_HUGETLB 0x40000 /* arch specific */
-#endif
-
-#ifndef CONFIG_NO_SHM
-/*
- * Bionic doesn't support SysV shared memeory, so implement it using ashmem
- */
-#include <stdio.h>
-#include <linux/ashmem.h>
-#include <linux/shm.h>
-#define shmid_ds shmid64_ds
-#define SHM_HUGETLB 04000
-
-#define ASHMEM_DEVICE "/dev/ashmem"
-
-static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
-{
- int ret=0;
- if (__cmd == IPC_RMID)
- {
- int length = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
- struct ashmem_pin pin = {0 , length};
- ret = ioctl(__shmid, ASHMEM_UNPIN, &pin);
- close(__shmid);
- }
- return ret;
-}
-
-static inline int shmget(key_t __key, size_t __size, int __shmflg)
-{
- int fd,ret;
- char keybuf[11];
-
- fd = open(ASHMEM_DEVICE, O_RDWR);
- if (fd < 0)
- return fd;
-
- sprintf(keybuf,"%d",__key);
- ret = ioctl(fd, ASHMEM_SET_NAME, keybuf);
- if (ret < 0)
- goto error;
-
- /* Stores size in first 8 bytes, allocate extra space */
- ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t));
- if (ret < 0)
- goto error;
-
- return fd;
-
-error:
- close(fd);
- return ret;
-}
-
-static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg)
-{
- size_t size = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
- /* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */
- uint64_t *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
- /* Save size at beginning of buffer, for use with munmap */
- *ptr = size;
- return ptr + 1;
-}
-
-static inline int shmdt (const void *__shmaddr)
-{
- /* Find mmap size which we stored at the beginning of the buffer */
- uint64_t *ptr = (uint64_t *)__shmaddr - 1;
- size_t size = *ptr;
- return munmap(ptr, size);
-}
-#endif
-
-#define SPLICE_DEF_SIZE (64*1024)
-
-enum {
- IOPRIO_CLASS_NONE,
- IOPRIO_CLASS_RT,
- IOPRIO_CLASS_BE,
- IOPRIO_CLASS_IDLE,
-};
-
-enum {
- IOPRIO_WHO_PROCESS = 1,
- IOPRIO_WHO_PGRP,
- IOPRIO_WHO_USER,
-};
-
-#define IOPRIO_BITS 16
-#define IOPRIO_CLASS_SHIFT 13
-
-#define IOPRIO_MIN_PRIO 0 /* highest priority */
-#define IOPRIO_MAX_PRIO 7 /* lowest priority */
-
-#define IOPRIO_MIN_PRIO_CLASS 0
-#define IOPRIO_MAX_PRIO_CLASS 3
-
-static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
-{
- /*
- * If no class is set, assume BE
- */
- if (!ioprio_class)
- ioprio_class = IOPRIO_CLASS_BE;
-
- ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT;
- return syscall(__NR_ioprio_set, which, who, ioprio);
-}
-
-#ifndef BLKGETSIZE64
-#define BLKGETSIZE64 _IOR(0x12,114,size_t)
-#endif
-
-#ifndef BLKFLSBUF
-#define BLKFLSBUF _IO(0x12,97)
-#endif
-
-#ifndef BLKDISCARD
-#define BLKDISCARD _IO(0x12,119)
-#endif
-
-static inline int blockdev_invalidate_cache(struct fio_file *f)
-{
- return ioctl(f->fd, BLKFLSBUF);
-}
-
-static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
-{
- if (!ioctl(f->fd, BLKGETSIZE64, bytes))
- return 0;
-
- return errno;
-}
-
-static inline unsigned long long os_phys_mem(void)
-{
- long pagesize, pages;
-
- pagesize = sysconf(_SC_PAGESIZE);
- pages = sysconf(_SC_PHYS_PAGES);
- if (pages == -1 || pagesize == -1)
- return 0;
-
- return (unsigned long long) pages * (unsigned long long) pagesize;
-}
-
-#ifdef O_NOATIME
-#define FIO_O_NOATIME O_NOATIME
-#else
-#define FIO_O_NOATIME 0
-#endif
-
-/* Check for GCC or Clang byte swap intrinsics */
-#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \
- && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \
- || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */
-#define fio_swap16(x) __builtin_bswap16(x)
-#define fio_swap32(x) __builtin_bswap32(x)
-#define fio_swap64(x) __builtin_bswap64(x)
-#else
-#include <byteswap.h>
-#define fio_swap16(x) bswap_16(x)
-#define fio_swap32(x) bswap_32(x)
-#define fio_swap64(x) bswap_64(x)
-#endif /* fio_swapN */
-
-#define CACHE_LINE_FILE \
- "/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
-
-static inline int arch_cache_line_size(void)
-{
- char size[32];
- int fd, ret;
-
- fd = open(CACHE_LINE_FILE, O_RDONLY);
- if (fd < 0)
- return -1;
-
- ret = read(fd, size, sizeof(size));
-
- close(fd);
-
- if (ret <= 0)
- return -1;
- else
- return atoi(size);
-}
-
-static inline unsigned long long get_fs_free_size(const char *path)
-{
- unsigned long long ret;
- struct statfs s;
-
- if (statfs(path, &s) < 0)
- return -1ULL;
-
- ret = s.f_bsize;
- ret *= (unsigned long long) s.f_bfree;
- return ret;
-}
-
-static inline int os_trim(struct fio_file *f, unsigned long long start,
- unsigned long long len)
-{
- uint64_t range[2];
-
- range[0] = start;
- range[1] = len;
-
- if (!ioctl(f->fd, BLKDISCARD, range))
- return 0;
-
- return errno;
-}
-
-#ifdef CONFIG_SCHED_IDLE
-static inline int fio_set_sched_idle(void)
-{
- struct sched_param p = { .sched_priority = 0, };
- return sched_setscheduler(gettid(), SCHED_IDLE, &p);
-}
-#endif
-
-#endif
--- /dev/null
+#ifndef CONFIG_NO_SHM
+/*
+ * Bionic doesn't support SysV shared memory, so implement it using ashmem
+ */
+#include <stdio.h>
+#include <linux/ashmem.h>
+#include <linux/shm.h>
+#include <android/api-level.h>
+#ifdef CONFIG_ASHAREDMEMORY_CREATE
+#include <android/sharedmem.h>
+#else
+#define ASHMEM_DEVICE "/dev/ashmem"
+#endif
+#define shmid_ds shmid64_ds
+#define SHM_HUGETLB 04000
+
+/*
+ * Minimal shmctl() stand-in for ashmem-backed segments.
+ *
+ * Only IPC_RMID does anything: the region size is queried, the whole
+ * region is unpinned and the descriptor is closed. Any other command is
+ * a no-op that reports success. __buf is never used.
+ *
+ * Returns the result of the ASHMEM_UNPIN ioctl for IPC_RMID, 0 otherwise.
+ */
+static inline int shmctl(int __shmid, int __cmd, struct shmid_ds *__buf)
+{
+	int length, ret;
+
+	if (__cmd != IPC_RMID)
+		return 0;
+
+	length = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
+
+	/* unpin from offset 0 for the full length reported above */
+	struct ashmem_pin pin = {0 , length};
+
+	ret = ioctl(__shmid, ASHMEM_UNPIN, &pin);
+	close(__shmid);
+
+	return ret;
+}
+
+#ifdef CONFIG_ASHAREDMEMORY_CREATE
+/*
+ * shmget() stand-in built on ASharedMemory_create().
+ *
+ * The decimal form of __key is used as the region name and the region
+ * is created sizeof(uint64_t) bytes larger than requested, because the
+ * first 8 bytes of the mapping hold its size. __shmflg is ignored.
+ *
+ * Returns whatever ASharedMemory_create() returns.
+ */
+static inline int shmget(key_t __key, size_t __size, int __shmflg)
+{
+	/* "-2147483648" plus the terminating NUL needs 12 bytes */
+	char keybuf[12];
+
+	snprintf(keybuf, sizeof(keybuf), "%d", __key);
+
+	return ASharedMemory_create(keybuf, __size + sizeof(uint64_t));
+}
+#else
+/*
+ * shmget() stand-in built directly on the ashmem device.
+ *
+ * Opens ASHMEM_DEVICE, names the region after the decimal form of __key
+ * and sizes it sizeof(uint64_t) bytes larger than requested, because the
+ * first 8 bytes of the mapping hold its size. __shmflg is ignored.
+ *
+ * Returns the open descriptor on success. On failure returns the
+ * negative result of the failing open()/ioctl() call; the descriptor is
+ * closed if it had been opened.
+ */
+static inline int shmget(key_t __key, size_t __size, int __shmflg)
+{
+	int fd, ret;
+	/* "-2147483648" plus the terminating NUL needs 12 bytes */
+	char keybuf[12];
+
+	fd = open(ASHMEM_DEVICE, O_RDWR);
+	if (fd < 0)
+		return fd;
+
+	snprintf(keybuf, sizeof(keybuf), "%d", __key);
+	ret = ioctl(fd, ASHMEM_SET_NAME, keybuf);
+	if (ret < 0)
+		goto error;
+
+	/* Stores size in first 8 bytes, allocate extra space */
+	ret = ioctl(fd, ASHMEM_SET_SIZE, __size + sizeof(uint64_t));
+	if (ret < 0)
+		goto error;
+
+	return fd;
+
+error:
+	close(fd);
+	return ret;
+}
+#endif
+
+/*
+ * shmat() stand-in: map the whole ashmem region behind __shmid.
+ *
+ * The region size is written into the first 8 bytes of the mapping so
+ * that shmdt() can recover it for munmap(); the caller receives the
+ * address just past that header. __shmaddr and __shmflg are ignored.
+ *
+ * Returns the usable address, or (void *) -1 if the size query or the
+ * mapping fails (errno is left as set by the failing call).
+ */
+static inline void *shmat(int __shmid, const void *__shmaddr, int __shmflg)
+{
+	uint64_t *ptr;
+	size_t size;
+	int ret;
+
+	ret = ioctl(__shmid, ASHMEM_GET_SIZE, NULL);
+	if (ret < 0)
+		return (void *) -1;
+	size = ret;
+
+	/* Needs to be 8-byte aligned to prevent SIGBUS on 32-bit ARM */
+	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, __shmid, 0);
+	if (ptr == MAP_FAILED)
+		return (void *) -1;
+
+	/* Save size at beginning of buffer, for use with munmap */
+	*ptr = size;
+	return ptr + 1;
+}
+
+/*
+ * shmdt() stand-in: undo a mapping handed out by shmat().
+ *
+ * The mapping really starts one uint64_t before __shmaddr, and that
+ * leading word holds the mapping length that is passed to munmap().
+ *
+ * Returns the result of munmap().
+ */
+static inline int shmdt (const void *__shmaddr)
+{
+	uint64_t *base = (uint64_t *)__shmaddr;
+
+	/* step back onto the size header written at attach time */
+	base--;
+
+	return munmap(base, (size_t) *base);
+}
+#endif
/* No CPU_COUNT(), but use the default function defined in os/os.h */
#define fio_cpu_count(mask) CPU_COUNT((mask))
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int fio_cpuset_init(os_cpu_mask_t *mask)
{
CPUMASK_ASSZERO(*mask);
* ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro.
* Note that there is no idea of class within ioprio_set(2) unlike Linux.
*/
-#define ioprio_set(which, who, ioprio_class, ioprio) \
+#define ioprio_value(ioprio_class, ioprio, ioprio_hint) (ioprio)
+#define ioprio_set(which, who, ioprio_class, ioprio, ioprio_hint) \
ioprio_set(which, who, ioprio)
+#define ioprio(ioprio) (ioprio)
+
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
struct partinfo pi;
#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0)
#define fio_cpu_count(mask) CPU_COUNT((mask))
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int fio_cpuset_init(os_cpu_mask_t *mask)
{
CPU_ZERO(mask);
#define FIO_USE_GENERIC_SWAP
#define FIO_OS_HAVE_AIOCB_TYPEDEF
+
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
typedef struct aiocb64 os_aiocb_t;
static inline int blockdev_invalidate_cache(struct fio_file *f)
return ret;
}
-#define FIO_HAVE_CPU_ONLINE_SYSCONF
+#define FIO_HAVE_CPU_CONF_SYSCONF
-static inline unsigned int cpus_online(void)
+static inline unsigned int cpus_configured(void)
{
return mpctl(MPC_GETNUMSPUS, 0, NULL);
}
#define __NR_ioprio_get 31
#endif
+/* Linux syscalls for loongarch64 */
+#elif defined(ARCH_LOONGARCH64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 30
+#define __NR_ioprio_get 31
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64 223
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice 76
+#define __NR_sys_tee 77
+#define __NR_sys_vmsplice 75
+#endif
+
+/* Linux syscalls for riscv64 */
+#elif defined(ARCH_RISCV64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set 30
+#define __NR_ioprio_get 31
+#endif
#else
#warning "Unknown architecture"
#endif
#ifndef FIO_OS_LINUX_H
#define FIO_OS_LINUX_H
+#ifdef __ANDROID__
+#define FIO_OS os_android
+#else
#define FIO_OS os_linux
+#endif
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <errno.h>
#include <sched.h>
#include <linux/unistd.h>
-#include <linux/raw.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <scsi/sg.h>
+#include <asm/byteorder.h>
+#ifdef __ANDROID__
+#include "os-ashmem.h"
+#define FIO_NO_HAVE_SHM_H
+#endif
#ifdef ARCH_HAVE_CRC_CRYPTO
#include <sys/auxv.h>
+#ifndef HWCAP_PMULL
+#define HWCAP_PMULL (1 << 4)
+#endif /* HWCAP_PMULL */
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
#endif /* HWCAP_CRC32 */
#define FIO_HAVE_IOSCHED_SWITCH
#define FIO_HAVE_ODIRECT
#define FIO_HAVE_HUGETLB
-#define FIO_HAVE_RAWBIND
#define FIO_HAVE_BLKTRACE
#define FIO_HAVE_CL_SIZE
#define FIO_HAVE_CGROUPS
#define FIO_HAVE_TRIM
#define FIO_HAVE_GETTID
#define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_BYTEORDER_FUNCS
#define FIO_HAVE_PWRITEV2
#define FIO_HAVE_SHM_ATTACH_REMOVED
#define OS_MAP_ANON MAP_ANONYMOUS
-#define FIO_EXT_ENG_DIR "/usr/lib/fio"
+#define FIO_EXT_ENG_DIR "/usr/local/lib/fio"
typedef cpu_set_t os_cpu_mask_t;
sched_getaffinity((pid), (ptr))
#endif
-#define fio_cpu_clear(mask, cpu) (void) CPU_CLR((cpu), (mask))
-#define fio_cpu_set(mask, cpu) (void) CPU_SET((cpu), (mask))
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
+#define fio_cpu_clear(mask, cpu) CPU_CLR((cpu), (mask))
+#define fio_cpu_set(mask, cpu) CPU_SET((cpu), (mask))
#define fio_cpu_isset(mask, cpu) (CPU_ISSET((cpu), (mask)) != 0)
#define fio_cpu_count(mask) CPU_COUNT((mask))
#define IOPRIO_BITS 16
#define IOPRIO_CLASS_SHIFT 13
+#define IOPRIO_HINT_BITS 10
+#define IOPRIO_HINT_SHIFT 3
+
#define IOPRIO_MIN_PRIO 0 /* highest priority */
#define IOPRIO_MAX_PRIO 7 /* lowest priority */
#define IOPRIO_MIN_PRIO_CLASS 0
#define IOPRIO_MAX_PRIO_CLASS 3
-static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
+#define IOPRIO_MIN_PRIO_HINT 0
+#define IOPRIO_MAX_PRIO_HINT ((1 << IOPRIO_HINT_BITS) - 1)
+
+#define ioprio_class(ioprio) ((ioprio) >> IOPRIO_CLASS_SHIFT)
+#define ioprio(ioprio) ((ioprio) & IOPRIO_MAX_PRIO)
+#define ioprio_hint(ioprio) \
+ (((ioprio) >> IOPRIO_HINT_SHIFT) & IOPRIO_MAX_PRIO_HINT)
+
+static inline int ioprio_value(int ioprio_class, int ioprio, int ioprio_hint)
{
/*
* If no class is set, assume BE
*/
- if (!ioprio_class)
- ioprio_class = IOPRIO_CLASS_BE;
+ if (!ioprio_class)
+ ioprio_class = IOPRIO_CLASS_BE;
- ioprio |= ioprio_class << IOPRIO_CLASS_SHIFT;
- return syscall(__NR_ioprio_set, which, who, ioprio);
+ return (ioprio_class << IOPRIO_CLASS_SHIFT) |
+ (ioprio_hint << IOPRIO_HINT_SHIFT) |
+ ioprio;
+}
+
+static inline bool ioprio_value_is_class_rt(unsigned int priority)
+{
+ return ioprio_class(priority) == IOPRIO_CLASS_RT;
+}
+
+static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio,
+ int ioprio_hint)
+{
+ return syscall(__NR_ioprio_set, which, who,
+ ioprio_value(ioprio_class, ioprio, ioprio_hint));
}
#ifndef CONFIG_HAVE_GETTID
return (unsigned long long) pages * (unsigned long long) pagesize;
}
-static inline int fio_lookup_raw(dev_t dev, int *majdev, int *mindev)
-{
- struct raw_config_request rq;
- int fd;
-
- if (major(dev) != RAW_MAJOR)
- return 1;
-
- /*
- * we should be able to find /dev/rawctl or /dev/raw/rawctl
- */
- fd = open("/dev/rawctl", O_RDONLY);
- if (fd < 0) {
- fd = open("/dev/raw/rawctl", O_RDONLY);
- if (fd < 0)
- return 1;
- }
-
- rq.raw_minor = minor(dev);
- if (ioctl(fd, RAW_GETBIND, &rq) < 0) {
- close(fd);
- return 1;
- }
-
- close(fd);
- *majdev = rq.block_major;
- *mindev = rq.block_minor;
- return 0;
-}
-
#ifdef O_NOATIME
#define FIO_O_NOATIME O_NOATIME
#else
#define FIO_O_NOATIME 0
#endif
-#ifdef O_ATOMIC
-#define OS_O_ATOMIC O_ATOMIC
-#else
-#define OS_O_ATOMIC 040000000
-#endif
-
#ifdef MADV_REMOVE
#define FIO_MADV_FREE MADV_REMOVE
#endif
return atoi(size);
}
-#ifdef __powerpc64__
-#define FIO_HAVE_CPU_ONLINE_SYSCONF
-static inline unsigned int cpus_online(void)
-{
- return sysconf(_SC_NPROCESSORS_CONF);
-}
-#endif
-
static inline unsigned long long get_fs_free_size(const char *path)
{
unsigned long long ret;
#ifdef ARCH_HAVE_CRC_CRYPTO
case CPU_ARM64_CRC32C:
hwcap = getauxval(AT_HWCAP);
- have_feature = (hwcap & HWCAP_CRC32) != 0;
+ have_feature = (hwcap & (HWCAP_PMULL | HWCAP_CRC32)) ==
+ (HWCAP_PMULL | HWCAP_CRC32);
break;
#endif
default:
#include <machine/endian.h>
#include <libkern/OSByteOrder.h>
+#include "../arch/arch.h"
#include "../file.h"
#define FIO_USE_GENERIC_INIT_RANDOM_STATE
#define FIO_HAVE_GETTID
#define FIO_HAVE_CHARDEV_SIZE
#define FIO_HAVE_NATIVE_FALLOCATE
+#define FIO_HAVE_CPU_HAS
#define OS_MAP_ANON MAP_ANON
#define fio_swap32(x) OSSwapInt32(x)
#define fio_swap64(x) OSSwapInt64(x)
-/*
- * OSX has a pitifully small shared memory segment by default,
- * so default to a lower number of max jobs supported
- */
-#define FIO_MAX_JOBS 128
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
#ifndef CONFIG_CLOCKID_T
typedef unsigned int clockid_t;
return false;
}
+/*
+ * Report whether the CPU feature @feature is available. Only
+ * CPU_ARM64_CRC32C can ever be reported, and only when FIO_ARCH is
+ * arch_aarch64; every other feature yields false.
+ */
+static inline bool os_cpu_has(cpu_features feature)
+{
+	/* just check for arm on OSX for now, we know that has it */
+	return feature == CPU_ARM64_CRC32C && FIO_ARCH == arch_aarch64;
+}
+
#endif
#include <sys/endian.h>
#include <sys/sysctl.h>
-/* XXX hack to avoid confilcts between rbtree.h and <sys/rbtree.h> */
+/* XXX hack to avoid conflicts between rbtree.h and <sys/rbtree.h> */
#undef rb_node
#undef rb_left
#undef rb_right
#define fio_swap32(x) bswap32(x)
#define fio_swap64(x) bswap64(x)
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
struct disklabel dl;
#define fio_swap32(x) swap32(x)
#define fio_swap64(x) swap64(x)
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
struct disklabel dl;
#define os_ctime_r(x, y, z) ctime_r((x), (y), (z))
#define FIO_OS_HAS_CTIME_R
+#ifdef CONFIG_PTHREAD_GETAFFINITY
+#define FIO_HAVE_GET_THREAD_AFFINITY
+#define fio_get_thread_affinity(mask) \
+ pthread_getaffinity_np(pthread_self(), sizeof(mask), &(mask))
+#endif
+
typedef psetid_t os_cpu_mask_t;
static inline int chardev_size(struct fio_file *f, unsigned long long *bytes)
static inline bool fio_cpu_isset(os_cpu_mask_t *mask, int cpu)
{
- const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ const unsigned int max_cpus = sysconf(_SC_NPROCESSORS_CONF);
unsigned int num_cpus;
processorid_t *cpus;
bool ret;
+++ /dev/null
-#define FIO_MAX_CPUS MAXIMUM_PROCESSORS
-
-typedef DWORD_PTR os_cpu_mask_t;
#include "../lib/types.h"
#include "windows/posix.h"
+#include "os-windows-7.h"
#ifndef PTHREAD_STACK_MIN
#define PTHREAD_STACK_MIN 65535
#define fio_swap64(x) _byteswap_uint64(x)
#define _SC_PAGESIZE 0x1
-#define _SC_NPROCESSORS_ONLN 0x2
+#define _SC_NPROCESSORS_CONF 0x2
#define _SC_PHYS_PAGES 0x4
#define SA_RESTART 0
#define SIGCONT 0
#define SIGUSR1 1
#define SIGUSR2 2
+#define SIGKILL 15 /* SIGKILL doesn't exist here, so reuse SIGTERM's value */
typedef int sigset_t;
typedef int siginfo_t;
ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
off_t offset);
+HANDLE windows_handle_connection(HANDLE hjob, int sk);
+HANDLE windows_create_job(void);
static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
{
return 0;
}
-#ifdef CONFIG_WINDOWS_XP
-#include "os-windows-xp.h"
-#else
-#define FIO_HAVE_CPU_ONLINE_SYSCONF
-unsigned int cpus_online(void);
-#include "os-windows-7.h"
-#endif
-
int first_set_cpu(os_cpu_mask_t *cpumask);
int fio_setaffinity(int pid, os_cpu_mask_t cpumask);
int fio_cpuset_init(os_cpu_mask_t *mask);
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
+#include <errno.h>
#include "../arch/arch.h" /* IWYU pragma: export */
#include "../lib/types.h"
} cpu_features;
/* IWYU pragma: begin_exports */
-#if defined(__ANDROID__)
-#include "os-android.h"
-#elif defined(__linux__)
+#if defined(__linux__)
#include "os-linux.h"
#elif defined(__FreeBSD__)
#include "os-freebsd.h"
#error "unsupported os"
#endif
+#ifndef EDQUOT
+#define EDQUOT EIO
+#endif
+
#ifdef CONFIG_POSIXAIO
#include <aio.h>
#ifndef FIO_OS_HAVE_AIOCB_TYPEDEF
extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
#endif
+#ifndef FIO_HAVE_IOPRIO_CLASS
+#define ioprio_class(prio) 0
+#define ioprio_value_is_class_rt(prio) (false)
+#define IOPRIO_MIN_PRIO_CLASS 0
+#define IOPRIO_MAX_PRIO_CLASS 0
+#define ioprio_hint(prio) 0
+#define IOPRIO_MIN_PRIO_HINT 0
+#define IOPRIO_MAX_PRIO_HINT 0
+#endif
#ifndef FIO_HAVE_IOPRIO
-#define ioprio_set(which, who, prioclass, prio) (0)
+#define ioprio_value(prioclass, prio, priohint) (0)
+#define ioprio(ioprio) 0
+#define ioprio_set(which, who, prioclass, prio, priohint) (0)
+#define IOPRIO_MIN_PRIO 0
+#define IOPRIO_MAX_PRIO 0
#endif
#ifndef FIO_HAVE_ODIRECT
#define OS_O_DIRECT O_DIRECT
#endif
-#ifdef OS_O_ATOMIC
-#define FIO_O_ATOMIC OS_O_ATOMIC
-#else
-#define FIO_O_ATOMIC 0
-#endif
-
#ifndef FIO_HAVE_HUGETLB
#define SHM_HUGETLB 0
#define MAP_HUGETLB 0
#define OS_RAND_MAX RAND_MAX
#endif
-#ifndef FIO_HAVE_RAWBIND
-#define fio_lookup_raw(dev, majdev, mindev) 1
-#endif
-
#ifndef FIO_PREFERRED_ENGINE
#define FIO_PREFERRED_ENGINE "psync"
#endif
#endif
#endif
-#ifndef FIO_MAX_JOBS
-#define FIO_MAX_JOBS 4096
-#endif
-
#ifndef CONFIG_SOCKLEN_T
typedef unsigned int socklen_t;
#endif
}
#endif
-#ifndef FIO_HAVE_CPU_ONLINE_SYSCONF
-static inline unsigned int cpus_online(void)
+#ifndef FIO_HAVE_CPU_CONF_SYSCONF
+static inline unsigned int cpus_configured(void)
{
- return sysconf(_SC_NPROCESSORS_ONLN);
+ int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+
+ return nr_cpus >= 1 ? nr_cpus : 1;
}
#endif
#ifdef FIO_HAVE_CPU_AFFINITY
static inline int CPU_COUNT(os_cpu_mask_t *mask)
{
- int max_cpus = cpus_online();
+ int max_cpus = cpus_configured();
int nr_cpus, i;
for (i = 0, nr_cpus = 0; i < max_cpus; i++)
# define fio_mkdir(path, mode) mkdir(path, mode)
#endif
+#ifdef _SC_CLK_TCK
+/*
+ * Store the value of sysconf(_SC_CLK_TCK) in @clk_tck.
+ */
+static inline void os_clk_tck(long *clk_tck)
+{
+	const long ticks = sysconf(_SC_CLK_TCK);
+
+	*clk_tck = ticks;
+}
+#else
+/* No _SC_CLK_TCK here: the implementation is provided out of line. */
+extern void os_clk_tck(long *clk_tck);
+#endif
+
#endif /* FIO_OS_H */
--- /dev/null
+<?xml version="1.0" encoding="UTF-8"?>\r
+<!-- Copyright (c) .NET Foundation and contributors. All rights reserved. Licensed under the Microsoft Reciprocal License. See LICENSE.TXT file in the project root for full license information. -->\r
+\r
+\r
+\r
+<!--\r
+First-time install dialog sequence:\r
+ - WixUI_MyWelcomeDlg\r
+Maintenance dialog sequence:\r
+ WixUI_MaintenanceWelcomeDlg\r
+ - WixUI_MaintenanceTypeDlg\r
+ - WixUI_VerifyReadyDlg\r
+-->\r
+\r
+<Wix xmlns="http://schemas.microsoft.com/wix/2006/wi">\r
+ <Fragment>\r
+ <UI Id="WixUI_Minimal_NoEULA">\r
+ <TextStyle Id="WixUI_Font_Normal" FaceName="Tahoma" Size="8" />\r
+ <TextStyle Id="WixUI_Font_Bigger" FaceName="Tahoma" Size="12" />\r
+ <TextStyle Id="WixUI_Font_Title" FaceName="Tahoma" Size="9" Bold="yes" />\r
+\r
+ <Property Id="DefaultUIFont" Value="WixUI_Font_Normal" />\r
+ <Property Id="WixUI_Mode" Value="Minimal" />\r
+\r
+ <DialogRef Id="ErrorDlg" />\r
+ <DialogRef Id="FatalError" />\r
+ <DialogRef Id="FilesInUse" />\r
+ <DialogRef Id="MsiRMFilesInUse" />\r
+ <DialogRef Id="PrepareDlg" />\r
+ <DialogRef Id="ProgressDlg" />\r
+ <DialogRef Id="ResumeDlg" />\r
+ <DialogRef Id="UserExit" />\r
+ <DialogRef Id="MyWelcomeDlg" />\r
+\r
+ <Dialog Id="MyWelcomeDlg" Width="370" Height="270" Title="!(loc.WelcomeDlg_Title)">\r
+ <Control Id="Install" Type="PushButton" ElevationShield="yes" X="236" Y="243" Width="56" Height="17" Default="yes" Hidden="yes" Text="!(loc.WelcomeEulaDlgInstall)" >\r
+ <Publish Property="WixUI_InstallMode" Value="Update">Installed AND PATCH</Publish>\r
+ <Publish Event="SpawnWaitDialog" Value="WaitForCostingDlg">!(wix.WixUICostingPopupOptOut) OR CostingComplete = 1</Publish>\r
+ <Publish Event="EndDialog" Value="Return"><![CDATA[OutOfDiskSpace <> 1]]></Publish>\r
+ <Publish Event="SpawnDialog" Value="OutOfRbDiskDlg">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND (PROMPTROLLBACKCOST="P" OR NOT PROMPTROLLBACKCOST)</Publish>\r
+ <Publish Event="EndDialog" Value="Return">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>\r
+ <Publish Event="EnableRollback" Value="False">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>\r
+ <Publish Event="SpawnDialog" Value="OutOfDiskDlg">(OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 1) OR (OutOfDiskSpace = 1 AND PROMPTROLLBACKCOST="F")</Publish>\r
+ <Condition Action="show">ALLUSERS</Condition>\r
+ </Control>\r
+ <Control Id="InstallNoShield" Type="PushButton" ElevationShield="no" X="212" Y="243" Width="80" Height="17" Default="yes" Text="!(loc.WelcomeEulaDlgInstall)" Hidden="yes">\r
+ <!-- This dialog has no license checkbox, so the button must not be disabled on LicenseAccepted -->\r
+ <Publish Event="SpawnWaitDialog" Value="WaitForCostingDlg">!(wix.WixUICostingPopupOptOut) OR CostingComplete = 1</Publish>\r
+ <Publish Event="EndDialog" Value="Return"><![CDATA[OutOfDiskSpace <> 1]]></Publish>\r
+ <Publish Event="SpawnDialog" Value="OutOfRbDiskDlg">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND (PROMPTROLLBACKCOST="P" OR NOT PROMPTROLLBACKCOST)</Publish>\r
+ <Publish Event="EndDialog" Value="Return">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>\r
+ <Publish Event="EnableRollback" Value="False">OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 0 AND PROMPTROLLBACKCOST="D"</Publish>\r
+ <Publish Event="SpawnDialog" Value="OutOfDiskDlg">(OutOfDiskSpace = 1 AND OutOfNoRbDiskSpace = 1) OR (OutOfDiskSpace = 1 AND PROMPTROLLBACKCOST="F")</Publish>\r
+ <Condition Action="show">NOT ALLUSERS</Condition>\r
+ </Control>\r
+ <Control Id="Cancel" Type="PushButton" X="304" Y="243" Width="56" Height="17" Cancel="yes" Text="!(loc.WixUICancel)">\r
+ <Publish Event="SpawnDialog" Value="CancelDlg">1</Publish>\r
+ </Control>\r
+ <Control Id="Bitmap" Type="Bitmap" X="0" Y="0" Width="370" Height="234" TabSkip="no" Text="!(loc.WelcomeDlgBitmap)" />\r
+ <Control Id="Back" Type="PushButton" X="180" Y="243" Width="56" Height="17" Disabled="yes" Text="!(loc.WixUIBack)" />\r
+ <Control Id="BottomLine" Type="Line" X="0" Y="234" Width="370" Height="0" />\r
+ <Control Id="Description" Type="Text" X="135" Y="80" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.MyWelcomeDlgDescription)" >\r
+ <Condition Action="show">NOT Installed OR NOT PATCH</Condition>\r
+ <Condition Action="hide">Installed AND PATCH</Condition>\r
+ </Control>\r
+ <Control Id="PatchDescription" Type="Text" X="135" Y="80" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.WelcomeUpdateDlgDescriptionUpdate)" >\r
+ <Condition Action="show">Installed AND PATCH</Condition>\r
+ <Condition Action="hide">NOT Installed OR NOT PATCH</Condition>\r
+ </Control>\r
+ <Control Id="Title" Type="Text" X="135" Y="20" Width="220" Height="60" Transparent="yes" NoPrefix="yes" Text="!(loc.WelcomeDlgTitle)" />\r
+ </Dialog>\r
+\r
+ <Publish Dialog="ExitDialog" Control="Finish" Event="EndDialog" Value="Return" Order="999">1</Publish>\r
+\r
+ <Publish Dialog="VerifyReadyDlg" Control="Back" Event="NewDialog" Value="MaintenanceTypeDlg">1</Publish>\r
+\r
+ <Publish Dialog="MaintenanceWelcomeDlg" Control="Next" Event="NewDialog" Value="MaintenanceTypeDlg">1</Publish>\r
+\r
+ <Publish Dialog="MaintenanceTypeDlg" Control="RepairButton" Event="NewDialog" Value="VerifyReadyDlg">1</Publish>\r
+ <Publish Dialog="MaintenanceTypeDlg" Control="RemoveButton" Event="NewDialog" Value="VerifyReadyDlg">1</Publish>\r
+ <Publish Dialog="MaintenanceTypeDlg" Control="Back" Event="NewDialog" Value="MaintenanceWelcomeDlg">1</Publish>\r
+\r
+ <Publish Dialog="MyWelcomeDlg" Control="Install" Event="NewDialog" Value="PrepareDlg">1</Publish>\r
+ <Publish Dialog="VerifyReadyDlg" Control="Back" Event="NewDialog" Value="WelcomeDlg" Order="2">Installed AND PATCH</Publish>\r
+\r
+ <InstallUISequence>\r
+ <Show Dialog="WelcomeDlg" Before="ProgressDlg">0</Show>\r
+ <Show Dialog="MyWelcomeDlg" Before="ProgressDlg">NOT Installed</Show>\r
+ </InstallUISequence>\r
+\r
+ <Property Id="ARPNOMODIFY" Value="1" />\r
+ </UI>\r
+\r
+ <UIRef Id="WixUI_Common" />\r
+ </Fragment>\r
+</Wix>
\ No newline at end of file
--- /dev/null
+<?xml version="1.0" encoding="utf-8"?>\r
+<!-- Copyright (c) .NET Foundation and contributors. All rights reserved. Licensed under the Microsoft Reciprocal License. See LICENSE.TXT file in the project root for full license information. -->\r
+\r
+\r
+<WixLocalization Culture="en-US" Codepage="1252" xmlns="http://schemas.microsoft.com/wix/2006/localization">\r
+ <!-- _locID@Culture="en-US" _locComment="American English" -->\r
+ <!-- _locID@Codepage="1252" _locComment="Windows-1252" -->\r
+\r
+<String Id="MyWelcomeDlgDescription" Overridable="yes">\r
+<!-- _locID_text="MyWelcomeDlgDescription" _locComment="MyWelcomeDlgDescription" -->The Setup Wizard will install [ProductName] on your computer. Click Install to continue or Cancel to exit the Setup Wizard.\r
+</String>\r
+</WixLocalization>
\ No newline at end of file
#include <windows.h>
-#ifdef CONFIG_WINDOWS_XP
-int fio_setaffinity(int pid, os_cpu_mask_t cpumask)
-{
- HANDLE h;
- BOOL bSuccess = FALSE;
-
- h = OpenThread(THREAD_QUERY_INFORMATION | THREAD_SET_INFORMATION, TRUE,
- pid);
- if (h != NULL) {
- bSuccess = SetThreadAffinityMask(h, cpumask);
- if (!bSuccess)
- log_err("fio_setaffinity failed: failed to set thread affinity (pid %d, mask %.16llx)\n",
- pid, (long long unsigned) cpumask);
-
- CloseHandle(h);
- } else {
- log_err("fio_setaffinity failed: failed to get handle for pid %d\n",
- pid);
- }
-
- return bSuccess ? 0 : -1;
-}
-
-int fio_getaffinity(int pid, os_cpu_mask_t *mask)
-{
- os_cpu_mask_t systemMask;
-
- HANDLE h = OpenProcess(PROCESS_QUERY_INFORMATION, TRUE, pid);
-
- if (h != NULL) {
- GetProcessAffinityMask(h, mask, &systemMask);
- CloseHandle(h);
- } else {
- log_err("fio_getaffinity failed: failed to get handle for pid %d\n",
- pid);
- return -1;
- }
-
- return 0;
-}
-
-void fio_cpu_clear(os_cpu_mask_t *mask, int cpu)
-{
- *mask &= ~(1ULL << cpu);
-}
-
-void fio_cpu_set(os_cpu_mask_t *mask, int cpu)
-{
- *mask |= 1ULL << cpu;
-}
-
-int fio_cpu_isset(os_cpu_mask_t *mask, int cpu)
-{
- return (*mask & (1ULL << cpu)) != 0;
-}
-
-int fio_cpu_count(os_cpu_mask_t *mask)
-{
- return hweight64(*mask);
-}
-
-int fio_cpuset_init(os_cpu_mask_t *mask)
-{
- *mask = 0;
- return 0;
-}
-
-int fio_cpuset_exit(os_cpu_mask_t *mask)
-{
- return 0;
-}
-#else /* CONFIG_WINDOWS_XP */
-/* Return all processors regardless of processor group */
-unsigned int cpus_online(void)
-{
- return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
-}
-
static void print_mask(os_cpu_mask_t *cpumask)
{
for (int i = 0; i < FIO_CPU_MASK_ROWS; i++)
{
return 0;
}
-#endif /* CONFIG_WINDOWS_XP */
--- /dev/null
+#include "os/os.h"
+
+#include <windows.h>
+
+/*
+ * Report the timer tick rate, in Hz, through @clk_tck.
+ *
+ * The timer resolution is variable on Windows. Try to query it through
+ * ntdll and switch to the finest resolution; if ntdll cannot be used or
+ * the query yields nothing usable, report 64 Hz, the clock frequency
+ * lower bound. See also
+ * https://carpediemsystems.co.uk/2019/07/18/windows-system-timer-granularity/.
+ */
+void os_clk_tck(long *clk_tck)
+{
+	unsigned long minRes = 0, maxRes = 0, curRes = 0;
+	HMODULE lib;
+	NTSTATUS NTAPI (*queryTimer)
+		(OUT PULONG MinimumResolution,
+		 OUT PULONG MaximumResolution,
+		 OUT PULONG CurrentResolution);
+	NTSTATUS NTAPI (*setTimer)
+		(IN ULONG DesiredResolution,
+		 IN BOOLEAN SetResolution,
+		 OUT PULONG CurrentResolution);
+
+	if (!(lib = LoadLibrary(TEXT("ntdll.dll"))) ||
+	    !(queryTimer = (void *)GetProcAddress(lib, "NtQueryTimerResolution")) ||
+	    !(setTimer = (void *)GetProcAddress(lib, "NtSetTimerResolution"))) {
+		dprint(FD_HELPERTHREAD,
+			"Failed to load ntdll library, set to lower bound 64 Hz\n");
+		*clk_tck = 64;
+		return;
+	}
+
+	/*
+	 * A negative NTSTATUS signals failure, in which case the out
+	 * values are not trustworthy; a zero maxRes would divide by zero
+	 * in the conversion below.
+	 */
+	if (queryTimer(&minRes, &maxRes, &curRes) < 0 || !maxRes) {
+		dprint(FD_HELPERTHREAD,
+			"Failed to query timer resolution, set to lower bound 64 Hz\n");
+		*clk_tck = 64;
+		return;
+	}
+
+	dprint(FD_HELPERTHREAD,
+		"minRes = %lu, maxRes = %lu, curRes = %lu\n",
+		minRes, maxRes, curRes);
+
+	/* Use maximum resolution for most accurate timestamps */
+	setTimer(maxRes, 1, &curRes);
+	/* 10000000 / maxRes: the resolution is treated as 100 ns units */
+	*clk_tck = (long) (10000000L / maxRes);
+}
@if ERRORLEVEL 1 goto end\r
"%WIX%bin\candle" -nologo -arch %FIO_ARCH% examples.wxs\r
@if ERRORLEVEL 1 goto end\r
-"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi\r
+"%WIX%bin\candle" -nologo -arch %FIO_ARCH% WixUI_Minimal_NoEULA.wxs\r
+@if ERRORLEVEL 1 goto end\r
+\r
+"%WIX%bin\light" -nologo -sice:ICE61 install.wixobj examples.wixobj WixUI_Minimal_NoEULA.wixobj -loc WixUI_fio.wxl -ext WixUIExtension -out %FIO_VERSION%-%FIO_ARCH%.msi\r
:end\r
\r
if defined SIGN_FIO (\r
<Component>
<File Source="..\..\examples\numa.fio" />
</Component>
- <Component>
- <File Source="..\..\examples\pmemblk.fio" />
- </Component>
<Component>
<File Source="..\..\examples\poisson-rate-submission.fio" />
</Component>
<ComponentRef Id="netio_multicast.fio" />
<ComponentRef Id="null.fio" />
<ComponentRef Id="numa.fio" />
- <ComponentRef Id="pmemblk.fio" />
<ComponentRef Id="poisson_rate_submission.fio" />
<ComponentRef Id="rados.fio"/>
<ComponentRef Id="rand_zones.fio" />
</Component>
<?endif?>
<Component>
- <File Id="README" Name="README.txt" Source="..\..\README"/>
+ <File Id="README" Name="README.txt" Source="..\..\README.rst"/>
</Component>
<Component>
<File Id="REPORTING_BUGS" Name="REPORTING-BUGS.txt" Source="..\..\REPORTING-BUGS"/>
</Component>
<Component>
- <File Id="HOWTO" Name="HOWTO.txt" Source="..\..\HOWTO"/>
+ <File Id="HOWTO" Name="HOWTO.txt" Source="..\..\HOWTO.rst"/>
</Component>
<Component>
<File Id="COPYING" Name="COPYING.txt" Source="..\..\COPYING"/>
<WixVariable Id="WixUILicenseRtf" Value="eula.rtf" />
- <UIRef Id="WixUI_Minimal"/>
+ <UIRef Id="WixUI_Minimal_NoEULA"/>
<MajorUpgrade AllowDowngrades="no" DowngradeErrorMessage="A newer version of the application is already installed."
AllowSameVersionUpgrades="yes"/>
MEMORYSTATUSEX status;
switch (name) {
- case _SC_NPROCESSORS_ONLN:
- val = GetNumLogicalProcessors();
+ case _SC_NPROCESSORS_CONF:
+ /*
+ * Using GetMaximumProcessorCount introduces a problem in
+ * gettime.c because Windows does not have
+ * fio_get_thread_affinity. Log sample (see #1479):
+ *
+ * CPU mask contains processor beyond last active processor index (2)
+ * clock setaffinity failed: No error
+ */
+ val = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
if (val == -1)
- log_err("sysconf(_SC_NPROCESSORS_ONLN) failed\n");
+ log_err("sysconf(_SC_NPROCESSORS_CONF) failed\n");
break;
return 0;
}
+#ifndef CLOCK_MONOTONIC_RAW
+#define CLOCK_MONOTONIC_RAW 4
+#endif
+
/*
* Get the value of a local clock source.
- * This implementation supports 2 clocks: CLOCK_MONOTONIC provides high-accuracy
- * relative time, while CLOCK_REALTIME provides a low-accuracy wall time.
+ * This implementation supports 3 clocks: CLOCK_MONOTONIC/CLOCK_MONOTONIC_RAW
+ * provide high-accuracy relative time, while CLOCK_REALTIME provides a
+ * low-accuracy wall time.
*/
int clock_gettime(clockid_t clock_id, struct timespec *tp)
{
int rc = 0;
- if (clock_id == CLOCK_MONOTONIC) {
+ if (clock_id == CLOCK_MONOTONIC || clock_id == CLOCK_MONOTONIC_RAW) {
static LARGE_INTEGER freq = {{0,0}};
LARGE_INTEGER counts;
uint64_t t;
return hbo;
}
-#ifdef CONFIG_WINDOWS_XP
-const char *inet_ntop(int af, const void *restrict src, char *restrict dst,
- socklen_t size)
+/*
+ * Create the server end of the named pipe @pipe_name and wait for a
+ * client to connect to it. @wait_connect_time is passed to
+ * CreateNamedPipe() as the pipe's default time-out value.
+ *
+ * Returns the connected pipe handle, or INVALID_HANDLE_VALUE on failure.
+ */
+static HANDLE create_named_pipe(char *pipe_name, int wait_connect_time)
 {
- INT status = SOCKET_ERROR;
- WSADATA wsd;
- char *ret = NULL;
+	HANDLE hpipe;
- if (af != AF_INET && af != AF_INET6) {
- errno = EAFNOSUPPORT;
- return NULL;
- }
+	hpipe = CreateNamedPipe (
+			pipe_name,
+			PIPE_ACCESS_DUPLEX,
+			PIPE_WAIT | PIPE_TYPE_BYTE,
+			1, 0, 0, wait_connect_time, NULL);
- WSAStartup(MAKEWORD(2,2), &wsd);
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		log_err("CreateNamedPipe failed (%lu).\n", GetLastError());
+		return INVALID_HANDLE_VALUE;
+	}
- if (af == AF_INET) {
- struct sockaddr_in si;
- DWORD len = size;
+	/*
+	 * ConnectNamedPipe() also returns zero when the client managed to
+	 * connect between CreateNamedPipe() and this call. GetLastError()
+	 * is then ERROR_PIPE_CONNECTED and the connection is usable, so
+	 * only other errors are treated as failures.
+	 */
+	if (!ConnectNamedPipe(hpipe, NULL) &&
+	    GetLastError() != ERROR_PIPE_CONNECTED) {
+		log_err("ConnectNamedPipe failed (%lu).\n", GetLastError());
+		CloseHandle(hpipe);
+		return INVALID_HANDLE_VALUE;
+	}
- memset(&si, 0, sizeof(si));
- si.sin_family = af;
- memcpy(&si.sin_addr, src, sizeof(si.sin_addr));
- status = WSAAddressToString((struct sockaddr*)&si, sizeof(si), NULL, dst, &len);
- } else if (af == AF_INET6) {
- struct sockaddr_in6 si6;
- DWORD len = size;
+	return hpipe;
+}
- memset(&si6, 0, sizeof(si6));
- si6.sin6_family = af;
- memcpy(&si6.sin6_addr, src, sizeof(si6.sin6_addr));
- status = WSAAddressToString((struct sockaddr*)&si6, sizeof(si6), NULL, dst, &len);
+/*
+ * Start a new process whose command line is this process's command line
+ * with @args appended, and fill @pi with its process/thread information.
+ *
+ * When @hjob points to a valid job handle the child is created
+ * suspended, assigned to that job and only then resumed.
+ *
+ * Returns 0 on success and 1 on failure (note: not Win32 BOOL
+ * semantics). On failure @pi holds no open handles.
+ */
+static BOOL windows_create_process(PROCESS_INFORMATION *pi, const char *args, HANDLE *hjob)
+{
+	LPSTR this_cmd_line = GetCommandLine();
+	/* both strings plus the terminating NUL */
+	LPSTR new_process_cmd_line = malloc(strlen(this_cmd_line) + strlen(args) + 1);
+	STARTUPINFO si = {0};
+	DWORD flags = 0;
+
+	memset(pi, 0, sizeof(*pi));
+
+	if (!new_process_cmd_line) {
+		log_err("fio: failed to allocate child command line\n");
+		return 1;
+	}
+
+	strcpy(new_process_cmd_line, this_cmd_line);
+	strcat(new_process_cmd_line, args);
+
+	si.cb = sizeof(si);
+
+	if ((hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE))
+		flags = CREATE_SUSPENDED | CREATE_BREAKAWAY_FROM_JOB;
+
+	flags |= CREATE_NEW_CONSOLE;
+
+	if( !CreateProcess( NULL,
+		new_process_cmd_line,
+		NULL, /* Process handle not inherited */
+		NULL, /* Thread handle not inherited */
+		TRUE, /* inheritable handles are inherited by the child */
+		flags,
+		NULL, /* Use parent's environment block */
+		NULL, /* Use parent's starting directory */
+		&si,
+		pi )
+		)
+	{
+		log_err("CreateProcess failed (%lu).\n", GetLastError() );
+		free(new_process_cmd_line);
+		return 1;
 }
+	if ((hjob != NULL) && (*hjob != INVALID_HANDLE_VALUE)) {
+		BOOL ret = AssignProcessToJobObject(*hjob, pi->hProcess);
+		if (!ret) {
+			log_err("AssignProcessToJobObject failed (%lu).\n", GetLastError() );
+			/*
+			 * The child was created suspended and is never
+			 * resumed on this path: do not leave it, its
+			 * handles or the command line buffer behind.
+			 */
+			TerminateProcess(pi->hProcess, 1);
+			CloseHandle(pi->hThread);
+			CloseHandle(pi->hProcess);
+			memset(pi, 0, sizeof(*pi));
+			free(new_process_cmd_line);
+			return 1;
+		}
- if (status != SOCKET_ERROR)
- ret = dst;
- else
- errno = ENOSPC;
+		ResumeThread(pi->hThread);
+	}
- WSACleanup();
+	free(new_process_cmd_line);
+	return 0;
+}
- return ret;
+/*
+ * Create a job object configured with
+ * JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE.
+ *
+ * Returns the job handle, or INVALID_HANDLE_VALUE on failure.
+ */
+HANDLE windows_create_job(void)
+{
+	JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli = { 0 };
+	BOOL success;
+	HANDLE hjob = CreateJobObject(NULL, NULL);
+
+	/* CreateJobObject() reports failure with NULL, not INVALID_HANDLE_VALUE */
+	if (hjob == NULL) {
+		log_err( "CreateJobObject failed: error %lu\n", GetLastError() );
+		return INVALID_HANDLE_VALUE;
+	}
+
+	jeli.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE;
+	success = SetInformationJobObject(hjob, JobObjectExtendedLimitInformation, &jeli, sizeof(jeli));
+	if ( success == 0 ) {
+		log_err( "SetInformationJobObject failed: error %lu\n", GetLastError() );
+		/* the half-configured job is of no use to the caller */
+		CloseHandle(hjob);
+		return INVALID_HANDLE_VALUE;
+	}
+	return hjob;
 }
-int inet_pton(int af, const char *restrict src, void *restrict dst)
+/*
+ * Wait for the child process described by @pi to either exit or report
+ * through the pipe @hpipe that it has connected to a client.
+ *
+ * Returns true if the text "connected" was read from the pipe, false if
+ * the child was found to be no longer active first.
+ */
+static bool monitor_process_till_connect(PROCESS_INFORMATION *pi, HANDLE *hpipe)
 {
- INT status = SOCKET_ERROR;
- WSADATA wsd;
- int ret = 1;
+	bool connected = FALSE;
+	char buffer[32] = {0};
+	DWORD bytes_read;
- if (af != AF_INET && af != AF_INET6) {
- errno = EAFNOSUPPORT;
- return -1;
- }
+	do {
+		DWORD exit_code = 0;
+
+		/* if the exit code cannot be queried, treat the child as gone */
+		if (!GetExitCodeProcess(pi->hProcess, &exit_code) ||
+		    exit_code != STILL_ACTIVE) {
+			dprint(FD_PROCESS, "process %u exited %d\n", GetProcessId(pi->hProcess), exit_code);
+			break;
+		}
- WSAStartup(MAKEWORD(2,2), &wsd);
-
- if (af == AF_INET) {
- struct sockaddr_in si;
- INT len = sizeof(si);
-
- memset(&si, 0, sizeof(si));
- si.sin_family = af;
- status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si, &len);
- if (status != SOCKET_ERROR)
- memcpy(dst, &si.sin_addr, sizeof(si.sin_addr));
- } else if (af == AF_INET6) {
- struct sockaddr_in6 si6;
- INT len = sizeof(si6);
-
- memset(&si6, 0, sizeof(si6));
- si6.sin6_family = af;
- status = WSAStringToAddressA((char*)src, af, NULL, (struct sockaddr*)&si6, &len);
- if (status != SOCKET_ERROR)
- memcpy(dst, &si6.sin6_addr, sizeof(si6.sin6_addr));
+		memset(buffer, 0, sizeof(buffer));
+		bytes_read = 0;
+		/* reading at most sizeof(buffer) - 1 keeps the zeroed buffer NUL-terminated for strstr() */
+		if (ReadFile(*hpipe, &buffer, sizeof(buffer) - 1, &bytes_read, NULL) &&
+		    bytes_read && strstr(buffer, "connected")) {
+			dprint(FD_PROCESS, "process %u connected to client\n", GetProcessId(pi->hProcess));
+			connected = TRUE;
+		}
+		usleep(10*1000);
+	} while (!connected);
+	return connected;
+}
+
+/*
+ * Create a process with --server-internal to emulate fork().
+ *
+ * The child is started through windows_create_process() (assigned to
+ * @hjob when that is a valid job handle), the socket @sk is duplicated
+ * for it, and the resulting protocol information is sent to it over a
+ * named pipe whose name is derived from this process's id.
+ *
+ * Returns the child's process handle once the child has reported that
+ * it is connected, or INVALID_HANDLE_VALUE on any failure.
+ */
+HANDLE windows_handle_connection(HANDLE hjob, int sk)
+{
+	char pipe_name[64] = "\\\\.\\pipe\\fiointernal-";
+	char args[128] = " --server-internal=";
+	PROCESS_INFORMATION pi;
+	HANDLE hpipe = INVALID_HANDLE_VALUE;
+	WSAPROTOCOL_INFO protocol_info;
+	DWORD bytes_written = 0;
+	HANDLE ret;
+
+	sprintf(pipe_name+strlen(pipe_name), "%d", GetCurrentProcessId());
+	sprintf(args+strlen(args), "%s", pipe_name);
+
+	if (windows_create_process(&pi, args, &hjob) != 0)
+		return INVALID_HANDLE_VALUE;
+	else
+		ret = pi.hProcess;
+
+	/* duplicate socket and write the protocol_info to pipe so child can
+	 * duplicate the communication socket */
+	if (WSADuplicateSocket(sk, GetProcessId(pi.hProcess), &protocol_info)) {
+		log_err("WSADuplicateSocket failed (%lu).\n", GetLastError());
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
 }
- if (status == SOCKET_ERROR) {
- errno = ENOSPC;
- ret = 0;
+	/* make a pipe with a unique name based upon processid */
+	hpipe = create_named_pipe(pipe_name, 1000);
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
 }
- WSACleanup();
+	/* the byte count pointer may only be NULL for overlapped writes, and this write is not overlapped */
+	if (!WriteFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_written, NULL)) {
+		log_err("WriteFile failed (%lu).\n", GetLastError());
+		ret = INVALID_HANDLE_VALUE;
+		goto cleanup;
+	}
+	dprint(FD_PROCESS, "process %d created child process %u\n", GetCurrentProcessId(), GetProcessId(pi.hProcess));
+
+	/* monitor the process until it either exits or connects. This level
+	 * doesn't care which of those occurs because the result is that it
+	 * needs to loop around and create another child process to monitor */
+	if (!monitor_process_till_connect(&pi, &hpipe))
+		ret = INVALID_HANDLE_VALUE;
+
+cleanup:
+	/* close the handles and pipes because this thread is done monitoring them */
+	if (ret == INVALID_HANDLE_VALUE)
+		CloseHandle(pi.hProcess);
+	CloseHandle(pi.hThread);
+	/* the pipe does not exist yet when an early step failed */
+	if (hpipe != INVALID_HANDLE_VALUE) {
+		DisconnectNamedPipe(hpipe);
+		CloseHandle(hpipe);
+	}
 return ret;
 }
-#endif /* CONFIG_WINDOWS_XP */
in_addr_t inet_network(const char *cp);
-#ifdef CONFIG_WINDOWS_XP
-const char *inet_ntop(int af, const void *restrict src,
- char *restrict dst, socklen_t size);
-int inet_pton(int af, const char *restrict src, void *restrict dst);
-#endif
-
#endif /* ARPA_INET_H */
typedef int nfds_t;
-#ifdef CONFIG_WINDOWS_XP
-struct pollfd
-{
- int fd;
- short events;
- short revents;
-};
-
-#define POLLOUT 1
-#define POLLIN 2
-#define POLLERR 0
-#define POLLHUP 1
-#endif /* CONFIG_WINDOWS_XP */
-
int poll(struct pollfd fds[], nfds_t nfds, int timeout);
#endif /* POLL_H */
#ifndef SYSLOG_H
#define SYSLOG_H
-int syslog();
+int syslog(int priority, const char *format, ...);
#define LOG_INFO 0x1
#define LOG_ERROR 0x2
struct zbd_zone *zones, unsigned int nr_zones);
extern int blkzoned_reset_wp(struct thread_data *td, struct fio_file *f,
uint64_t offset, uint64_t length);
+extern int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones);
+extern int blkzoned_get_max_active_zones(struct thread_data *td,
+ struct fio_file *f,
+ unsigned int *max_active_zones);
+extern int blkzoned_finish_zone(struct thread_data *td, struct fio_file *f,
+ uint64_t offset, uint64_t length);
#else
/*
* Define stubs for systems that do not have zoned block device support.
{
return -EIO;
}
+static inline int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones)
+{
+ return -EIO;
+}
+/*
+ * Stub used when zoned block device support is not available: the
+ * maximum number of active zones cannot be queried, so always fail
+ * with -EIO and leave @max_active_zones untouched.
+ */
+static inline int blkzoned_get_max_active_zones(struct thread_data *td,
+						struct fio_file *f,
+						unsigned int *max_active_zones)
+{
+	return -EIO;
+}
+static inline int blkzoned_finish_zone(struct thread_data *td,
+ struct fio_file *f,
+ uint64_t offset, uint64_t length)
+{
+ return -EIO;
+}
#endif
#endif /* FIO_BLKZONED_H */
#include <sys/ioctl.h>
#include <inttypes.h>
+#include "../compiler/compiler.h"
+
#include <mtd/mtd-user.h>
#include "libmtd.h"
void *buf;
normsg("run torture test for PEB %d", eb);
- patt_count = ARRAY_SIZE(patterns);
+ patt_count = FIO_ARRAY_SIZE(patterns);
buf = xmalloc(mtd->eb_size);
* @mtd: MTD device description object
* @fd: MTD device node file descriptor
* @eb: eraseblock to read from
- * @offs: offset withing the eraseblock to read from
+ * @offs: offset within the eraseblock to read from
* @buf: buffer to read data to
* @len: how many bytes to read
*
* @mtd: MTD device description object
* @fd: MTD device node file descriptor
* @eb: eraseblock to write to
- * @offs: offset withing the eraseblock to write to
+ * @offs: offset within the eraseblock to write to
* @data: data buffer to write
* @len: how many data bytes to write
* @oob: OOB buffer to write
* @mtd: MTD device description object
* @fd: MTD device node file descriptor
* @eb: eraseblock to write to
- * @offs: offset withing the eraseblock to write to
+ * @offs: offset within the eraseblock to write to
* @img_name: the file to write
*
* This function writes an image @img_name the MTD device defined by @mtd. @eb
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
#define min(a, b) MIN(a, b) /* glue for linux kernel source */
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
#include "zbd_types.h"
#include <linux/blkzoned.h>
+#ifndef BLKFINISHZONE
+#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range)
+#endif
+
+/*
+ * If the uapi headers installed on the system lack zone capacity support,
+ * use our local versions. If the installed headers are recent enough to
+ * support zone capacity, do not redefine any structs.
+ */
+#ifndef CONFIG_HAVE_REP_CAPACITY
+#define BLK_ZONE_REP_CAPACITY (1 << 0)
+
+/*
+ * Local zone descriptor layout that carries the capacity field. It is only
+ * compiled when CONFIG_HAVE_REP_CAPACITY is not defined; the define that
+ * follows makes the rest of this file refer to it as struct blk_zone.
+ */
+struct blk_zone_v2 {
+ __u64 start; /* Zone start sector */
+ __u64 len; /* Zone length in number of sectors */
+ __u64 wp; /* Zone write pointer position */
+ __u8 type; /* Zone type */
+ __u8 cond; /* Zone condition */
+ __u8 non_seq; /* Non-sequential write resources active */
+ __u8 reset; /* Reset write pointer recommended */
+ __u8 resv[4];
+ __u64 capacity; /* Zone capacity in number of sectors */
+ __u8 reserved[24];
+};
+#define blk_zone blk_zone_v2
+
+/*
+ * Local zone report header used together with the local zone descriptor
+ * when CONFIG_HAVE_REP_CAPACITY is not defined. The zone descriptors are
+ * stored in the trailing zones[] array. The define that follows makes the
+ * rest of this file refer to it as struct blk_zone_report.
+ */
+struct blk_zone_report_v2 {
+ __u64 sector; /* set to (byte offset >> 9) before the BLKREPORTZONE ioctl */
+ __u32 nr_zones; /* NOTE(review): presumably the number of entries in zones[] -- confirm */
+ __u32 flags; /* checked for BLK_ZONE_REP_CAPACITY by zone_capacity() */
+struct blk_zone zones[0];
+};
+#define blk_zone_report blk_zone_report_v2
+#endif /* CONFIG_HAVE_REP_CAPACITY */
/*
* Read up to 255 characters from the first line of a file. Strip the trailing
return strdup(line);
}
-int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
- enum zbd_zoned_model *model)
+/*
+ * Get the value of a sysfs attribute for a block device.
+ *
+ * Returns NULL on failure.
+ * Returns a pointer to a string on success.
+ * The caller is responsible for freeing the memory.
+ */
+static char *blkzoned_get_sysfs_attr(const char *file_name, const char *attr)
{
- const char *file_name = f->file_name;
- char *zoned_attr_path = NULL;
- char *model_str = NULL;
+ char *attr_path = NULL;
struct stat statbuf;
char *sys_devno_path = NULL;
char *part_attr_path = NULL;
char sys_path[PATH_MAX];
ssize_t sz;
char *delim = NULL;
-
- if (f->filetype != FIO_TYPE_BLOCK) {
- *model = ZBD_IGNORE;
- return 0;
- }
-
- *model = ZBD_NONE;
+ char *attr_str = NULL;
if (stat(file_name, &statbuf) < 0)
goto out;
*delim = '\0';
}
- if (asprintf(&zoned_attr_path,
- "/sys/dev/block/%s/queue/zoned", sys_path) < 0)
+ if (asprintf(&attr_path,
+ "/sys/dev/block/%s/%s", sys_path, attr) < 0)
goto out;
- model_str = read_file(zoned_attr_path);
+ attr_str = read_file(attr_path);
+out:
+ free(attr_path);
+ free(part_str);
+ free(part_attr_path);
+ free(sys_devno_path);
+
+ return attr_str;
+}
+
+int blkzoned_get_zoned_model(struct thread_data *td, struct fio_file *f,
+ enum zbd_zoned_model *model)
+{
+ char *model_str = NULL;
+
+ if (f->filetype != FIO_TYPE_BLOCK)
+ return -EINVAL;
+
+ *model = ZBD_NONE;
+
+ model_str = blkzoned_get_sysfs_attr(f->file_name, "queue/zoned");
if (!model_str)
- goto out;
- dprint(FD_ZBD, "%s: zbd model string: %s\n", file_name, model_str);
+ return 0;
+
+ dprint(FD_ZBD, "%s: zbd model string: %s\n", f->file_name, model_str);
if (strcmp(model_str, "host-aware") == 0)
*model = ZBD_HOST_AWARE;
else if (strcmp(model_str, "host-managed") == 0)
*model = ZBD_HOST_MANAGED;
-out:
+
free(model_str);
- free(zoned_attr_path);
- free(part_str);
- free(part_attr_path);
- free(sys_devno_path);
+
+ return 0;
+}
+
+/*
+ * Look up the open zone limit of @f in the queue/max_open_zones sysfs
+ * attribute.
+ *
+ * Returns -EIO if @f is not a block device. Otherwise returns 0 and stores
+ * the parsed attribute in @max_open_zones, or 0 when the attribute could
+ * not be read.
+ */
+int blkzoned_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+				unsigned int *max_open_zones)
+{
+	char *attr_val;
+
+	if (f->filetype != FIO_TYPE_BLOCK)
+		return -EIO;
+
+	attr_val = blkzoned_get_sysfs_attr(f->file_name, "queue/max_open_zones");
+	if (attr_val) {
+		dprint(FD_ZBD, "%s: max open zones supported by device: %s\n",
+		       f->file_name, attr_val);
+		*max_open_zones = atoll(attr_val);
+		/* the attribute string is owned by the caller of the sysfs helper */
+		free(attr_val);
+	} else {
+		*max_open_zones = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Look up the active zone limit of @f in the queue/max_active_zones sysfs
+ * attribute.
+ *
+ * Returns -EIO if @f is not a block device. Otherwise returns 0 and stores
+ * the parsed attribute in @max_active_zones, or 0 when the attribute could
+ * not be read.
+ */
+int blkzoned_get_max_active_zones(struct thread_data *td, struct fio_file *f,
+				  unsigned int *max_active_zones)
+{
+	char *attr_val;
+
+	if (f->filetype != FIO_TYPE_BLOCK)
+		return -EIO;
+
+	attr_val = blkzoned_get_sysfs_attr(f->file_name, "queue/max_active_zones");
+	if (attr_val) {
+		dprint(FD_ZBD, "%s: max active zones supported by device: %s\n",
+		       f->file_name, attr_val);
+		*max_active_zones = atoll(attr_val);
+		/* the attribute string is owned by the caller of the sysfs helper */
+		free(attr_val);
+	} else {
+		*max_active_zones = 0;
+	}
+
return 0;
}
static uint64_t zone_capacity(struct blk_zone_report *hdr,
struct blk_zone *blkz)
{
-#ifdef CONFIG_HAVE_REP_CAPACITY
if (hdr->flags & BLK_ZONE_REP_CAPACITY)
return blkz->capacity << 9;
-#endif
return blkz->len << 9;
}
hdr->sector = offset >> 9;
ret = ioctl(fd, BLKREPORTZONE, hdr);
if (ret) {
+ log_err("%s: BLKREPORTZONE ioctl failed, ret=%d, err=%d.\n",
+ f->file_name, ret, -errno);
ret = -errno;
goto out;
}
default:
/* Treat all these conditions as offline (don't use!) */
z->cond = ZBD_ZONE_COND_OFFLINE;
- break;
+ z->wp = z->start;
}
}
return ret;
}
+
+/*
+ * Issue a BLKFINISHZONE ioctl for the byte range starting at @offset and
+ * spanning @length bytes of @f. Both values are converted to 512-byte
+ * sectors for the ioctl.
+ *
+ * If @f is not open yet, the file is opened for the duration of the call.
+ *
+ * Returns 0 on success, or when the kernel reports ENOTTY for the ioctl,
+ * and a negative errno value on any other failure.
+ */
+int blkzoned_finish_zone(struct thread_data *td, struct fio_file *f,
+			 uint64_t offset, uint64_t length)
+{
+	struct blk_zone_range zr = {
+		.sector = offset >> 9,
+		.nr_sectors = length >> 9,
+	};
+	int fd, ret = 0;
+
+	/* If the file is not yet opened, open it for this function. */
+	fd = f->fd;
+	if (fd < 0) {
+		fd = open(f->file_name, O_RDWR | O_LARGEFILE);
+		if (fd < 0)
+			return -errno;
+	}
+
+	if (ioctl(fd, BLKFINISHZONE, &zr) < 0) {
+		ret = -errno;
+		/*
+		 * Kernel versions older than 5.5 do not support BLKFINISHZONE
+		 * and return the ENOTTY error code. These old kernels only
+		 * support block devices that close zones automatically.
+		 *
+		 * ret holds the negated errno at this point, so it has to be
+		 * compared against -ENOTTY.
+		 */
+		if (ret == -ENOTTY)
+			ret = 0;
+	}
+
+	/* only close the descriptor if it was opened above */
+	if (f->fd < 0)
+		close(fd);
+
+	return ret;
+}
int found = 0;
DIR *D;
+ /*
+ * If replay_redirect is set then always return this device
+ * upon lookup which overrides the device lookup based on
+ * major minor in the actual blktrace
+ */
+ if (redirect) {
+ strcpy(path, redirect);
+ return 1;
+ }
+
D = opendir(path);
if (!D)
return 0;
if (!S_ISBLK(st.st_mode))
continue;
- /*
- * If replay_redirect is set then always return this device
- * upon lookup which overrides the device lookup based on
- * major minor in the actual blktrace
- */
- if (redirect) {
- strcpy(path, redirect);
- found = 1;
- break;
- }
-
if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
strcpy(path, full_path);
found = 1;
"OPT_BOOL",
"OPT_FLOAT_LIST",
"OPT_STR_SET",
+ "OPT_STR_VAL_ZONE",
"OPT_DEPRECATED",
"OPT_SOFT_DEPRECATED",
"OPT_UNSUPPORTED",
static size_t opt_len(const char *str)
{
+ char delimiter[] = {',', ':'};
char *postfix;
+ unsigned int i;
- postfix = strchr(str, ':');
- if (!postfix)
- return strlen(str);
+ for (i = 0; i < FIO_ARRAY_SIZE(delimiter); i++) {
+ postfix = strchr(str, delimiter[i]);
+ if (postfix)
+ return (int)(postfix - str);
+ }
- return (int)(postfix - str);
+ return strlen(str);
}
static int str_match_len(const struct value_pair *vp, const char *str)
static const char *opt_type_name(const struct fio_option *o)
{
- compiletime_assert(ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED,
+ compiletime_assert(FIO_ARRAY_SIZE(opt_type_names) - 1 == FIO_OPT_UNSUPPORTED,
"opt_type_names[] index");
if (o->type <= FIO_OPT_UNSUPPORTED)
}
case FIO_OPT_STR_VAL_TIME:
is_time = 1;
- fallthrough;
+ fio_fallthrough;
case FIO_OPT_ULL:
case FIO_OPT_INT:
- case FIO_OPT_STR_VAL: {
+ case FIO_OPT_STR_VAL:
+ case FIO_OPT_STR_VAL_ZONE:
+ {
fio_opt_str_val_fn *fn = o->cb;
char tmp[128], *p;
+ size_t len = strlen(ptr);
+
+ if (len > 0 && ptr[len - 1] == 'z') {
+ if (o->type == FIO_OPT_STR_VAL_ZONE) {
+ char *ep;
+ unsigned long long val;
+
+ errno = 0;
+ val = strtoul(ptr, &ep, 10);
+ if (errno == 0 && ep != ptr && *ep == 'z') {
+ ull = ZONE_BASE_VAL + (uint32_t)val;
+ ret = 0;
+ goto store_option_value;
+ } else {
+ log_err("%s: unexpected zone value '%s'\n",
+ o->name, ptr);
+ return 1;
+ }
+ } else {
+ log_err("%s: 'z' suffix isn't applicable\n",
+ o->name);
+ return 1;
+ }
+ }
if (!is_time && o->is_time)
is_time = o->is_time;
}
}
+store_option_value:
if (fn)
ret = fn(data, &ull);
else {
if (o->off1) {
cp = td_var(data, o, o->off1);
+ if (*cp)
+ free(*cp);
*cp = strdup(ptr);
+ if (strlen(ptr) > o->maxlen - 1) {
+ log_err("value exceeds max length of %d\n",
+ o->maxlen);
+ return 1;
+ }
}
if (fn)
}
case FIO_OPT_DEPRECATED:
ret = 1;
- fallthrough;
+ fio_fallthrough;
case FIO_OPT_SOFT_DEPRECATED:
log_info("Option %s is deprecated\n", o->name);
break;
FIO_OPT_BOOL,
FIO_OPT_FLOAT_LIST,
FIO_OPT_STR_SET,
+ FIO_OPT_STR_VAL_ZONE,
FIO_OPT_DEPRECATED,
FIO_OPT_SOFT_DEPRECATED,
FIO_OPT_UNSUPPORTED, /* keep this last */
*/
struct value_pair {
const char *ival; /* string option */
- unsigned long long oval;/* output value */
+ unsigned long long oval; /* output value */
const char *help; /* help text for sub option */
int orval; /* OR value */
void *cb; /* sub-option callback */
static inline int parse_is_percent(unsigned long long val)
{
- return val <= -1ULL && val >= (-1ULL - 100ULL);
+ return val >= -101ULL;
}
+#define ZONE_BASE_VAL ((-1ULL >> 1) + 1)
static inline int parse_is_percent_uncapped(unsigned long long val)
{
- return (long long)val <= -1;
+ return ZONE_BASE_VAL + -1U < val;
+}
+
+/*
+ * Returns non-zero when @val is at most -1U above ZONE_BASE_VAL. A @val
+ * below ZONE_BASE_VAL wraps around in the unsigned subtraction.
+ */
+static inline int parse_is_zone(unsigned long long val)
+{
+	unsigned long long delta = val - ZONE_BASE_VAL;
+
+	return delta <= -1U;
}
struct print_option {
*
*/
#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+
#include "fio.h"
#include "ioengines.h"
#include "lib/getrusage.h"
static void check_overlap(struct io_u *io_u)
{
- int i, res;
- struct thread_data *td;
+ int res;
/*
* Allow only one thread to check for overlap at a time to prevent two
* threads as they assess overlap.
*/
res = pthread_mutex_lock(&overlap_check);
- assert(res == 0);
+ if (fio_unlikely(res != 0)) {
+ log_err("failed to lock overlap check mutex, err: %i:%s", errno, strerror(errno));
+ abort();
+ }
retry:
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->runstate <= TD_SETTING_UP ||
td->runstate >= TD_FINISHING ||
!td->o.serialize_overlap ||
continue;
res = pthread_mutex_unlock(&overlap_check);
- assert(res == 0);
+ if (fio_unlikely(res != 0)) {
+ log_err("failed to unlock overlap check mutex, err: %i:%s", errno, strerror(errno));
+ abort();
+ }
res = pthread_mutex_lock(&overlap_check);
- assert(res == 0);
+ if (fio_unlikely(res != 0)) {
+ log_err("failed to lock overlap check mutex, err: %i:%s", errno, strerror(errno));
+ abort();
+ }
goto retry;
- }
+ } end_for_each();
}
static int io_workqueue_fn(struct submit_worker *sw,
dup_files(td, parent);
td->eo = parent->eo;
fio_options_mem_dupe(td);
+ td->iolog_f = parent->iolog_f;
if (ioengine_load(td))
goto err;
if (td->io_ops->post_init && td->io_ops->post_init(td))
goto err_io_init;
- set_epoch_time(td, td->o.log_unix_epoch);
+ set_epoch_time(td, td->o.log_alternate_epoch_clock_id, td->o.job_start_clock_id);
fio_getrusage(&td->ru_start);
clear_io_state(td, 1);
struct thread_data *td = sw->priv;
(*sum_cnt)++;
- sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1);
+
+ /*
+ * io_workqueue_update_acct_fn() doesn't support per prio stats, and
+ * even if it did, offload can't be used with all async IO engines.
+ * If group reporting is set in the parent td, the group result
+ * generated by __show_run_stats() can still contain multiple prios
+ * from different offloaded jobs.
+ */
+ sw->wq->td->ts.disable_prio_stat = 1;
+ sum_thread_stats(&sw->wq->td->ts, &td->ts);
fio_options_free(td);
close_and_free_files(td);
sum_val(&dst->this_io_blocks[ddir], &src->this_io_blocks[ddir]);
sum_val(&dst->this_io_bytes[ddir], &src->this_io_bytes[ddir]);
sum_val(&dst->bytes_done[ddir], &src->bytes_done[ddir]);
+ if (ddir == DDIR_READ)
+ sum_val(&dst->bytes_verified, &src->bytes_verified);
pthread_double_unlock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock);
}
#include <stdio.h>
#include <stdlib.h>
+#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <poll.h>
static pthread_key_t sk_out_key;
+#ifdef WIN32
+static char *fio_server_pipe_name = NULL;
+static HANDLE hjob = INVALID_HANDLE_VALUE;
+/*
+ * Identifies a child tracked by the server on Windows. The child is either
+ * a thread or a process; is_thread says which union member is in use.
+ */
+struct ffi_element {
+ union {
+ pthread_t thread; /* used when is_thread is true */
+ HANDLE hProcess; /* used when is_thread is false */
+ };
+ bool is_thread;
+};
+#endif
+
struct fio_fork_item {
struct flist_head list;
int exitval;
int signal;
int exited;
+#ifdef WIN32
+ struct ffi_element element;
+#else
pid_t pid;
+#endif
};
struct cmd_reply {
return fio_sendv_data(sk, &iov, 1);
}
+/*
+ * Poll a single descriptor.
+ *
+ * Returns true only when poll() reports at least one of the requested
+ * @events on @fd within @timeout. A timeout, an interrupted call and any
+ * other poll() failure all return false; failures other than EINTR are
+ * logged.
+ */
+bool fio_server_poll_fd(int fd, short events, int timeout)
+{
+	struct pollfd pfd = {
+		.fd = fd,
+		.events = events,
+	};
+	int nready;
+
+	nready = poll(&pfd, 1, timeout);
+	if (nready > 0)
+		return (pfd.revents & events) != 0;
+
+	/* nready == 0 is a timeout; EINTR is expected and stays silent */
+	if (nready < 0 && errno != EINTR)
+		log_err("fio: poll: %s\n", strerror(errno));
+
+	return false;
+}
+
static int fio_recv_data(int sk, void *buf, unsigned int len, bool wait)
{
int flags;
if (cmdret->opcode == FIO_NET_CMD_TEXT) {
struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload;
char *buf = (char *) __pdu->buf;
+ int len = le32_to_cpu(__pdu->buf_len);
- buf[__pdu->buf_len] = '\0';
+ buf[len] = '\0';
} else if (cmdret->opcode == FIO_NET_CMD_JOB) {
struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload;
char *buf = (char *) __pdu->buf;
return fio_net_send_ack(NULL, error, signal);
}
+#ifdef WIN32
+/*
+ * Allocate a tracking entry for the child described by @element and append
+ * it to @list. The entry starts out as not exited, with no exit value and
+ * no signal recorded.
+ *
+ * If the allocation fails the child is not tracked and an error is logged.
+ */
+static void fio_server_add_fork_item(struct ffi_element *element, struct flist_head *list)
+{
+	struct fio_fork_item *ffi;
+
+	ffi = malloc(sizeof(*ffi));
+	if (!ffi) {
+		log_err("fio: failed to allocate fork item\n");
+		return;
+	}
+
+	ffi->exitval = 0;
+	ffi->signal = 0;
+	ffi->exited = 0;
+	ffi->element = *element;
+	flist_add_tail(&ffi->list, list);
+}
+
+/*
+ * Track the connection handling process @hProcess on @conn_list.
+ *
+ * The debug message reports the process id of the handle; the element is
+ * stored as a process, so the thread member of the union is never read.
+ */
+static void fio_server_add_conn_pid(struct flist_head *conn_list, HANDLE hProcess)
+{
+	struct ffi_element element = {.hProcess = hProcess, .is_thread=FALSE};
+
+	dprint(FD_NET, "server: forked off connection job (pid=%u)\n",
+	       (unsigned int) GetProcessId(hProcess));
+
+	fio_server_add_fork_item(&element, conn_list);
+}
+
+/* Track the job thread @thread on @job_list. */
+static void fio_server_add_job_pid(struct flist_head *job_list, pthread_t thread)
+{
+	struct ffi_element job = {.thread = thread, .is_thread=TRUE};
+
+	dprint(FD_NET, "server: forked off job job (tid=%u)\n",
+	       (int) job.thread);
+
+	fio_server_add_fork_item(&job, job_list);
+}
+
+/*
+ * Check whether the child tracked by @ffi has finished and, if so, record
+ * its exit value and mark the entry as exited.
+ *
+ * Threads are probed with pthread_kill(..., 0) and joined once that call
+ * reports an error. Processes are queried with GetExitCodeProcess(); if
+ * that query fails the process is treated as still running.
+ */
+static void fio_server_check_fork_item(struct fio_fork_item *ffi)
+{
+	if (ffi->element.is_thread) {
+		void *thread_ret = NULL;
+
+		if (!pthread_kill(ffi->element.thread, 0))
+			return;
+
+		/*
+		 * The thread is dead, join it to get its status. The join
+		 * stores a full pointer, so it must be received in a void *
+		 * and narrowed afterwards; receiving it in an int would
+		 * overwrite adjacent stack memory on 64-bit builds.
+		 */
+		pthread_join(ffi->element.thread, &thread_ret);
+
+		ffi->exitval = (int) (intptr_t) thread_ret;
+		if (ffi->exitval)
+			log_err("thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+		dprint(FD_PROCESS, "thread (tid=%u) exited with %x\n", (int) ffi->element.thread, (int) ffi->exitval);
+		ffi->exited = 1;
+	} else {
+		/* stays STILL_ACTIVE if the query below fails */
+		DWORD exit_val = STILL_ACTIVE;
+
+		if (!GetExitCodeProcess(ffi->element.hProcess, &exit_val))
+			return;
+
+		if (exit_val != STILL_ACTIVE) {
+			dprint(FD_PROCESS, "process %u exited with %d\n", GetProcessId(ffi->element.hProcess), exit_val);
+			ffi->exited = 1;
+			ffi->exitval = exit_val;
+		}
+	}
+}
+#else
static void fio_server_add_fork_item(pid_t pid, struct flist_head *list)
{
struct fio_fork_item *ffi;
}
}
}
+#endif
static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop)
{
+#ifdef WIN32
+ if (ffi->element.is_thread)
+ dprint(FD_NET, "tid %u exited, sig=%u, exitval=%d\n", (int) ffi->element.thread, ffi->signal, ffi->exitval);
+ else {
+ dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) GetProcessId(ffi->element.hProcess), ffi->signal, ffi->exitval);
+ CloseHandle(ffi->element.hProcess);
+ ffi->element.hProcess = INVALID_HANDLE_VALUE;
+ }
+#else
dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval);
+#endif
/*
* Fold STOP and QUIT...
return 0;
}
-static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
- struct fio_net_cmd *cmd)
+#ifdef WIN32
+static void *fio_backend_thread(void *data)
{
- pid_t pid;
int ret;
+ struct sk_out *sk_out = (struct sk_out *) data;
sk_out_assign(sk_out);
+ ret = fio_backend(sk_out);
+ sk_out_drop();
+
+ pthread_exit((void*) (intptr_t) ret);
+ return NULL;
+}
+#endif
+
+static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
+ struct fio_net_cmd *cmd)
+{
+ int ret;
+
fio_time_init();
set_genesis_time();
- pid = fork();
- if (pid) {
- fio_server_add_job_pid(job_list, pid);
- return 0;
+#ifdef WIN32
+ {
+ pthread_t thread;
+ /* both this thread and backend_thread call sk_out_assign() to double increment
+ * the ref count. This ensures struct is valid until both threads are done with it
+ */
+ sk_out_assign(sk_out);
+ ret = pthread_create(&thread, NULL, fio_backend_thread, sk_out);
+ if (ret) {
+ log_err("pthread_create: %s\n", strerror(ret));
+ return ret;
+ }
+
+ fio_server_add_job_pid(job_list, thread);
+ return ret;
}
+#else
+ {
+ pid_t pid;
+ sk_out_assign(sk_out);
+ pid = fork();
+ if (pid) {
+ fio_server_add_job_pid(job_list, pid);
+ return 0;
+ }
- ret = fio_backend(sk_out);
- free_threads_shm();
- sk_out_drop();
- _exit(ret);
+ ret = fio_backend(sk_out);
+ free_threads_shm();
+ sk_out_drop();
+ _exit(ret);
+ }
+#endif
}
static int handle_job_cmd(struct fio_net_cmd *cmd)
.os = FIO_OS,
.arch = FIO_ARCH,
.bpp = sizeof(void *),
- .cpus = __cpu_to_le32(cpus_online()),
+ .cpus = __cpu_to_le32(cpus_configured()),
};
dprint(FD_NET, "server: sending probe reply\n");
struct cmd_add_job_pdu *pdu = (struct cmd_add_job_pdu *) cmd->payload;
struct thread_data *td;
uint32_t tnumber;
+ int ret;
tnumber = le32_to_cpu(pdu->thread_number);
return 0;
}
- td = &threads[tnumber - 1];
- convert_thread_options_to_cpu(&td->o, &pdu->top);
- send_update_job_reply(cmd->tag, 0);
+ td = tnumber_to_td(tnumber);
+ ret = convert_thread_options_to_cpu(&td->o, &pdu->top,
+ cmd->pdu_len - offsetof(struct cmd_add_job_pdu, top));
+ send_update_job_reply(cmd->tag, ret);
return 0;
}
sk_unlock(sk_out);
while (!flist_empty(&list)) {
- entry = flist_entry(list.next, struct sk_entry, list);
+ entry = flist_first_entry(&list, struct sk_entry, list);
flist_del(&entry->list);
ret += handle_sk_entry(sk_out, entry);
}
if (ret < 0)
break;
- cmd = fio_net_recv_cmd(sk_out->sk, true);
+ if (pfd.revents & POLLIN)
+ cmd = fio_net_recv_cmd(sk_out->sk, true);
if (!cmd) {
ret = -1;
break;
return 0;
}
+#ifdef WIN32
+/*
+ * Entry point of a Windows connection handling child process.
+ *
+ * Opens the pipe named by fio_server_pipe_name, reads the socket protocol
+ * info from it, recreates the socket from that info, writes "connected"
+ * back on the pipe and then serves the connection.
+ *
+ * Returns the result of handle_connection(), or -1 if the pipe cannot be
+ * opened, read or written, if the socket cannot be created, or if the
+ * sk_out allocation fails.
+ */
+static int handle_connection_process(void)
+{
+	WSAPROTOCOL_INFO protocol_info;
+	DWORD bytes_read = 0;
+	DWORD bytes_written = 0;
+	HANDLE hpipe;
+	int sk;
+	struct sk_out *sk_out;
+	int ret;
+	char *msg = (char *) "connected";
+
+	log_info("server enter accept loop.  ProcessID %d\n", GetCurrentProcessId());
+
+	hpipe = CreateFile(
+			fio_server_pipe_name,
+			GENERIC_READ | GENERIC_WRITE,
+			0, NULL,
+			OPEN_EXISTING,
+			0, NULL);
+
+	if (hpipe == INVALID_HANDLE_VALUE) {
+		log_err("couldnt open pipe %s error %lu\n",
+			fio_server_pipe_name, GetLastError());
+		return -1;
+	}
+
+	/* without a complete protocol_info there is no socket to serve */
+	if (!ReadFile(hpipe, &protocol_info, sizeof(protocol_info), &bytes_read, NULL) ||
+	    bytes_read != sizeof(protocol_info)) {
+		log_err("couldnt read pi from pipe %s error %lu\n", fio_server_pipe_name,
+			GetLastError());
+		CloseHandle(hpipe);
+		return -1;
+	}
+
+	if (use_ipv6) /* use protocol_info to create a duplicate of parents socket */
+		sk = WSASocket(AF_INET6, SOCK_STREAM, 0, &protocol_info, 0, 0);
+	else
+		sk = WSASocket(AF_INET,  SOCK_STREAM, 0, &protocol_info, 0, 0);
+
+	if (sk < 0) {
+		log_err("couldnt create socket from pipe %s error %lu\n",
+			fio_server_pipe_name, GetLastError());
+		CloseHandle(hpipe);
+		return -1;
+	}
+
+	sk_out = scalloc(1, sizeof(*sk_out));
+	if (!sk_out) {
+		CloseHandle(hpipe);
+		close(sk);
+		return -1;
+	}
+
+	sk_out->sk = sk;
+	sk_out->hProcess = INVALID_HANDLE_VALUE;
+	INIT_FLIST_HEAD(&sk_out->list);
+	__fio_sem_init(&sk_out->lock, FIO_SEM_UNLOCKED);
+	__fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
+	__fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
+
+	get_my_addr_str(sk);
+
+	/*
+	 * The byte count pointer may only be NULL for overlapped writes, and
+	 * this pipe is opened for synchronous I/O, so always pass one.
+	 */
+	if (!WriteFile(hpipe, msg, strlen(msg), &bytes_written, NULL)) {
+		log_err("couldnt write pipe\n");
+		CloseHandle(hpipe);
+		close(sk);
+		return -1;
+	}
+	CloseHandle(hpipe);
+
+	sk_out_assign(sk_out);
+
+	ret = handle_connection(sk_out);
+	__sk_out_drop(sk_out);
+	return ret;
+}
+#endif
+
static int accept_loop(int listen_sk)
{
struct sockaddr_in addr;
struct sk_out *sk_out;
const char *from;
char buf[64];
+#ifdef WIN32
+ HANDLE hProcess;
+#else
pid_t pid;
-
+#endif
pfd.fd = listen_sk;
pfd.events = POLLIN;
do {
__fio_sem_init(&sk_out->wait, FIO_SEM_LOCKED);
__fio_sem_init(&sk_out->xmit, FIO_SEM_UNLOCKED);
+#ifdef WIN32
+ hProcess = windows_handle_connection(hjob, sk);
+ if (hProcess == INVALID_HANDLE_VALUE)
+ return -1;
+ sk_out->hProcess = hProcess;
+ fio_server_add_conn_pid(&conn_list, hProcess);
+#else
pid = fork();
if (pid) {
close(sk);
*/
sk_out_assign(sk_out);
handle_connection(sk_out);
+#endif
}
return exitval;
{
struct cmd_ts_pdu p;
int i, j, k;
- void *ss_buf;
- uint64_t *ss_iops, *ss_bw;
+ size_t clat_prio_stats_extra_size = 0;
+ size_t ss_extra_size = 0;
+ size_t extended_buf_size = 0;
+ void *extended_buf;
+ void *extended_buf_wp;
dprint(FD_NET, "server sending end stats\n");
p.ts.error = cpu_to_le32(ts->error);
p.ts.thread_number = cpu_to_le32(ts->thread_number);
p.ts.groupid = cpu_to_le32(ts->groupid);
+ p.ts.job_start = cpu_to_le64(ts->job_start);
p.ts.pid = cpu_to_le32(ts->pid);
p.ts.members = cpu_to_le32(ts->members);
p.ts.unified_rw_rep = cpu_to_le32(ts->unified_rw_rep);
+ p.ts.ioprio = cpu_to_le32(ts->ioprio);
+ p.ts.disable_prio_stat = cpu_to_le32(ts->disable_prio_stat);
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
convert_io_stat(&p.ts.clat_stat[i], &ts->clat_stat[i]);
p.ts.cachehit = cpu_to_le64(ts->cachehit);
p.ts.cachemiss = cpu_to_le64(ts->cachemiss);
+ convert_gs(&p.rs, rs);
+
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
- for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
- p.ts.io_u_plat_high_prio[i][j] = cpu_to_le64(ts->io_u_plat_high_prio[i][j]);
- p.ts.io_u_plat_low_prio[i][j] = cpu_to_le64(ts->io_u_plat_low_prio[i][j]);
+ if (ts->nr_clat_prio[i])
+ clat_prio_stats_extra_size += ts->nr_clat_prio[i] * sizeof(*ts->clat_prio[i]);
+ }
+ extended_buf_size += clat_prio_stats_extra_size;
+
+ dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
+ if (ts->ss_state & FIO_SS_DATA)
+ ss_extra_size = 2 * ts->ss_dur * sizeof(uint64_t);
+
+ extended_buf_size += ss_extra_size;
+ if (!extended_buf_size) {
+ fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+ return;
+ }
+
+ extended_buf_size += sizeof(p);
+ extended_buf = calloc(1, extended_buf_size);
+ if (!extended_buf) {
+ log_err("fio: failed to allocate FIO_NET_CMD_TS buffer\n");
+ return;
+ }
+
+ memcpy(extended_buf, &p, sizeof(p));
+ extended_buf_wp = (struct cmd_ts_pdu *)extended_buf + 1;
+
+ if (clat_prio_stats_extra_size) {
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ struct clat_prio_stat *prio = (struct clat_prio_stat *) extended_buf_wp;
+
+ for (j = 0; j < ts->nr_clat_prio[i]; j++) {
+ for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
+ prio->io_u_plat[k] =
+ cpu_to_le64(ts->clat_prio[i][j].io_u_plat[k]);
+ convert_io_stat(&prio->clat_stat,
+ &ts->clat_prio[i][j].clat_stat);
+ prio->ioprio = cpu_to_le32(ts->clat_prio[i][j].ioprio);
+ prio++;
+ }
+
+ if (ts->nr_clat_prio[i]) {
+ uint64_t offset = (char *)extended_buf_wp - (char *)extended_buf;
+ struct cmd_ts_pdu *ptr = extended_buf;
+
+ ptr->ts.clat_prio_offset[i] = cpu_to_le64(offset);
+ ptr->ts.nr_clat_prio[i] = cpu_to_le32(ts->nr_clat_prio[i]);
+ }
+
+ extended_buf_wp = prio;
}
- convert_io_stat(&p.ts.clat_high_prio_stat[i], &ts->clat_high_prio_stat[i]);
- convert_io_stat(&p.ts.clat_low_prio_stat[i], &ts->clat_low_prio_stat[i]);
}
- convert_gs(&p.rs, rs);
+ if (ss_extra_size) {
+ uint64_t *ss_iops, *ss_bw;
+ uint64_t offset;
+ struct cmd_ts_pdu *ptr = extended_buf;
- dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
- if (ts->ss_state & FIO_SS_DATA) {
dprint(FD_NET, "server sending steadystate ring buffers\n");
- ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t));
+ /* ss iops */
+ ss_iops = (uint64_t *) extended_buf_wp;
+ for (i = 0; i < ts->ss_dur; i++)
+ ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
- memcpy(ss_buf, &p, sizeof(p));
+ offset = (char *)extended_buf_wp - (char *)extended_buf;
+ ptr->ts.ss_iops_data_offset = cpu_to_le64(offset);
+ extended_buf_wp = ss_iops + (int) ts->ss_dur;
- ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1);
- ss_bw = ss_iops + (int) ts->ss_dur;
- for (i = 0; i < ts->ss_dur; i++) {
- ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
+ /* ss bw */
+ ss_bw = extended_buf_wp;
+ for (i = 0; i < ts->ss_dur; i++)
ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]);
- }
-
- fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY);
- free(ss_buf);
+ offset = (char *)extended_buf_wp - (char *)extended_buf;
+ ptr->ts.ss_bw_data_offset = cpu_to_le64(offset);
}
- else
- fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
+
+ fio_net_queue_cmd(FIO_NET_CMD_TS, extended_buf, extended_buf_size, NULL, SK_F_COPY);
+ free(extended_buf);
}
void fio_server_send_gs(struct group_run_stats *rs)
break;
}
flist_add_tail(&entry->list, &first->next);
- } while (ret != Z_STREAM_END);
+ }
ret = deflateEnd(&stream);
if (ret == Z_OK)
.thread_number = cpu_to_le32(td->thread_number),
.log_type = cpu_to_le32(log->log_type),
.log_hist_coarseness = cpu_to_le32(log->hist_coarseness),
+ .per_job_logs = cpu_to_le32(td->o.per_job_logs),
};
struct sk_entry *first;
struct flist_head *entry;
struct io_sample *s = get_sample(log, cur_log, i);
s->time = cpu_to_le64(s->time);
- s->data.val = cpu_to_le64(s->data.val);
+ if (log->log_type != IO_LOG_TYPE_HIST) {
+ s->data.val.val0 = cpu_to_le64(s->data.val.val0);
+ s->data.val.val1 = cpu_to_le64(s->data.val.val1);
+ }
s->__ddir = __cpu_to_le32(s->__ddir);
s->bs = cpu_to_le64(s->bs);
void fio_server_send_add_job(struct thread_data *td)
{
- struct cmd_add_job_pdu pdu = {
- .thread_number = cpu_to_le32(td->thread_number),
- .groupid = cpu_to_le32(td->groupid),
- };
+ struct cmd_add_job_pdu *pdu;
+ size_t cmd_sz = offsetof(struct cmd_add_job_pdu, top) +
+ thread_options_pack_size(&td->o);
- convert_thread_options_to_net(&pdu.top, &td->o);
+ pdu = malloc(cmd_sz);
+ pdu->thread_number = cpu_to_le32(td->thread_number);
+ pdu->groupid = cpu_to_le32(td->groupid);
- fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL,
- SK_F_COPY);
+ convert_thread_options_to_net(&pdu->top, &td->o);
+
+ fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, pdu, cmd_sz, NULL, SK_F_COPY);
+ free(pdu);
}
void fio_server_send_start(struct thread_data *td)
{
struct sk_out *sk_out = pthread_getspecific(sk_out_key);
- assert(sk_out->sk != -1);
+ if (sk_out->sk == -1) {
+ log_err("pthread getting specific for key failed, sk_out %p, sk %i, err: %i:%s",
+ sk_out, sk_out->sk, errno, strerror(errno));
+ abort();
+ }
fio_net_queue_cmd(FIO_NET_CMD_SERVER_START, NULL, 0, NULL, SK_F_SIMPLE);
}
};
sigaction(SIGINT, &act, NULL);
+
+ /* Windows uses SIGBREAK as a quit signal from other applications */
+#ifdef WIN32
+ sigaction(SIGBREAK, &act, NULL);
+#endif
}
void fio_server_destroy_sk_key(void)
if (fio_handle_server_arg())
return -1;
+ set_sig_handlers();
+
+#ifdef WIN32
+ /* if this is a child process, go handle the connection */
+ if (fio_server_pipe_name != NULL) {
+ ret = handle_connection_process();
+ return ret;
+ }
+
+ /* job to link child processes so they terminate together */
+ hjob = windows_create_job();
+ if (hjob == INVALID_HANDLE_VALUE)
+ return -1;
+#endif
+
sk = fio_init_server_connection();
if (sk < 0)
return -1;
- set_sig_handlers();
-
ret = accept_loop(sk);
close(sk);
*/
int fio_start_server(char *pidfile)
{
+ FILE *file;
pid_t pid;
int ret;
setsid();
openlog("fio", LOG_NDELAY|LOG_NOWAIT|LOG_PID, LOG_USER);
log_syslog = true;
- close(STDIN_FILENO);
- close(STDOUT_FILENO);
- close(STDERR_FILENO);
+
+ file = freopen("/dev/null", "r", stdin);
+ if (!file)
+ perror("freopen");
+
+ file = freopen("/dev/null", "w", stdout);
+ if (!file)
+ perror("freopen");
+
+ file = freopen("/dev/null", "w", stderr);
+ if (!file)
+ perror("freopen");
+
f_out = NULL;
f_err = NULL;
ret = fio_server();
+ fclose(stdin);
+ fclose(stdout);
+ fclose(stderr);
+
closelog();
unlink(pidfile);
free(pidfile);
{
fio_server_arg = strdup(arg);
}
+
+#ifdef WIN32
+/* Store a private copy of @arg as the server pipe name. */
+void fio_server_internal_set(const char *arg)
+{
+	char *pipe_name = strdup(arg);
+
+	fio_server_pipe_name = pipe_name;
+}
+#endif
unsigned int refs; /* frees sk_out when it drops to zero.
* protected by below ->lock */
+#ifdef WIN32
+ HANDLE hProcess; /* process handle of handle_connection_process*/
+#endif
int sk; /* socket fd to talk to client */
struct fio_sem lock; /* protects ref and below list */
struct flist_head list; /* list of pending transmit work */
};
enum {
- FIO_SERVER_VER = 86,
+ FIO_SERVER_VER = 104,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
FIO_SERVER_MAX_CMD_MB = 2048,
uint32_t log_type;
uint32_t compressed;
uint32_t log_offset;
+ uint32_t log_prio;
uint32_t log_hist_coarseness;
+ uint32_t per_job_logs;
uint8_t name[FIO_NET_NAME_MAX];
struct io_sample samples[0];
};
extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
extern int fio_net_send_simple_cmd(int, uint16_t, uint64_t, struct flist_head *);
extern void fio_server_set_arg(const char *);
+extern void fio_server_internal_set(const char *);
extern int fio_server_parse_string(const char *, char **, bool *, int *, struct in_addr *, struct in6_addr *, int *);
extern int fio_server_parse_host(const char *, int, struct in_addr *, struct in6_addr *);
extern const char *fio_server_op(unsigned int);
extern void fio_server_send_du(void);
extern void fio_server_send_job_options(struct flist_head *, unsigned int);
extern int fio_server_get_verify_state(const char *, int, void **);
+extern bool fio_server_poll_fd(int fd, short events, int timeout);
extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait);
if (hdr->prered != SMALLOC_PRE_RED) {
log_err("smalloc pre redzone destroyed!\n"
" ptr=%p, prered=%x, expected %x\n",
- hdr, hdr->prered, SMALLOC_PRE_RED);
+ hdr+1, hdr->prered, SMALLOC_PRE_RED);
assert(0);
}
if (*postred != SMALLOC_POST_RED) {
log_err("smalloc post redzone destroyed!\n"
" ptr=%p, postred=%x, expected %x\n",
- hdr, *postred, SMALLOC_POST_RED);
+ hdr+1, *postred, SMALLOC_POST_RED);
assert(0);
}
}
#include <stdio.h>
#include <string.h>
+#include <stdlib.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <math.h>
#include "zbd.h"
#include "oslib/asprintf.h"
+#ifdef WIN32
+#define LOG_MSEC_SLACK 2
+#else
#define LOG_MSEC_SLACK 1
+#endif
struct fio_sem *stat_sem;
len = calc_clat_percentiles(io_u_plat, nr, plist, &ovals, &maxv, &minv);
if (!len || !ovals)
- goto out;
+ return;
/*
* We default to nsecs, but if the value range is such that we
log_buf(out, "\n");
}
-out:
free(ovals);
}
+/*
+ * Count the per-priority latency entries of @ts for direction @ddir whose
+ * clat_stat has a non-zero sample count.
+ */
+static int get_nr_prios_with_samples(struct thread_stat *ts, enum fio_ddir ddir)
+{
+	int idx = 0, count = 0;
+
+	while (idx < ts->nr_clat_prio[ddir]) {
+		if (ts->clat_prio[ddir][idx].clat_stat.samples)
+			count++;
+		idx++;
+	}
+
+	return count;
+}
+
bool calc_lat(struct io_stat *is, unsigned long long *min,
unsigned long long *max, double *mean, double *dev)
{
return true;
}
+/*
+ * Print one " MIXED" summary line for a group by combining every data
+ * direction that actually ran (rs->max_run[i] != 0): I/O bytes and aggregate
+ * bandwidth are summed, while min/max bandwidth and min/max runtime are the
+ * extremes across the contributing directions. Each figure is formatted
+ * twice, once per unit system, selected by whether rs->kb_base is a power
+ * of two.
+ *
+ * Nothing is printed when no direction ran.
+ */
+void show_mixed_group_stats(struct group_run_stats *rs, struct buf_output *out)
+{
+	char *io, *agg, *min, *max;
+	char *ioalt, *aggalt, *minalt, *maxalt;
+	uint64_t io_mix = 0, agg_mix = 0, min_mix = -1, max_mix = 0;
+	uint64_t min_run = -1, max_run = 0;
+	const int i2p = is_power_of_2(rs->kb_base);
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if (!rs->max_run[i])
+			continue;
+		io_mix += rs->iobytes[i];
+		agg_mix += rs->agg[i];
+		min_mix = min_mix < rs->min_bw[i] ? min_mix : rs->min_bw[i];
+		max_mix = max_mix > rs->max_bw[i] ? max_mix : rs->max_bw[i];
+		min_run = min_run < rs->min_run[i] ? min_run : rs->min_run[i];
+		max_run = max_run > rs->max_run[i] ? max_run : rs->max_run[i];
+	}
+
+	/*
+	 * Every direction that contributed has a non-zero max_run, so a zero
+	 * max_run here means nothing was aggregated. Bail out rather than
+	 * reporting the untouched all-ones min_mix/min_run sentinels as if
+	 * they were measured values.
+	 */
+	if (!max_run)
+		return;
+
+	io = num2str(io_mix, rs->sig_figs, 1, i2p, N2S_BYTE);
+	ioalt = num2str(io_mix, rs->sig_figs, 1, !i2p, N2S_BYTE);
+	agg = num2str(agg_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+	aggalt = num2str(agg_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+	min = num2str(min_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+	minalt = num2str(min_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+	max = num2str(max_mix, rs->sig_figs, 1, i2p, rs->unit_base);
+	maxalt = num2str(max_mix, rs->sig_figs, 1, !i2p, rs->unit_base);
+	log_buf(out, " MIXED: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
+		agg, aggalt, min, max, minalt, maxalt, io, ioalt,
+		(unsigned long long) min_run,
+		(unsigned long long) max_run);
+
+	/* Release the formatted strings handed out by num2str(). */
+	free(io);
+	free(agg);
+	free(min);
+	free(max);
+	free(ioalt);
+	free(aggalt);
+	free(minalt);
+	free(maxalt);
+}
+
void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
{
char *io, *agg, *min, *max;
max = num2str(rs->max_bw[i], rs->sig_figs, 1, i2p, rs->unit_base);
maxalt = num2str(rs->max_bw[i], rs->sig_figs, 1, !i2p, rs->unit_base);
log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
- rs->unified_rw_rep ? " MIXED" : str[i],
+ (rs->unified_rw_rep == UNIFIED_MIXED) ? " MIXED" : str[i],
agg, aggalt, min, max, minalt, maxalt, io, ioalt,
(unsigned long long) rs->min_run[i],
(unsigned long long) rs->max_run[i]);
free(minalt);
free(maxalt);
}
+
+ /* Need to aggregate statistics to show mixed values */
+ if (rs->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_group_stats(rs, out);
}
void stat_calc_dist(uint64_t *map, unsigned long total, double *io_u_dist)
free(maxp);
}
-static double convert_agg_kbytes_percent(struct group_run_stats *rs, int ddir, int mean)
+/*
+ * Build a freshly malloc()ed thread_stat holding the "mixed" view of 'ts'.
+ * The new structure is initialised, marked UNIFIED_MIXED, given the
+ * percentile settings of 'ts', and then filled through sum_thread_stats().
+ *
+ * Returns NULL (after logging an error) if the allocation fails.
+ */
+static struct thread_stat *gen_mixed_ddir_stats_from_ts(struct thread_stat *ts)
+{
+	struct thread_stat *mixed = malloc(sizeof(*mixed));
+
+	if (mixed == NULL) {
+		log_err("fio: failed to allocate local thread stat\n");
+		return NULL;
+	}
+
+	init_thread_stat(mixed);
+
+	/*
+	 * Flag the destination as mixed before summing, so that Reads
+	 * (ddir = 0), Writes (ddir = 1) and Trims (ddir = 2) get aggregated.
+	 */
+	mixed->unified_rw_rep = UNIFIED_MIXED;
+	mixed->percentile_precision = ts->percentile_precision;
+	mixed->slat_percentiles = ts->slat_percentiles;
+	mixed->clat_percentiles = ts->clat_percentiles;
+	mixed->lat_percentiles = ts->lat_percentiles;
+	memcpy(mixed->percentile_list, ts->percentile_list,
+	       sizeof(ts->percentile_list));
+
+	sum_thread_stats(mixed, ts);
+
+	return mixed;
+}
+
+static double convert_agg_kbytes_percent(struct group_run_stats *rs,
+ enum fio_ddir ddir, int mean)
{
double p_of_agg = 100.0;
if (rs && rs->agg[ddir] > 1024) {
- p_of_agg = mean * 100 / (double) (rs->agg[ddir] / 1024.0);
+ p_of_agg = mean * 100.0 / (double) (rs->agg[ddir] / 1024.0);
if (p_of_agg > 100.0)
p_of_agg = 100.0;
}
static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
- int ddir, struct buf_output *out)
+ enum fio_ddir ddir, struct buf_output *out)
{
unsigned long runt;
unsigned long long min, max, bw, iops;
double mean, dev;
char *io_p, *bw_p, *bw_p_alt, *iops_p, *post_st = NULL;
- int i2p;
+ int i2p, i;
+ const char *clat_type = ts->lat_percentiles ? "lat" : "clat";
if (ddir_sync(ddir)) {
if (calc_lat(&ts->sync_stat, &min, &max, &mean, &dev)) {
iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt;
iops_p = num2str(iops, ts->sig_figs, 1, 0, N2S_NONE);
- if (ddir == DDIR_WRITE)
+ if (ddir == DDIR_WRITE || ddir == DDIR_TRIM)
post_st = zbd_write_status(ts);
else if (ddir == DDIR_READ && ts->cachehit && ts->cachemiss) {
uint64_t total;
}
log_buf(out, " %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)%s\n",
- rs->unified_rw_rep ? "mixed" : io_ddir_name(ddir),
+ (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir),
iops_p, bw_p, bw_p_alt, io_p,
(unsigned long long) ts->runtime[ddir],
post_st ? : "");
display_lat("clat", min, max, mean, dev, out);
if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
display_lat(" lat", min, max, mean, dev, out);
- if (calc_lat(&ts->clat_high_prio_stat[ddir], &min, &max, &mean, &dev)) {
- display_lat(ts->lat_percentiles ? "high prio_lat" : "high prio_clat",
- min, max, mean, dev, out);
- if (calc_lat(&ts->clat_low_prio_stat[ddir], &min, &max, &mean, &dev))
- display_lat(ts->lat_percentiles ? "low prio_lat" : "low prio_clat",
- min, max, mean, dev, out);
+
+ /* Only print per prio stats if there are >= 2 prios with samples */
+ if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+ for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+ char buf[64];
+
+ if (!calc_lat(&ts->clat_prio[ddir][i].clat_stat, &min,
+ &max, &mean, &dev))
+ continue;
+
+ snprintf(buf, sizeof(buf),
+ "%s prio %u/%u/%u",
+ clat_type,
+ ioprio_class(ts->clat_prio[ddir][i].ioprio),
+ ioprio(ts->clat_prio[ddir][i].ioprio),
+ ioprio_hint(ts->clat_prio[ddir][i].ioprio));
+ display_lat(buf, min, max, mean, dev, out);
+ }
}
if (ts->slat_percentiles && ts->slat_stat[ddir].samples > 0)
ts->percentile_precision, "lat", out);
if (ts->clat_percentiles || ts->lat_percentiles) {
- const char *name = ts->lat_percentiles ? "lat" : "clat";
- char prio_name[32];
+ char prio_name[64];
uint64_t samples;
if (ts->lat_percentiles)
else
samples = ts->clat_stat[ddir].samples;
- /* Only print this if some high and low priority stats were collected */
- if (ts->clat_high_prio_stat[ddir].samples > 0 &&
- ts->clat_low_prio_stat[ddir].samples > 0)
- {
- sprintf(prio_name, "high prio (%.2f%%) %s",
- 100. * (double) ts->clat_high_prio_stat[ddir].samples / (double) samples,
- name);
- show_clat_percentiles(ts->io_u_plat_high_prio[ddir],
- ts->clat_high_prio_stat[ddir].samples,
- ts->percentile_list,
- ts->percentile_precision, prio_name, out);
-
- sprintf(prio_name, "low prio (%.2f%%) %s",
- 100. * (double) ts->clat_low_prio_stat[ddir].samples / (double) samples,
- name);
- show_clat_percentiles(ts->io_u_plat_low_prio[ddir],
- ts->clat_low_prio_stat[ddir].samples,
- ts->percentile_list,
- ts->percentile_precision, prio_name, out);
+ /* Only print per prio stats if there are >= 2 prios with samples */
+ if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+ for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+ uint64_t prio_samples =
+ ts->clat_prio[ddir][i].clat_stat.samples;
+
+ if (!prio_samples)
+ continue;
+
+ snprintf(prio_name, sizeof(prio_name),
+ "%s prio %u/%u/%u (%.2f%% of IOs)",
+ clat_type,
+ ioprio_class(ts->clat_prio[ddir][i].ioprio),
+ ioprio(ts->clat_prio[ddir][i].ioprio),
+ ioprio_hint(ts->clat_prio[ddir][i].ioprio),
+ 100. * (double) prio_samples / (double) samples);
+ show_clat_percentiles(ts->clat_prio[ddir][i].io_u_plat,
+ prio_samples, ts->percentile_list,
+ ts->percentile_precision,
+ prio_name, out);
+ }
}
}
}
}
+/*
+ * Print the human-readable per-direction status for the mixed aggregate of
+ * 'ts'. The aggregate is built by gen_mixed_ddir_stats_from_ts(), shown via
+ * show_ddir_status() under DDIR_READ, and released again before returning.
+ */
+static void show_mixed_ddir_status(struct group_run_stats *rs,
+				   struct thread_stat *ts,
+				   struct buf_output *out)
+{
+	struct thread_stat *mixed = gen_mixed_ddir_stats_from_ts(ts);
+
+	/* Allocation failed: nothing to show and nothing to release. */
+	if (mixed == NULL)
+		return;
+
+	show_ddir_status(rs, mixed, DDIR_READ, out);
+
+	free_clat_prio_stats(mixed);
+	free(mixed);
+}
+
static bool show_lat(double *io_u_lat, int nr, const char **ranges,
const char *msg, struct buf_output *out)
{
return;
if (!terse) {
- log_buf(out, ", aggrios=%llu/%llu, aggrmerge=%llu/%llu, "
- "aggrticks=%llu/%llu, aggrin_queue=%llu, "
- "aggrutil=%3.2f%%",
+ log_buf(out, ", aggrios=%llu/%llu, aggsectors=%llu/%llu, "
+ "aggrmerge=%llu/%llu, aggrticks=%llu/%llu, "
+ "aggrin_queue=%llu, aggrutil=%3.2f%%",
(unsigned long long) agg->ios[0] / agg->slavecount,
(unsigned long long) agg->ios[1] / agg->slavecount,
+ (unsigned long long) agg->sectors[0] / agg->slavecount,
+ (unsigned long long) agg->sectors[1] / agg->slavecount,
(unsigned long long) agg->merges[0] / agg->slavecount,
(unsigned long long) agg->merges[1] / agg->slavecount,
(unsigned long long) agg->ticks[0] / agg->slavecount,
if (agg->slavecount)
log_buf(out, " ");
- log_buf(out, " %s: ios=%llu/%llu, merge=%llu/%llu, "
- "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%",
+ log_buf(out, " %s: ios=%llu/%llu, sectors=%llu/%llu, "
+ "merge=%llu/%llu, ticks=%llu/%llu, in_queue=%llu, "
+ "util=%3.2f%%",
dus->name,
(unsigned long long) dus->s.ios[0],
(unsigned long long) dus->s.ios[1],
+ (unsigned long long) dus->s.sectors[0],
+ (unsigned long long) dus->s.sectors[1],
(unsigned long long) dus->s.merges[0],
(unsigned long long) dus->s.merges[1],
(unsigned long long) dus->s.ticks[0],
json_object_add_value_string(obj, "name", (const char *)dus->name);
json_object_add_value_int(obj, "read_ios", dus->s.ios[0]);
json_object_add_value_int(obj, "write_ios", dus->s.ios[1]);
+ json_object_add_value_int(obj, "read_sectors", dus->s.sectors[0]);
+ json_object_add_value_int(obj, "write_sectors", dus->s.sectors[1]);
json_object_add_value_int(obj, "read_merges", dus->s.merges[0]);
json_object_add_value_int(obj, "write_merges", dus->s.merges[1]);
json_object_add_value_int(obj, "read_ticks", dus->s.ticks[0]);
agg->ios[0] / agg->slavecount);
json_object_add_value_int(obj, "aggr_write_ios",
agg->ios[1] / agg->slavecount);
+ json_object_add_value_int(obj, "aggr_read_sectors",
+ agg->sectors[0] / agg->slavecount);
+ json_object_add_value_int(obj, "aggr_write_sectors",
+ agg->sectors[1] / agg->slavecount);
json_object_add_value_int(obj, "aggr_read_merges",
agg->merges[0] / agg->slavecount);
json_object_add_value_int(obj, "aggr_write_merge",
if (!is_running_backend())
return;
- if (flist_empty(&disk_list)) {
+ if (flist_empty(&disk_list))
return;
- }
if ((output_format & FIO_OUTPUT_JSON) && parent)
do_json = true;
if (!terse && !do_json)
log_buf(out, "\nDisk stats (read/write):\n");
- if (do_json)
+ if (do_json) {
json_object_add_disk_utils(parent, &disk_list);
- else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
+ } else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
flist_for_each(entry, &disk_list) {
du = flist_entry(entry, struct disk_util, list);
show_ddir_status(rs, ts, ddir, out);
}
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_ddir_status(rs, ts, out);
+
show_latencies(ts, out);
if (ts->sync_stat.samples)
}
static void show_ddir_status_terse(struct thread_stat *ts,
- struct group_run_stats *rs, int ddir,
- int ver, struct buf_output *out)
+ struct group_run_stats *rs,
+ enum fio_ddir ddir, int ver,
+ struct buf_output *out)
{
unsigned long long min, max, minv, maxv, bw, iops;
unsigned long long *ovals = NULL;
else
log_buf(out, ";%llu;%llu;%f;%f", 0ULL, 0ULL, 0.0, 0.0);
- if (ts->lat_percentiles)
+ if (ts->lat_percentiles) {
len = calc_clat_percentiles(ts->io_u_plat[FIO_LAT][ddir],
ts->lat_stat[ddir].samples,
ts->percentile_list, &ovals, &maxv,
&minv);
- else if (ts->clat_percentiles)
+ } else if (ts->clat_percentiles) {
len = calc_clat_percentiles(ts->io_u_plat[FIO_CLAT][ddir],
ts->clat_stat[ddir].samples,
ts->percentile_list, &ovals, &maxv,
&minv);
- else
+ } else {
len = 0;
+ }
for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
if (i >= len) {
}
log_buf(out, ";%llu;%llu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
- } else
+ } else {
log_buf(out, ";%llu;%llu;%f%%;%f;%f", 0ULL, 0ULL, 0.0, 0.0, 0.0);
+ }
if (ver == 5) {
if (bw_stat)
}
}
-static struct json_object *add_ddir_lat_json(struct thread_stat *ts, uint32_t percentiles,
- struct io_stat *lat_stat, uint64_t *io_u_plat)
+/*
+ * Emit the terse-format status for the mixed aggregate of 'ts'. The
+ * aggregate is built by gen_mixed_ddir_stats_from_ts(), printed via
+ * show_ddir_status_terse() under DDIR_READ, and released before returning.
+ */
+static void show_mixed_ddir_status_terse(struct thread_stat *ts,
+					 struct group_run_stats *rs,
+					 int ver, struct buf_output *out)
+{
+	struct thread_stat *mixed = gen_mixed_ddir_stats_from_ts(ts);
+
+	/* Allocation failed: nothing to print and nothing to release. */
+	if (mixed == NULL)
+		return;
+
+	show_ddir_status_terse(mixed, rs, DDIR_READ, ver, out);
+
+	free_clat_prio_stats(mixed);
+	free(mixed);
+}
+
+static struct json_object *add_ddir_lat_json(struct thread_stat *ts,
+ uint32_t percentiles,
+ struct io_stat *lat_stat,
+ uint64_t *io_u_plat)
{
char buf[120];
double mean, dev;
}
static void add_ddir_status_json(struct thread_stat *ts,
- struct group_run_stats *rs, int ddir, struct json_object *parent)
+ struct group_run_stats *rs, enum fio_ddir ddir,
+ struct json_object *parent)
{
unsigned long long min, max;
unsigned long long bw_bytes, bw;
assert(ddir_rw(ddir) || ddir_sync(ddir));
- if (ts->unified_rw_rep && ddir != DDIR_READ)
+ if ((ts->unified_rw_rep == UNIFIED_MIXED) && ddir != DDIR_READ)
return;
dir_object = json_create_object();
json_object_add_value_object(parent,
- ts->unified_rw_rep ? "mixed" : io_ddir_name(ddir), dir_object);
+ (ts->unified_rw_rep == UNIFIED_MIXED) ? "mixed" : io_ddir_name(ddir), dir_object);
if (ddir_rw(ddir)) {
bw_bytes = 0;
if (!ddir_rw(ddir))
return;
- /* Only print PRIO latencies if some high priority samples were gathered */
- if (ts->clat_high_prio_stat[ddir].samples > 0) {
- const char *high, *low;
+ /* Only include per prio stats if there are >= 2 prios with samples */
+ if (get_nr_prios_with_samples(ts, ddir) >= 2) {
+ struct json_array *array = json_create_array();
+ const char *obj_name;
+ int i;
- if (ts->lat_percentiles) {
- high = "lat_high_prio";
- low = "lat_low_prio";
- } else {
- high = "clat_high_prio";
- low = "clat_low_prio";
- }
+ if (ts->lat_percentiles)
+ obj_name = "lat_ns";
+ else
+ obj_name = "clat_ns";
- tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
- &ts->clat_high_prio_stat[ddir], ts->io_u_plat_high_prio[ddir]);
- json_object_add_value_object(dir_object, high, tmp_object);
+ json_object_add_value_array(dir_object, "prios", array);
- tmp_object = add_ddir_lat_json(ts, ts->clat_percentiles | ts->lat_percentiles,
- &ts->clat_low_prio_stat[ddir], ts->io_u_plat_low_prio[ddir]);
- json_object_add_value_object(dir_object, low, tmp_object);
+ for (i = 0; i < ts->nr_clat_prio[ddir]; i++) {
+ struct json_object *obj;
+
+ if (!ts->clat_prio[ddir][i].clat_stat.samples)
+ continue;
+
+ obj = json_create_object();
+
+ json_object_add_value_int(obj, "prioclass",
+ ioprio_class(ts->clat_prio[ddir][i].ioprio));
+ json_object_add_value_int(obj, "prio",
+ ioprio(ts->clat_prio[ddir][i].ioprio));
+ json_object_add_value_int(obj, "priohint",
+ ioprio_hint(ts->clat_prio[ddir][i].ioprio));
+
+ tmp_object = add_ddir_lat_json(ts,
+ ts->clat_percentiles | ts->lat_percentiles,
+ &ts->clat_prio[ddir][i].clat_stat,
+ ts->clat_prio[ddir][i].io_u_plat);
+ json_object_add_value_object(obj, obj_name, tmp_object);
+ json_array_add_value_object(array, obj);
+ }
}
if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
}
}
+/*
+ * Add the mixed aggregate of 'ts' to the JSON object 'parent'. The aggregate
+ * is built by gen_mixed_ddir_stats_from_ts(), added via
+ * add_ddir_status_json() under DDIR_READ, and released before returning.
+ */
+static void add_mixed_ddir_status_json(struct thread_stat *ts,
+		struct group_run_stats *rs, struct json_object *parent)
+{
+	struct thread_stat *mixed = gen_mixed_ddir_stats_from_ts(ts);
+
+	/* Allocation failed: nothing to add and nothing to release. */
+	if (mixed == NULL)
+		return;
+
+	add_ddir_status_json(mixed, rs, DDIR_READ, parent);
+
+	free_clat_prio_stats(mixed);
+	free(mixed);
+}
+
static void show_thread_status_terse_all(struct thread_stat *ts,
struct group_run_stats *rs, int ver,
struct buf_output *out)
log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
ts->name, ts->groupid, ts->error);
- /* Log Read Status */
+ /* Log Read Status, or mixed if unified_rw_rep = 1 */
show_ddir_status_terse(ts, rs, DDIR_READ, ver, out);
- /* Log Write Status */
- show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
- /* Log Trim Status */
- if (ver == 2 || ver == 4 || ver == 5)
- show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
-
+ if (ts->unified_rw_rep != UNIFIED_MIXED) {
+ /* Log Write Status */
+ show_ddir_status_terse(ts, rs, DDIR_WRITE, ver, out);
+ /* Log Trim Status */
+ if (ver == 2 || ver == 4 || ver == 5)
+ show_ddir_status_terse(ts, rs, DDIR_TRIM, ver, out);
+ }
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ show_mixed_ddir_status_terse(ts, rs, ver, out);
/* CPU Usage */
if (ts->total_run_time) {
double runt = (double) ts->total_run_time;
root = json_create_object();
json_object_add_value_string(root, "jobname", ts->name);
json_object_add_value_int(root, "groupid", ts->groupid);
+ json_object_add_value_int(root, "job_start", ts->job_start);
json_object_add_value_int(root, "error", ts->error);
/* ETA Info */
if (je) {
json_object_add_value_int(root, "eta", je->eta_sec);
json_object_add_value_int(root, "elapsed", je->elapsed_sec);
+ free(je);
}
if (opt_list)
add_ddir_status_json(ts, rs, DDIR_TRIM, root);
add_ddir_status_json(ts, rs, DDIR_SYNC, root);
+ if (ts->unified_rw_rep == UNIFIED_BOTH)
+ add_mixed_ddir_status_json(ts, rs, root);
+
/* CPU Usage */
if (ts->total_run_time) {
double runt = (double) ts->total_run_time;
struct json_array *iops, *bw;
int j, k, l;
char ss_buf[64];
+ int intervals = ts->ss_dur / (ss_check_interval / 1000L);
snprintf(ss_buf, sizeof(ss_buf), "%s%s:%f%s",
ts->ss_state & FIO_SS_IOPS ? "iops" : "bw",
if ((ts->ss_state & FIO_SS_ATTAINED) || !(ts->ss_state & FIO_SS_BUFFER_FULL))
j = ts->ss_head;
else
- j = ts->ss_head == 0 ? ts->ss_dur - 1 : ts->ss_head - 1;
- for (l = 0; l < ts->ss_dur; l++) {
- k = (j + l) % ts->ss_dur;
+ j = ts->ss_head == 0 ? intervals - 1 : ts->ss_head - 1;
+ for (l = 0; l < intervals; l++) {
+ k = (j + l) % intervals;
json_array_add_value_int(bw, ts->ss_bw_data[k]);
json_array_add_value_int(iops, ts->ss_iops_data[k]);
}
* numbers. For group_reporting, we should just add those up, not make
* them the mean of everything.
*/
-static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first,
- bool pure_sum)
+static void sum_stat(struct io_stat *dst, struct io_stat *src, bool pure_sum)
{
+ bool first = dst->samples == 0;
+
if (src->samples == 0)
return;
dst->sig_figs = src->sig_figs;
}
-void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
- bool first)
+/*
+ * Release the clat_prio_stat arrays allocated by alloc_clat_prio_stat_ddir()
+ * for every data direction of 'ts', and reset the matching pointers and
+ * entry counts. A NULL 'ts' is accepted and ignored.
+ */
+void free_clat_prio_stats(struct thread_stat *ts)
+{
+	int d;
+
+	if (ts == NULL)
+		return;
+
+	for (d = 0; d < DDIR_RWDIR_CNT; d++) {
+		sfree(ts->clat_prio[d]);
+		ts->nr_clat_prio[d] = 0;
+		ts->clat_prio[d] = NULL;
+	}
+}
+
+/*
+ * Allocate an array of 'nr_prios' clat_prio_stat entries for direction 'ddir'
+ * of 'ts', with each entry's clat_stat.min_val preset to ULONG_MAX. The array
+ * has to be allocated/freed using smalloc/sfree, so that it is accessible by
+ * the process/thread summing the thread_stats.
+ *
+ * Returns 0 on success, 1 (after logging an error) if the allocation fails.
+ */
+int alloc_clat_prio_stat_ddir(struct thread_stat *ts, enum fio_ddir ddir,
+			      int nr_prios)
+{
+	struct clat_prio_stat *prios;
+	int n;
+
+	prios = scalloc(nr_prios, sizeof(*prios));
+	if (prios == NULL) {
+		log_err("fio: failed to allocate ts clat data\n");
+		return 1;
+	}
+
+	/* Seed every minimum so the first real sample always replaces it. */
+	n = nr_prios;
+	while (n-- > 0)
+		prios[n].clat_stat.min_val = ULONG_MAX;
+
+	ts->nr_clat_prio[ddir] = nr_prios;
+	ts->clat_prio[ddir] = prios;
+
+	return 0;
+}
+
+/*
+ * Grow dst->clat_prio[ddir] by one entry. A new array with room for one more
+ * element is allocated, the existing entries (if any) are copied over, the
+ * old array is released, and the new last entry gets its clat_stat.min_val
+ * preset to ULONG_MAX.
+ *
+ * Returns 0 on success, 1 (after logging an error) if the allocation fails;
+ * on failure the existing array is left untouched.
+ */
+static int grow_clat_prio_stat(struct thread_stat *dst, enum fio_ddir ddir)
+{
+	int curr_len = dst->nr_clat_prio[ddir];
+	void *new_arr;
+
+	new_arr = scalloc(curr_len + 1, sizeof(*dst->clat_prio[ddir]));
+	if (!new_arr) {
+		log_err("fio: failed to grow clat prio array\n");
+		return 1;
+	}
+
+	/*
+	 * The array may not exist yet (NULL pointer, zero entries). Passing a
+	 * NULL source to memcpy() is undefined even for a zero length, so only
+	 * copy when there is an old array to carry over.
+	 */
+	if (dst->clat_prio[ddir])
+		memcpy(new_arr, dst->clat_prio[ddir],
+		       curr_len * sizeof(*dst->clat_prio[ddir]));
+	sfree(dst->clat_prio[ddir]);
+
+	dst->clat_prio[ddir] = new_arr;
+	dst->clat_prio[ddir][curr_len].clat_stat.min_val = ULONG_MAX;
+	dst->nr_clat_prio[ddir]++;
+
+	return 0;
+}
+
+/*
+ * Look up 'ioprio' in dst->clat_prio[ddir]. Returns the index of the first
+ * entry whose ioprio matches, or -1 if there is none.
+ */
+static int find_clat_prio_index(struct thread_stat *dst, enum fio_ddir ddir,
+				uint32_t ioprio)
+{
+	const int nr_prios = dst->nr_clat_prio[ddir];
+	int pos = 0;
+
+	while (pos < nr_prios && dst->clat_prio[ddir][pos].ioprio != ioprio)
+		pos++;
+
+	return pos < nr_prios ? pos : -1;
+}
+
+/*
+ * Store in '*idx' the index of the dst->clat_prio[ddir] entry for 'ioprio',
+ * appending a new entry for it when none exists yet.
+ *
+ * Returns 0 on success, 1 if the array could not be grown (in which case
+ * '*idx' is not written).
+ */
+static int alloc_or_get_clat_prio_index(struct thread_stat *dst,
+					enum fio_ddir ddir, uint32_t ioprio,
+					int *idx)
+{
+	int pos = find_clat_prio_index(dst, ddir, ioprio);
+
+	if (pos != -1) {
+		*idx = pos;
+		return 0;
+	}
+
+	/* Not present yet: the new entry lands at the current end. */
+	pos = dst->nr_clat_prio[ddir];
+	if (grow_clat_prio_stat(dst, ddir))
+		return 1;
+
+	dst->clat_prio[ddir][pos].ioprio = ioprio;
+	*idx = pos;
+
+	return 0;
+}
+
+/*
+ * Give dst a copy of src's clat_prio array: allocate an array of the same
+ * size with smalloc(), memcpy src->clat_prio[src_ddir] into it and store it,
+ * together with the entry count, under dst_ddir.
+ *
+ * Returns 0 on success, 1 (after logging an error) if the allocation fails.
+ */
+static int clat_prio_stats_copy(struct thread_stat *dst, struct thread_stat *src,
+				enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+	struct clat_prio_stat *copy;
+	size_t bytes;
+
+	bytes = src->nr_clat_prio[src_ddir] * sizeof(*src->clat_prio[src_ddir]);
+
+	copy = smalloc(bytes);
+	dst->clat_prio[dst_ddir] = copy;
+	if (copy == NULL) {
+		log_err("fio: failed to alloc clat prio array\n");
+		return 1;
+	}
+
+	memcpy(copy, src->clat_prio[src_ddir], bytes);
+	dst->nr_clat_prio[dst_ddir] = src->nr_clat_prio[src_ddir];
+
+	return 0;
+}
+
+/*
+ * Fold the samples described by 'io_stat' and the histogram 'io_u_plat' into
+ * the dst->clat_prio[dst_ddir] entry for 'ioprio', creating that entry if
+ * needed. Nothing is done when 'io_stat' holds no samples.
+ *
+ * Returns 0 on success, 1 if the entry could not be allocated.
+ */
+static int clat_prio_stat_add_samples(struct thread_stat *dst,
+				      enum fio_ddir dst_ddir, uint32_t ioprio,
+				      struct io_stat *io_stat,
+				      uint64_t *io_u_plat)
+{
+	struct clat_prio_stat *slot;
+	int slot_idx, bucket;
+
+	if (io_stat->samples == 0)
+		return 0;
+
+	if (alloc_or_get_clat_prio_index(dst, dst_ddir, ioprio, &slot_idx))
+		return 1;
+
+	slot = &dst->clat_prio[dst_ddir][slot_idx];
+
+	sum_stat(&slot->clat_stat, io_stat, false);
+
+	for (bucket = 0; bucket < FIO_IO_U_PLAT_NR; bucket++)
+		slot->io_u_plat[bucket] += io_u_plat[bucket];
+
+	return 0;
+}
+
+/*
+ * Sum a src ts that has no clat_prio_stat array into dst's per prio stats.
+ *
+ * Without such an array, all I/Os of src were submitted using src->ioprio,
+ * so the global samples in src->lat_stat (when lat_percentiles is set) or
+ * src->clat_stat (otherwise) serve as the 'per prio' samples for src->ioprio.
+ *
+ * Returns the result of clat_prio_stat_add_samples().
+ */
+static int sum_clat_prio_stats_src_single_prio(struct thread_stat *dst,
+					       struct thread_stat *src,
+					       enum fio_ddir dst_ddir,
+					       enum fio_ddir src_ddir)
+{
+	struct io_stat *samples;
+	uint64_t *histogram;
+
+	assert(!src->clat_prio[src_ddir]);
+	assert(src->nr_clat_prio[src_ddir] == 0);
+
+	samples = src->lat_percentiles ? &src->lat_stat[src_ddir] :
+					 &src->clat_stat[src_ddir];
+	histogram = src->lat_percentiles ?
+			src->io_u_plat[FIO_LAT][src_ddir] :
+			src->io_u_plat[FIO_CLAT][src_ddir];
+
+	return clat_prio_stat_add_samples(dst, dst_ddir, src->ioprio, samples,
+					  histogram);
+}
+
+/*
+ * Sum a src ts that has a clat_prio_stat array into dst's per prio stats.
+ *
+ * A src array means multiple prios are in use (i.e. src ts had
+ * cmdprio_percentage or cmdprio_bssplit set); the samples for the default
+ * prio live in that array just like those of any other prio.
+ *
+ * Returns 0 on success, 1 if an allocation failed.
+ */
+static int sum_clat_prio_stats_src_multi_prio(struct thread_stat *dst,
+					      struct thread_stat *src,
+					      enum fio_ddir dst_ddir,
+					      enum fio_ddir src_ddir)
+{
+	struct clat_prio_stat *entry;
+	int i;
+
+	assert(src->clat_prio[src_ddir]);
+	assert(src->nr_clat_prio[src_ddir]);
+
+	/* dst has no array yet: a plain copy of src's array is enough. */
+	if (dst->clat_prio[dst_ddir] == NULL)
+		return clat_prio_stats_copy(dst, src, dst_ddir, src_ddir);
+
+	/* dst already has an array: merge each src entry into it. */
+	for (i = 0; i < src->nr_clat_prio[src_ddir]; i++) {
+		entry = &src->clat_prio[src_ddir][i];
+
+		if (clat_prio_stat_add_samples(dst, dst_ddir, entry->ioprio,
+					       &entry->clat_stat,
+					       entry->io_u_plat))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Add src's per prio stats for 'src_ddir' into dst's stats for 'dst_ddir'.
+ * Does nothing when dst->disable_prio_stat is set; otherwise dispatches on
+ * whether src has a clat_prio array for that direction.
+ *
+ * Returns 0 on success (or when disabled), non-zero on failure.
+ */
+static int sum_clat_prio_stats(struct thread_stat *dst, struct thread_stat *src,
+			       enum fio_ddir dst_ddir, enum fio_ddir src_ddir)
+{
+	if (dst->disable_prio_stat)
+		return 0;
+
+	if (src->clat_prio[src_ddir])
+		return sum_clat_prio_stats_src_multi_prio(dst, src, dst_ddir,
+							  src_ddir);
+
+	return sum_clat_prio_stats_src_single_prio(dst, src, dst_ddir,
+						   src_ddir);
+}
+
+void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src)
{
int k, l, m;
for (l = 0; l < DDIR_RWDIR_CNT; l++) {
- if (!dst->unified_rw_rep) {
- sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first, false);
- sum_stat(&dst->clat_high_prio_stat[l], &src->clat_high_prio_stat[l], first, false);
- sum_stat(&dst->clat_low_prio_stat[l], &src->clat_low_prio_stat[l], first, false);
- sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first, false);
- sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first, false);
- sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first, true);
- sum_stat(&dst->iops_stat[l], &src->iops_stat[l], first, true);
+ if (dst->unified_rw_rep != UNIFIED_MIXED) {
+ sum_stat(&dst->clat_stat[l], &src->clat_stat[l], false);
+ sum_stat(&dst->slat_stat[l], &src->slat_stat[l], false);
+ sum_stat(&dst->lat_stat[l], &src->lat_stat[l], false);
+ sum_stat(&dst->bw_stat[l], &src->bw_stat[l], true);
+ sum_stat(&dst->iops_stat[l], &src->iops_stat[l], true);
+ sum_clat_prio_stats(dst, src, l, l);
dst->io_bytes[l] += src->io_bytes[l];
if (dst->runtime[l] < src->runtime[l])
dst->runtime[l] = src->runtime[l];
} else {
- sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first, false);
- sum_stat(&dst->clat_high_prio_stat[0], &src->clat_high_prio_stat[l], first, false);
- sum_stat(&dst->clat_low_prio_stat[0], &src->clat_low_prio_stat[l], first, false);
- sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first, false);
- sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first, false);
- sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first, true);
- sum_stat(&dst->iops_stat[0], &src->iops_stat[l], first, true);
+ sum_stat(&dst->clat_stat[0], &src->clat_stat[l], false);
+ sum_stat(&dst->slat_stat[0], &src->slat_stat[l], false);
+ sum_stat(&dst->lat_stat[0], &src->lat_stat[l], false);
+ sum_stat(&dst->bw_stat[0], &src->bw_stat[l], true);
+ sum_stat(&dst->iops_stat[0], &src->iops_stat[l], true);
+ sum_clat_prio_stats(dst, src, 0, l);
dst->io_bytes[0] += src->io_bytes[l];
if (dst->runtime[0] < src->runtime[l])
dst->runtime[0] = src->runtime[l];
-
- /*
- * We're summing to the same destination, so override
- * 'first' after the first iteration of the loop
- */
- first = false;
}
}
- sum_stat(&dst->sync_stat, &src->sync_stat, first, false);
+ sum_stat(&dst->sync_stat, &src->sync_stat, false);
dst->usr_time += src->usr_time;
dst->sys_time += src->sys_time;
dst->ctx += src->ctx;
dst->io_u_lat_m[k] += src->io_u_lat_m[k];
for (k = 0; k < DDIR_RWDIR_CNT; k++) {
- if (!dst->unified_rw_rep) {
+ if (dst->unified_rw_rep != UNIFIED_MIXED) {
dst->total_io_u[k] += src->total_io_u[k];
dst->short_io_u[k] += src->short_io_u[k];
dst->drop_io_u[k] += src->drop_io_u[k];
for (k = 0; k < FIO_LAT_CNT; k++)
for (l = 0; l < DDIR_RWDIR_CNT; l++)
for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
- if (!dst->unified_rw_rep)
+ if (dst->unified_rw_rep != UNIFIED_MIXED)
dst->io_u_plat[k][l][m] += src->io_u_plat[k][l][m];
else
dst->io_u_plat[k][0][m] += src->io_u_plat[k][l][m];
for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
dst->io_u_sync_plat[k] += src->io_u_sync_plat[k];
- for (k = 0; k < DDIR_RWDIR_CNT; k++) {
- for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
- if (!dst->unified_rw_rep) {
- dst->io_u_plat_high_prio[k][m] += src->io_u_plat_high_prio[k][m];
- dst->io_u_plat_low_prio[k][m] += src->io_u_plat_low_prio[k][m];
- } else {
- dst->io_u_plat_high_prio[0][m] += src->io_u_plat_high_prio[k][m];
- dst->io_u_plat_low_prio[0][m] += src->io_u_plat_low_prio[k][m];
- }
-
- }
- }
-
dst->total_run_time += src->total_run_time;
dst->total_submit += src->total_submit;
dst->total_complete += src->total_complete;
gs->min_bw[i] = gs->min_run[i] = ~0UL;
}
-void init_thread_stat(struct thread_stat *ts)
+void init_thread_stat_min_vals(struct thread_stat *ts)
{
- int j;
+ int i;
+ for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+ ts->clat_stat[i].min_val = ULONG_MAX;
+ ts->slat_stat[i].min_val = ULONG_MAX;
+ ts->lat_stat[i].min_val = ULONG_MAX;
+ ts->bw_stat[i].min_val = ULONG_MAX;
+ ts->iops_stat[i].min_val = ULONG_MAX;
+ }
+ ts->sync_stat.min_val = ULONG_MAX;
+}
+
+void init_thread_stat(struct thread_stat *ts)
+{
memset(ts, 0, sizeof(*ts));
- for (j = 0; j < DDIR_RWDIR_CNT; j++) {
- ts->lat_stat[j].min_val = -1UL;
- ts->clat_stat[j].min_val = -1UL;
- ts->slat_stat[j].min_val = -1UL;
- ts->bw_stat[j].min_val = -1UL;
- ts->iops_stat[j].min_val = -1UL;
- ts->clat_high_prio_stat[j].min_val = -1UL;
- ts->clat_low_prio_stat[j].min_val = -1UL;
- }
- ts->sync_stat.min_val = -1UL;
+ init_thread_stat_min_vals(ts);
ts->groupid = -1;
}
+/*
+ * Set ts->disable_prio_stat for each of the 'nr_ts' destination thread_stats
+ * in 'threadstats', and record in ts->ioprio the priority of the first td
+ * that maps to it.
+ *
+ * Per prio stats stay enabled for a destination only if some td mapping to
+ * it either uses a different ioprio than the first td of that destination or
+ * carries a clat_prio array of its own.
+ */
+static void init_per_prio_stats(struct thread_stat *threadstats, int nr_ts)
+{
+	struct thread_stat *ts;
+	int slot = 0, prev_groupid = -1, nr_in_slot = 0;
+	enum fio_ddir ddir;
+	int i;
+
+	/*
+	 * Loop through all tds, if a td requires per prio stats, temporarily
+	 * store a 1 in ts->disable_prio_stat, and then do an additional
+	 * loop at the end where we invert the ts->disable_prio_stat values.
+	 */
+	for_each_td(td) {
+		if (!td->o.stats)
+			continue;
+
+		/* Advance to the next destination when this td starts one. */
+		if (nr_in_slot &&
+		    (!td->o.group_reporting || prev_groupid != td->groupid)) {
+			nr_in_slot = 0;
+			slot++;
+		}
+
+		prev_groupid = td->groupid;
+		ts = &threadstats[slot];
+
+		/*
+		 * nr_in_slot == 0 means first td in group, or td is not in a
+		 * group.
+		 */
+		if (nr_in_slot == 0) {
+			ts->ioprio = td->ioprio;
+		} else if (td->ioprio != ts->ioprio) {
+			ts->disable_prio_stat = 1;
+		}
+
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			if (td->ts.clat_prio[ddir]) {
+				ts->disable_prio_stat = 1;
+				break;
+			}
+		}
+
+		nr_in_slot++;
+	} end_for_each();
+
+	/* Loop through all dst threadstats and fixup the values. */
+	for (i = 0; i < nr_ts; i++)
+		threadstats[i].disable_prio_stat =
+			!threadstats[i].disable_prio_stat;
+}
+
void __show_run_stats(void)
{
struct group_run_stats *runstats, *rs;
- struct thread_data *td;
struct thread_stat *threadstats, *ts;
int i, j, k, nr_ts, last_ts, idx;
bool kb_base_warned = false;
*/
nr_ts = 0;
last_ts = -1;
- for_each_td(td, i) {
+ for_each_td(td) {
if (!td->o.group_reporting) {
nr_ts++;
continue;
last_ts = td->groupid;
nr_ts++;
- }
+ } end_for_each();
threadstats = malloc(nr_ts * sizeof(struct thread_stat));
opt_lists = malloc(nr_ts * sizeof(struct flist_head *));
opt_lists[i] = NULL;
}
+ init_per_prio_stats(threadstats, nr_ts);
+
j = 0;
last_ts = -1;
idx = 0;
- for_each_td(td, i) {
+ for_each_td(td) {
if (!td->o.stats)
continue;
if (idx && (!td->o.group_reporting ||
opt_lists[j] = &td->opt_list;
idx++;
- ts->members++;
if (ts->groupid == -1) {
/*
*/
ts->thread_number = td->thread_number;
ts->groupid = td->groupid;
+ ts->job_start = td->job_start;
/*
* first pid in group, not very useful...
for (k = 0; k < ts->nr_block_infos; k++)
ts->block_infos[k] = td->ts.block_infos[k];
- sum_thread_stats(ts, &td->ts, idx == 1);
+ sum_thread_stats(ts, &td->ts);
+
+ ts->members++;
if (td->o.ss_dur) {
ts->ss_state = td->ss.state;
}
else
ts->ss_dur = ts->ss_state = 0;
- }
+ } end_for_each();
for (i = 0; i < nr_ts; i++) {
unsigned long long bw;
rs->kb_base = ts->kb_base;
rs->unit_base = ts->unit_base;
rs->sig_figs = ts->sig_figs;
- rs->unified_rw_rep += ts->unified_rw_rep;
+ rs->unified_rw_rep |= ts->unified_rw_rep;
for (j = 0; j < DDIR_RWDIR_CNT; j++) {
if (!ts->runtime[j])
}
for (i = 0; i < groupid + 1; i++) {
- int ddir;
+ enum fio_ddir ddir;
rs = &runstats[i];
log_info_flush();
free(runstats);
+
+ /* free arrays allocated by sum_thread_stats(), if any */
+ for (i = 0; i < nr_ts; i++) {
+ ts = &threadstats[i];
+ free_clat_prio_stats(ts);
+ }
free(threadstats);
free(opt_lists);
}
int __show_running_run_stats(void)
{
- struct thread_data *td;
unsigned long long *rt;
struct timespec ts;
- int i;
fio_sem_down(stat_sem);
rt = malloc(thread_number * sizeof(unsigned long long));
fio_gettime(&ts, NULL);
- for_each_td(td, i) {
+ for_each_td(td) {
+ if (td->runstate >= TD_EXITED)
+ continue;
+
td->update_rusage = 1;
for_each_rw_ddir(ddir) {
td->ts.io_bytes[ddir] = td->io_bytes[ddir];
}
td->ts.total_run_time = mtime_since(&td->epoch, &ts);
- rt[i] = mtime_since(&td->start, &ts);
+ rt[__td_index] = mtime_since(&td->start, &ts);
if (td_read(td) && td->ts.io_bytes[DDIR_READ])
- td->ts.runtime[DDIR_READ] += rt[i];
+ td->ts.runtime[DDIR_READ] += rt[__td_index];
if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
- td->ts.runtime[DDIR_WRITE] += rt[i];
+ td->ts.runtime[DDIR_WRITE] += rt[__td_index];
if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM])
- td->ts.runtime[DDIR_TRIM] += rt[i];
- }
+ td->ts.runtime[DDIR_TRIM] += rt[__td_index];
+ } end_for_each();
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->runstate >= TD_EXITED)
continue;
if (td->rusage_sem) {
fio_sem_down(td->rusage_sem);
}
td->update_rusage = 0;
- }
+ } end_for_each();
__show_run_stats();
- for_each_td(td, i) {
+ for_each_td(td) {
+ if (td->runstate >= TD_EXITED)
+ continue;
+
if (td_read(td) && td->ts.io_bytes[DDIR_READ])
- td->ts.runtime[DDIR_READ] -= rt[i];
+ td->ts.runtime[DDIR_READ] -= rt[__td_index];
if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
- td->ts.runtime[DDIR_WRITE] -= rt[i];
+ td->ts.runtime[DDIR_WRITE] -= rt[__td_index];
if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM])
- td->ts.runtime[DDIR_TRIM] -= rt[i];
- }
+ td->ts.runtime[DDIR_TRIM] -= rt[__td_index];
+ } end_for_each();
free(rt);
fio_sem_up(stat_sem);
is->samples++;
}
+/*
+ * Account one latency sample against a single per-priority entry.
+ *
+ * 'clat_prio' is a per-priority array and may be NULL, in which case the
+ * sample is silently dropped. Otherwise 'nsec' is added to the clat_stat
+ * of the entry at 'clat_prio_index'; the index is not range checked here.
+ */
+static inline void add_stat_prio_sample(struct clat_prio_stat *clat_prio,
+					unsigned short clat_prio_index,
+					unsigned long long nsec)
+{
+	struct clat_prio_stat *entry;
+
+	if (!clat_prio)
+		return;
+
+	entry = clat_prio + clat_prio_index;
+	add_stat_sample(&entry->clat_stat, nsec);
+}
+
/*
* Return a struct io_logs, which is added to the tail of the log
* list for 'iolog'.
*/
static struct io_logs *get_new_log(struct io_log *iolog)
{
- size_t new_size, new_samples;
+ size_t new_samples;
struct io_logs *cur_log;
/*
* Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling
* forever
*/
- if (!iolog->cur_log_max)
- new_samples = DEF_LOG_ENTRIES;
- else {
+ if (!iolog->cur_log_max) {
+ if (iolog->td)
+ new_samples = iolog->td->o.log_entries;
+ else
+ new_samples = DEF_LOG_ENTRIES;
+ } else {
new_samples = iolog->cur_log_max * 2;
if (new_samples > MAX_LOG_ENTRIES)
new_samples = MAX_LOG_ENTRIES;
}
- new_size = new_samples * log_entry_sz(iolog);
-
cur_log = smalloc(sizeof(*cur_log));
if (cur_log) {
INIT_FLIST_HEAD(&cur_log->list);
- cur_log->log = malloc(new_size);
+ cur_log->log = calloc(new_samples, log_entry_sz(iolog));
if (cur_log->log) {
cur_log->nr_samples = 0;
cur_log->max_samples = new_samples;
td->flags &= ~TD_F_REGROW_LOGS;
}
+/*
+ * Call regrow_log() on the aggregate io log of every data direction
+ * below DDIR_RWDIR_CNT.
+ */
+void regrow_agg_logs(void)
+{
+	enum fio_ddir ddir = 0;
+
+	while (ddir < DDIR_RWDIR_CNT) {
+		regrow_log(agg_io_log[ddir]);
+		ddir++;
+	}
+}
+
static struct io_logs *get_cur_log(struct io_log *iolog)
{
struct io_logs *cur_log;
static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
enum fio_ddir ddir, unsigned long long bs,
- unsigned long t, uint64_t offset, uint8_t priority_bit)
+ unsigned long t, uint64_t offset,
+ unsigned int priority)
{
struct io_logs *cur_log;
s = get_sample(iolog, cur_log, cur_log->nr_samples);
s->data = data;
- s->time = t + (iolog->td ? iolog->td->unix_epoch : 0);
+ s->time = t;
+ if (iolog->td && iolog->td->o.log_alternate_epoch)
+ s->time += iolog->td->alternate_epoch;
io_sample_set_ddir(iolog, s, ddir);
s->bs = bs;
- s->priority_bit = priority_bit;
+ s->priority = priority;
if (iolog->log_offset) {
struct io_sample_offset *so = (void *) s;
ios->mean.u.f = ios->S.u.f = 0;
}
+/* Zero all FIO_IO_U_PLAT_NR bucket counters starting at 'io_u_plat'. */
+static inline void reset_io_u_plat(uint64_t *io_u_plat)
+{
+	uint64_t *bucket = io_u_plat;
+	uint64_t *const end = io_u_plat + FIO_IO_U_PLAT_NR;
+
+	while (bucket != end)
+		*bucket++ = 0;
+}
+
+/*
+ * Clear the per-priority stats of 'ts'.
+ *
+ * For each data direction that has a clat_prio array, reset both the
+ * io_stat and the bucket counters of every one of its
+ * nr_clat_prio[ddir] entries. Directions without an array are skipped.
+ */
+static inline void reset_clat_prio_stats(struct thread_stat *ts)
+{
+	enum fio_ddir ddir;
+	int i;
+
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+		struct clat_prio_stat *prio = ts->clat_prio[ddir];
+
+		if (!prio)
+			continue;
+
+		for (i = 0; i < ts->nr_clat_prio[ddir]; i++, prio++) {
+			reset_io_stat(&prio->clat_stat);
+			reset_io_u_plat(prio->io_u_plat);
+		}
+	}
+}
+
void reset_io_stats(struct thread_data *td)
{
struct thread_stat *ts = &td->ts;
- int i, j, k;
+ int i, j;
for (i = 0; i < DDIR_RWDIR_CNT; i++) {
- reset_io_stat(&ts->clat_high_prio_stat[i]);
- reset_io_stat(&ts->clat_low_prio_stat[i]);
reset_io_stat(&ts->clat_stat[i]);
reset_io_stat(&ts->slat_stat[i]);
reset_io_stat(&ts->lat_stat[i]);
ts->total_io_u[i] = 0;
ts->short_io_u[i] = 0;
ts->drop_io_u[i] = 0;
-
- for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
- ts->io_u_plat_high_prio[i][j] = 0;
- ts->io_u_plat_low_prio[i][j] = 0;
- if (!i)
- ts->io_u_sync_plat[j] = 0;
- }
}
for (i = 0; i < FIO_LAT_CNT; i++)
for (j = 0; j < DDIR_RWDIR_CNT; j++)
- for (k = 0; k < FIO_IO_U_PLAT_NR; k++)
- ts->io_u_plat[i][j][k] = 0;
+ reset_io_u_plat(ts->io_u_plat[i][j]);
+
+ reset_clat_prio_stats(ts);
ts->total_io_u[DDIR_SYNC] = 0;
+ reset_io_u_plat(ts->io_u_sync_plat);
for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
ts->io_u_map[i] = 0;
}
static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
- unsigned long elapsed, bool log_max, uint8_t priority_bit)
+ unsigned long elapsed, int log_max)
{
/*
* Note an entry in the log. Use the mean from the logged samples,
if (iolog->avg_window[ddir].samples) {
union io_sample_data data;
- if (log_max)
- data.val = iolog->avg_window[ddir].max_val;
- else
- data.val = iolog->avg_window[ddir].mean.u.f + 0.50;
+ if (log_max == IO_LOG_SAMPLE_AVG) {
+ data.val.val0 = iolog->avg_window[ddir].mean.u.f + 0.50;
+ data.val.val1 = 0;
+ } else if (log_max == IO_LOG_SAMPLE_MAX) {
+ data.val.val0 = iolog->avg_window[ddir].max_val;
+ data.val.val1 = 0;
+ } else {
+ data.val.val0 = iolog->avg_window[ddir].mean.u.f + 0.50;
+ data.val.val1 = iolog->avg_window[ddir].max_val;
+ }
- __add_log_sample(iolog, data, ddir, 0, elapsed, 0, priority_bit);
+ __add_log_sample(iolog, data, ddir, 0, elapsed, 0, 0);
}
reset_io_stat(&iolog->avg_window[ddir]);
}
static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
- bool log_max, uint8_t priority_bit)
+ int log_max)
{
- int ddir;
+ enum fio_ddir ddir;
for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
- __add_stat_to_log(iolog, ddir, elapsed, log_max, priority_bit);
+ __add_stat_to_log(iolog, ddir, elapsed, log_max);
}
static unsigned long add_log_sample(struct thread_data *td,
struct io_log *iolog,
union io_sample_data data,
enum fio_ddir ddir, unsigned long long bs,
- uint64_t offset, uint8_t priority_bit)
+ uint64_t offset, unsigned int ioprio)
{
unsigned long elapsed, this_window;
* If no time averaging, just add the log sample.
*/
if (!iolog->avg_msec) {
- __add_log_sample(iolog, data, ddir, bs, elapsed, offset, priority_bit);
+ __add_log_sample(iolog, data, ddir, bs, elapsed, offset,
+ ioprio);
return 0;
}
* Add the sample. If the time period has passed, then
* add that entry to the log and clear.
*/
- add_stat_sample(&iolog->avg_window[ddir], data.val);
+ add_stat_sample(&iolog->avg_window[ddir], data.val.val0);
/*
* If period hasn't passed, adding the above sample is all we
return diff;
}
- __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max != 0, priority_bit);
+ __add_stat_to_log(iolog, ddir, elapsed, td->o.log_max);
+
+ iolog->avg_last[ddir] = elapsed - (elapsed % iolog->avg_msec);
- iolog->avg_last[ddir] = elapsed - (this_window - iolog->avg_msec);
return iolog->avg_msec;
}
elapsed = mtime_since_now(&td->epoch);
if (td->clat_log && unit_logs)
- _add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0, 0);
+ _add_stat_to_log(td->clat_log, elapsed, td->o.log_max);
if (td->slat_log && unit_logs)
- _add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0, 0);
+ _add_stat_to_log(td->slat_log, elapsed, td->o.log_max);
if (td->lat_log && unit_logs)
- _add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0, 0);
+ _add_stat_to_log(td->lat_log, elapsed, td->o.log_max);
if (td->bw_log && (unit_logs == per_unit_log(td->bw_log)))
- _add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0, 0);
+ _add_stat_to_log(td->bw_log, elapsed, td->o.log_max);
if (td->iops_log && (unit_logs == per_unit_log(td->iops_log)))
- _add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0, 0);
+ _add_stat_to_log(td->iops_log, elapsed, td->o.log_max);
}
-void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned long long bs,
- uint8_t priority_bit)
+void add_agg_sample(union io_sample_data data, enum fio_ddir ddir,
+ unsigned long long bs)
{
struct io_log *iolog;
return;
iolog = agg_io_log[ddir];
- __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, priority_bit);
+ __add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0, 0);
}
void add_sync_clat_sample(struct thread_stat *ts, unsigned long long nsec)
add_stat_sample(&ts->sync_stat, nsec);
}
-static void add_lat_percentile_sample_noprio(struct thread_stat *ts,
- unsigned long long nsec, enum fio_ddir ddir, enum fio_lat lat)
+static inline void add_lat_percentile_sample(struct thread_stat *ts,
+ unsigned long long nsec,
+ enum fio_ddir ddir,
+ enum fio_lat lat)
{
unsigned int idx = plat_val_to_idx(nsec);
assert(idx < FIO_IO_U_PLAT_NR);
ts->io_u_plat[lat][ddir][idx]++;
}
-static void add_lat_percentile_sample(struct thread_stat *ts,
- unsigned long long nsec, enum fio_ddir ddir, uint8_t priority_bit,
- enum fio_lat lat)
+static inline void
+add_lat_percentile_prio_sample(struct thread_stat *ts, unsigned long long nsec,
+ enum fio_ddir ddir,
+ unsigned short clat_prio_index)
{
unsigned int idx = plat_val_to_idx(nsec);
- add_lat_percentile_sample_noprio(ts, nsec, ddir, lat);
-
- if (!priority_bit)
- ts->io_u_plat_low_prio[ddir][idx]++;
- else
- ts->io_u_plat_high_prio[ddir][idx]++;
+ if (ts->clat_prio[ddir])
+ ts->clat_prio[ddir][clat_prio_index].io_u_plat[idx]++;
}
void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
unsigned long long nsec, unsigned long long bs,
- uint64_t offset, uint8_t priority_bit)
+ uint64_t offset, unsigned int ioprio,
+ unsigned short clat_prio_index)
{
const bool needs_lock = td_async_processing(td);
unsigned long elapsed, this_window;
add_stat_sample(&ts->clat_stat[ddir], nsec);
- if (!ts->lat_percentiles) {
- if (priority_bit)
- add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
- else
- add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
- }
+ /*
+ * When lat_percentiles=1 (default 0), the reported per priority
+ * percentiles and stats are used for describing total latency values,
+ * even though the variable names themselves start with clat_.
+ *
+ * Because of the above definition, add a prio stat sample only when
+ * lat_percentiles=0. add_lat_sample() will add the prio stat sample
+ * when lat_percentiles=1.
+ */
+ if (!ts->lat_percentiles)
+ add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+ nsec);
if (td->clat_log)
add_log_sample(td, td->clat_log, sample_val(nsec), ddir, bs,
- offset, priority_bit);
+ offset, ioprio);
if (ts->clat_percentiles) {
- if (ts->lat_percentiles)
- add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_CLAT);
- else
- add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_CLAT);
+ /*
+ * Because of the above definition, add a prio lat percentile
+ * sample only when lat_percentiles=0. add_lat_sample() will add
+ * the prio lat percentile sample when lat_percentiles=1.
+ */
+ add_lat_percentile_sample(ts, nsec, ddir, FIO_CLAT);
+ if (!ts->lat_percentiles)
+ add_lat_percentile_prio_sample(ts, nsec, ddir,
+ clat_prio_index);
}
if (iolog && iolog->hist_msec) {
FIO_IO_U_PLAT_NR * sizeof(uint64_t));
flist_add(&dst->list, &hw->list);
__add_log_sample(iolog, sample_plat(dst), ddir, bs,
- elapsed, offset, priority_bit);
+ elapsed, offset, ioprio);
/*
* Update the last time we recorded as being now, minus
}
void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
- unsigned long long nsec, unsigned long long bs, uint64_t offset,
- uint8_t priority_bit)
+ unsigned long long nsec, unsigned long long bs,
+ uint64_t offset, unsigned int ioprio)
{
const bool needs_lock = td_async_processing(td);
struct thread_stat *ts = &td->ts;
add_stat_sample(&ts->slat_stat[ddir], nsec);
if (td->slat_log)
- add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs, offset,
- priority_bit);
+ add_log_sample(td, td->slat_log, sample_val(nsec), ddir, bs,
+ offset, ioprio);
if (ts->slat_percentiles)
- add_lat_percentile_sample_noprio(ts, nsec, ddir, FIO_SLAT);
+ add_lat_percentile_sample(ts, nsec, ddir, FIO_SLAT);
if (needs_lock)
__td_io_u_unlock(td);
void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
unsigned long long nsec, unsigned long long bs,
- uint64_t offset, uint8_t priority_bit)
+ uint64_t offset, unsigned int ioprio,
+ unsigned short clat_prio_index)
{
const bool needs_lock = td_async_processing(td);
struct thread_stat *ts = &td->ts;
if (td->lat_log)
add_log_sample(td, td->lat_log, sample_val(nsec), ddir, bs,
- offset, priority_bit);
+ offset, ioprio);
+ /*
+ * When lat_percentiles=1 (default 0), the reported per priority
+ * percentiles and stats are used for describing total latency values,
+ * even though the variable names themselves start with clat_.
+ *
+ * Because of the above definition, add a prio stat and prio lat
+ * percentile sample only when lat_percentiles=1. add_clat_sample() will
+ * add the prio stat and prio lat percentile sample when
+ * lat_percentiles=0.
+ */
if (ts->lat_percentiles) {
- add_lat_percentile_sample(ts, nsec, ddir, priority_bit, FIO_LAT);
- if (priority_bit)
- add_stat_sample(&ts->clat_high_prio_stat[ddir], nsec);
- else
- add_stat_sample(&ts->clat_low_prio_stat[ddir], nsec);
-
+ add_lat_percentile_sample(ts, nsec, ddir, FIO_LAT);
+ add_lat_percentile_prio_sample(ts, nsec, ddir, clat_prio_index);
+ add_stat_prio_sample(ts->clat_prio[ddir], clat_prio_index,
+ nsec);
}
if (needs_lock)
__td_io_u_unlock(td);
if (td->bw_log)
add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir,
- bytes, io_u->offset, io_u_is_prio(io_u));
+ bytes, io_u->offset, io_u->ioprio);
td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir];
next_log = avg_time;
spent = mtime_since(parent_tv, t);
- if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK)
+ if (spent < avg_time && avg_time - spent > LOG_MSEC_SLACK)
return avg_time - spent;
if (needs_lock)
if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
bs = td->o.min_bs[ddir];
- next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0, 0);
+ next = add_log_sample(td, log, sample_val(rate), ddir,
+ bs, 0, 0);
next_log = min(next_log, next);
}
if (td->iops_log)
add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir,
- bytes, io_u->offset, io_u_is_prio(io_u));
+ bytes, io_u->offset, io_u->ioprio);
td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir];
td->ts.iops_stat, td->iops_log, false);
}
+/*
+ * Tell whether 'td' is currently in a state for which samples are logged.
+ *
+ * A thread still in its ramp time never is. Otherwise only the run states
+ * TD_RUNNING, TD_VERIFYING, TD_FINISHING and TD_EXITED qualify.
+ */
+static bool td_in_logging_state(struct thread_data *td)
+{
+	bool logging = false;
+
+	if (!in_ramp_time(td)) {
+		switch (td->runstate) {
+		case TD_RUNNING:
+		case TD_VERIFYING:
+		case TD_FINISHING:
+		case TD_EXITED:
+			logging = true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return logging;
+}
+
/*
* Returns msecs to next event
*/
int calc_log_samples(void)
{
- struct thread_data *td;
- unsigned int next = ~0U, tmp;
+ unsigned int next = ~0U, tmp = 0, next_mod = 0, log_avg_msec_min = -1U;
struct timespec now;
- int i;
+ long elapsed_time = 0;
- fio_gettime(&now, NULL);
+ for_each_td(td) {
+ fio_gettime(&now, NULL);
+ elapsed_time = mtime_since(&td->epoch, &now);
- for_each_td(td, i) {
if (!td->o.stats)
continue;
- if (in_ramp_time(td) ||
- !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) {
+ if (!td_in_logging_state(td)) {
next = min(td->o.iops_avg_time, td->o.bw_avg_time);
continue;
}
if (!td->bw_log ||
(td->bw_log && !per_unit_log(td->bw_log))) {
tmp = add_bw_samples(td, &now);
- if (tmp < next)
- next = tmp;
+
+ if (td->bw_log)
+ log_avg_msec_min = min(log_avg_msec_min, (unsigned int)td->bw_log->avg_msec);
}
if (!td->iops_log ||
(td->iops_log && !per_unit_log(td->iops_log))) {
tmp = add_iops_samples(td, &now);
- if (tmp < next)
- next = tmp;
+
+ if (td->iops_log)
+ log_avg_msec_min = min(log_avg_msec_min, (unsigned int)td->iops_log->avg_msec);
}
- }
+
+ if (tmp < next)
+ next = tmp;
+ } end_for_each();
+
+ /* if log_avg_msec_min has not been changed, set it to 0 */
+ if (log_avg_msec_min == -1U)
+ log_avg_msec_min = 0;
+
+ if (log_avg_msec_min == 0)
+ next_mod = elapsed_time;
+ else
+ next_mod = elapsed_time % log_avg_msec_min;
+
+ /* correction to keep the time on the log avg msec boundary */
+ next = min(next, (log_avg_msec_min - next_mod));
return next == ~0U ? 0 : next;
}
*
* FIO_IO_U_PLAT_GROUP_NR and FIO_IO_U_PLAT_BITS determine the memory
* requirement of storing those aggregate counts. The memory used will
- * be (FIO_IO_U_PLAT_GROUP_NR * 2^FIO_IO_U_PLAT_BITS) * sizeof(int)
+ * be (FIO_IO_U_PLAT_GROUP_NR * 2^FIO_IO_U_PLAT_BITS) * sizeof(uint64_t)
* bytes.
*
* FIO_IO_U_PLAT_NR is the total number of buckets.
* than one. This method has low accuracy when the value is small. For
* example, let the buckets be {[0,99],[100,199],...,[900,999]}, and
* the represented value of each bucket be the mean of the range. Then
- * a value 0 has an round-off error of 49.5. To improve on this, we
+ * a value 0 has a round-off error of 49.5. To improve on this, we
* use buckets with non-uniform ranges, while bounding the error of
* each bucket within a ratio of the sample value. A simple example
* would be when error_bound = 0.005, buckets are {
BLOCK_STATE_COUNT,
};
-#define MAX_PATTERN_SIZE 512
#define FIO_JOBNAME_SIZE 128
#define FIO_JOBDESC_SIZE 256
#define FIO_VERROR_SIZE 128
+#define UNIFIED_SPLIT 0
+#define UNIFIED_MIXED 1
+#define UNIFIED_BOTH 2
enum fio_lat {
FIO_SLAT = 0,
FIO_LAT_CNT = 3,
};
+/*
+ * Latency accounting for one entry of a thread_stat's clat_prio[ddir]
+ * array.
+ */
+struct clat_prio_stat {
+	/* bucket counters, indexed by plat_val_to_idx(nsec) */
+	uint64_t io_u_plat[FIO_IO_U_PLAT_NR];
+	/* running stats, fed through add_stat_sample() */
+	struct io_stat clat_stat;
+	/* NOTE(review): presumably the I/O priority value this entry tracks - confirm */
+	uint32_t ioprio;
+};
+
struct thread_stat {
char name[FIO_JOBNAME_SIZE];
char verror[FIO_VERROR_SIZE];
uint32_t error;
uint32_t thread_number;
uint32_t groupid;
+ uint64_t job_start; /* Time job was started, as clock_gettime(job_start_clock_id) */
uint32_t pid;
char description[FIO_JOBDESC_SIZE];
uint32_t members;
uint32_t unified_rw_rep;
+ uint32_t disable_prio_stat;
/*
* bandwidth and latency stats
fio_fp64_t ss_deviation;
fio_fp64_t ss_criterion;
- uint64_t io_u_plat_high_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR] __attribute__((aligned(8)));;
- uint64_t io_u_plat_low_prio[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
- struct io_stat clat_high_prio_stat[DDIR_RWDIR_CNT] __attribute__((aligned(8)));
- struct io_stat clat_low_prio_stat[DDIR_RWDIR_CNT];
+ /* A mirror of td->ioprio. */
+ uint32_t ioprio;
union {
uint64_t *ss_iops_data;
+ /*
+ * For FIO_NET_CMD_TS, the pointed to data will temporarily
+ * be stored at this offset from the start of the payload.
+ */
+ uint64_t ss_iops_data_offset;
uint64_t pad4;
};
union {
uint64_t *ss_bw_data;
+ /*
+ * For FIO_NET_CMD_TS, the pointed to data will temporarily
+ * be stored at this offset from the start of the payload.
+ */
+ uint64_t ss_bw_data_offset;
uint64_t pad5;
};
+ union {
+ struct clat_prio_stat *clat_prio[DDIR_RWDIR_CNT];
+ /*
+ * For FIO_NET_CMD_TS, the pointed to data will temporarily
+ * be stored at this offset from the start of the payload.
+ */
+ uint64_t clat_prio_offset[DDIR_RWDIR_CNT];
+ uint64_t pad6;
+ };
+ uint32_t nr_clat_prio[DDIR_RWDIR_CNT];
+
uint64_t cachehit;
uint64_t cachemiss;
} __attribute__((packed));
extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs, struct flist_head *, struct buf_output *);
extern void show_group_stats(struct group_run_stats *rs, struct buf_output *);
-extern bool calc_thread_status(struct jobs_eta *je, int force);
extern void display_thread_status(struct jobs_eta *je);
extern void __show_run_stats(void);
extern int __show_running_run_stats(void);
extern void show_running_run_stats(void);
extern void check_for_running_stats(void);
-extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
+extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src);
extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
+extern void init_thread_stat_min_vals(struct thread_stat *ts);
extern void init_thread_stat(struct thread_stat *ts);
extern void init_group_run_stat(struct group_run_stats *gs);
extern void eta_to_str(char *str, unsigned long eta_sec);
extern void clear_rusage_stat(struct thread_data *);
extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
- unsigned long long, uint64_t, uint8_t);
+ unsigned long long, uint64_t, unsigned int, unsigned short);
extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
- unsigned long long, uint64_t, uint8_t);
+ unsigned long long, uint64_t, unsigned int, unsigned short);
extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long long,
- unsigned long long, uint64_t, uint8_t);
-extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long bs,
- uint8_t priority_bit);
+ unsigned long long, uint64_t, unsigned int);
+extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned long long);
extern void add_iops_sample(struct thread_data *, struct io_u *,
unsigned int);
extern void add_bw_sample(struct thread_data *, struct io_u *,
extern void add_sync_clat_sample(struct thread_stat *ts,
unsigned long long nsec);
extern int calc_log_samples(void);
+extern void free_clat_prio_stats(struct thread_stat *);
+extern int alloc_clat_prio_stat_ddir(struct thread_stat *, enum fio_ddir, int);
extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *);
extern void json_array_add_disk_util(struct disk_util_stat *dus,
#include "steadystate.h"
bool steadystate_enabled = false;
+unsigned int ss_check_interval = 1000;
void steadystate_free(struct thread_data *td)
{
static void steadystate_alloc(struct thread_data *td)
{
- td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t));
- td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t));
+ int intervals = td->ss.dur / (ss_check_interval / 1000L);
+
+ td->ss.bw_data = calloc(intervals, sizeof(uint64_t));
+ td->ss.iops_data = calloc(intervals, sizeof(uint64_t));
td->ss.state |= FIO_SS_DATA;
}
void steadystate_setup(void)
{
- struct thread_data *td, *prev_td;
- int i, prev_groupid;
+ struct thread_data *prev_td;
+ int prev_groupid;
if (!steadystate_enabled)
return;
*/
prev_groupid = -1;
prev_td = NULL;
- for_each_td(td, i) {
+ for_each_td(td) {
if (!td->ss.dur)
continue;
prev_groupid = td->groupid;
}
prev_td = td;
- }
+ } end_for_each();
if (prev_td && prev_td->o.group_reporting)
steadystate_alloc(prev_td);
double result;
struct steadystate_data *ss = &td->ss;
uint64_t new_val;
+ int intervals = ss->dur / (ss_check_interval / 1000L);
ss->bw_data[ss->tail] = bw;
ss->iops_data[ss->tail] = iops;
else
new_val = bw;
- if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) {
+ if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == intervals - 1) {
if (!(ss->state & FIO_SS_BUFFER_FULL)) {
/* first time through */
- for(i = 0, ss->sum_y = 0; i < ss->dur; i++) {
+ for (i = 0, ss->sum_y = 0; i < intervals; i++) {
if (ss->state & FIO_SS_IOPS)
ss->sum_y += ss->iops_data[i];
else
ss->sum_y += ss->bw_data[i];
- j = (ss->head + i) % ss->dur;
+ j = (ss->head + i) % intervals;
if (ss->state & FIO_SS_IOPS)
ss->sum_xy += i * ss->iops_data[j];
else
} else { /* easy to update the sums */
ss->sum_y -= ss->oldest_y;
ss->sum_y += new_val;
- ss->sum_xy = ss->sum_xy - ss->sum_y + ss->dur * new_val;
+ ss->sum_xy = ss->sum_xy - ss->sum_y + intervals * new_val;
}
if (ss->state & FIO_SS_IOPS)
* equally spaced when they are often off by a few milliseconds.
* This assumption greatly simplifies the calculations.
*/
- ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / ss->dur) /
- (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / ss->dur);
+ ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / intervals) /
+ (ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / intervals);
if (ss->state & FIO_SS_PCT)
- ss->criterion = 100.0 * ss->slope / (ss->sum_y / ss->dur);
+ ss->criterion = 100.0 * ss->slope / (ss->sum_y / intervals);
else
ss->criterion = ss->slope;
return true;
}
- ss->tail = (ss->tail + 1) % ss->dur;
+ ss->tail = (ss->tail + 1) % intervals;
if (ss->tail <= ss->head)
- ss->head = (ss->head + 1) % ss->dur;
+ ss->head = (ss->head + 1) % intervals;
return false;
}
double mean;
struct steadystate_data *ss = &td->ss;
+ int intervals = ss->dur / (ss_check_interval / 1000L);
ss->bw_data[ss->tail] = bw;
ss->iops_data[ss->tail] = iops;
- if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) {
+ if (ss->state & FIO_SS_BUFFER_FULL || ss->tail - ss->head == intervals - 1) {
if (!(ss->state & FIO_SS_BUFFER_FULL)) {
/* first time through */
- for(i = 0, ss->sum_y = 0; i < ss->dur; i++)
+ for (i = 0, ss->sum_y = 0; i < intervals; i++) {
if (ss->state & FIO_SS_IOPS)
ss->sum_y += ss->iops_data[i];
else
ss->sum_y += ss->bw_data[i];
+ }
ss->state |= FIO_SS_BUFFER_FULL;
} else { /* easy to update the sum */
ss->sum_y -= ss->oldest_y;
else
ss->oldest_y = ss->bw_data[ss->head];
- mean = (double) ss->sum_y / ss->dur;
+ mean = (double) ss->sum_y / intervals;
ss->deviation = 0.0;
- for (i = 0; i < ss->dur; i++) {
+ for (i = 0; i < intervals; i++) {
if (ss->state & FIO_SS_IOPS)
diff = ss->iops_data[i] - mean;
else
else
ss->criterion = ss->deviation;
- dprint(FD_STEADYSTATE, "sum_y: %llu, mean: %f, max diff: %f, "
+ dprint(FD_STEADYSTATE, "intervals: %d, sum_y: %llu, mean: %f, max diff: %f, "
"objective: %f, limit: %f\n",
+ intervals,
(unsigned long long) ss->sum_y, mean,
ss->deviation, ss->criterion, ss->limit);
return true;
}
- ss->tail = (ss->tail + 1) % ss->dur;
- if (ss->tail <= ss->head)
- ss->head = (ss->head + 1) % ss->dur;
+ ss->tail = (ss->tail + 1) % intervals;
+ if (ss->tail == ss->head)
+ ss->head = (ss->head + 1) % intervals;
return false;
}
int steadystate_check(void)
{
- int i, j, ddir, prev_groupid, group_ramp_time_over = 0;
+ int ddir, prev_groupid, group_ramp_time_over = 0;
unsigned long rate_time;
- struct thread_data *td, *td2;
struct timespec now;
uint64_t group_bw = 0, group_iops = 0;
uint64_t td_iops, td_bytes;
bool ret;
prev_groupid = -1;
- for_each_td(td, i) {
+ for_each_td(td) {
const bool needs_lock = td_async_processing(td);
struct steadystate_data *ss = &td->ss;
fio_gettime(&now, NULL);
if (ss->ramp_time && !(ss->state & FIO_SS_RAMP_OVER)) {
/*
- * Begin recording data one second after ss->ramp_time
+ * Begin recording data one check interval after ss->ramp_time
* has elapsed
*/
- if (utime_since(&td->epoch, &now) >= (ss->ramp_time + 1000000L))
+ if (utime_since(&td->epoch, &now) >= (ss->ramp_time + ss_check_interval * 1000L))
ss->state |= FIO_SS_RAMP_OVER;
}
rate_time = mtime_since(&ss->prev_time, &now);
memcpy(&ss->prev_time, &now, sizeof(now));
- /*
- * Begin monitoring when job starts but don't actually use
- * data in checking stopping criterion until ss->ramp_time is
- * over. This ensures that we will have a sane value in
- * prev_iops/bw the first time through after ss->ramp_time
- * is done.
- */
if (ss->state & FIO_SS_RAMP_OVER) {
- group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time;
- group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time;
+ group_bw += rate_time * (td_bytes - ss->prev_bytes) /
+ (ss_check_interval * ss_check_interval / 1000L);
+ group_iops += rate_time * (td_iops - ss->prev_iops) /
+ (ss_check_interval * ss_check_interval / 1000L);
++group_ramp_time_over;
}
ss->prev_iops = td_iops;
dprint(FD_STEADYSTATE, "steadystate_check() thread: %d, "
"groupid: %u, rate_msec: %ld, "
"iops: %llu, bw: %llu, head: %d, tail: %d\n",
- i, td->groupid, rate_time,
+ __td_index, td->groupid, rate_time,
(unsigned long long) group_iops,
(unsigned long long) group_bw,
ss->head, ss->tail);
if (ret) {
if (td->o.group_reporting) {
- for_each_td(td2, j) {
+ for_each_td(td2) {
if (td2->groupid == td->groupid) {
td2->ss.state |= FIO_SS_ATTAINED;
fio_mark_td_terminate(td2);
}
- }
+ } end_for_each();
} else {
ss->state |= FIO_SS_ATTAINED;
fio_mark_td_terminate(td);
}
}
- }
+ } end_for_each();
return 0;
}
{
struct steadystate_data *ss = &td->ss;
struct thread_options *o = &td->o;
- struct thread_data *td2;
- int j;
+ int intervals;
memset(ss, 0, sizeof(*ss));
ss->dur = o->ss_dur;
ss->limit = o->ss_limit.u.f;
ss->ramp_time = o->ss_ramp_time;
+ ss_check_interval = o->ss_check_interval / 1000L;
ss->state = o->ss_state;
if (!td->ss.ramp_time)
ss->state |= FIO_SS_RAMP_OVER;
- ss->sum_x = o->ss_dur * (o->ss_dur - 1) / 2;
- ss->sum_x_sq = (o->ss_dur - 1) * (o->ss_dur) * (2*o->ss_dur - 1) / 6;
+ intervals = ss->dur / (ss_check_interval / 1000L);
+ ss->sum_x = intervals * (intervals - 1) / 2;
+ ss->sum_x_sq = (intervals - 1) * (intervals) * (2*intervals - 1) / 6;
}
/* make sure that ss options are consistent within reporting group */
- for_each_td(td2, j) {
+ for_each_td(td2) {
if (td2->groupid == td->groupid) {
struct steadystate_data *ss2 = &td2->ss;
return 1;
}
}
- }
+ } end_for_each();
return 0;
}
{
int i;
uint64_t sum;
-
+ int intervals = ts->ss_dur / (ss_check_interval / 1000L);
+
if (!ts->ss_dur)
return 0;
- for (i = 0, sum = 0; i < ts->ss_dur; i++)
+ for (i = 0, sum = 0; i < intervals; i++)
sum += ts->ss_bw_data[i];
- return sum / ts->ss_dur;
+ return sum / intervals;
}
uint64_t steadystate_iops_mean(struct thread_stat *ts)
{
int i;
uint64_t sum;
+ int intervals = ts->ss_dur / (ss_check_interval / 1000L);
if (!ts->ss_dur)
return 0;
- for (i = 0, sum = 0; i < ts->ss_dur; i++)
+ for (i = 0, sum = 0; i < intervals; i++)
sum += ts->ss_iops_data[i];
- return sum / ts->ss_dur;
+ return sum / intervals;
}
extern uint64_t steadystate_iops_mean(struct thread_stat *);
extern bool steadystate_enabled;
+extern unsigned int ss_check_interval;
struct steadystate_data {
double limit;
FIO_SS_BW_SLOPE = FIO_SS_BW | FIO_SS_SLOPE,
};
-#define STEADYSTATE_MSEC 1000
-
#endif
#include "../lib/bloom.h"
#include "debug.h"
+#include "zlib.h"
+
+struct zlib_ctrl {
+ z_stream stream;
+ unsigned char *buf_in;
+ unsigned char *buf_out;
+};
struct worker_thread {
+ struct zlib_ctrl zc;
pthread_t thread;
-
- volatile int done;
-
- int fd;
uint64_t cur_offset;
uint64_t size;
-
+ unsigned long long unique_capacity;
unsigned long items;
unsigned long dupes;
int err;
+ int fd;
+ volatile int done;
};
struct extent {
static unsigned int collision_check;
static unsigned int print_progress = 1;
static unsigned int use_bloom = 1;
+static unsigned int compression = 0;
static uint64_t total_size;
static uint64_t cur_offset;
return 0;
}
ret = bytes;
- } else
+ } else {
ret = sb->st_size;
+ }
return (ret & ~((uint64_t)blocksize - 1));
}
if (ret < 0) {
perror("pread");
return 1;
- } else if (!ret)
+ } else if (!ret) {
return 1;
- else if (ret != count) {
+ } else if (ret != count) {
log_err("dedupe: short read on block\n");
return 1;
}
return __read_block(fd, buf, offset, blocksize);
}
+/*
+ * Compress the block at 'offset' and add its compressed size to
+ * '*unique_capacity'.
+ *
+ * The block is read from file.fd into zc->buf_in and deflated in one
+ * Z_FINISH call into zc->buf_out using the stream held in 'zc'. The
+ * stream is reset afterwards so it can be reused for the next block.
+ *
+ * Returns 0 on success, 1 if the block could not be read or deflate()
+ * reported Z_STREAM_ERROR.
+ */
+static int account_unique_capacity(uint64_t offset, uint64_t *unique_capacity,
+				   struct zlib_ctrl *zc)
+{
+	z_stream *stream = &zc->stream;
+	unsigned int compressed_len, out_len;
+	int ret;
+
+	if (read_block(file.fd, zc->buf_in, offset))
+		return 1;
+
+	/*
+	 * Worst-case output size for one block.
+	 * NOTE(review): assumes zc->buf_out was allocated at least this
+	 * large by the caller - confirm.
+	 */
+	out_len = deflateBound(stream, blocksize);
+
+	stream->next_in = zc->buf_in;
+	stream->avail_in = blocksize;
+	stream->avail_out = out_len;
+	stream->next_out = zc->buf_out;
+
+	ret = deflate(stream, Z_FINISH);
+	if (ret == Z_STREAM_ERROR)
+		return 1;
+
+	/*
+	 * Bytes produced = output space offered - output space left. The
+	 * space offered is out_len, not blocksize; subtracting from
+	 * blocksize under-reports every block and can wrap for blocks that
+	 * compress to less than (out_len - blocksize) bytes.
+	 */
+	compressed_len = out_len - stream->avail_out;
+
+	if (dump_output)
+		printf("offset 0x%lx compressed to %d blocksize %d ratio %.2f \n",
+				(unsigned long) offset, compressed_len, blocksize,
+				(float)compressed_len / (float)blocksize);
+
+	*unique_capacity += compressed_len;
+	deflateReset(stream);
+	return 0;
+}
+
static void add_item(struct chunk *c, struct item *i)
{
/*
if (collision_check || dump_output) {
c = malloc(sizeof(struct chunk) + sizeof(struct flist_head));
INIT_FLIST_HEAD(&c->extent_list[0]);
- } else
+ } else {
c = malloc(sizeof(struct chunk));
+ }
return c;
}
-static void insert_chunk(struct item *i)
+static int insert_chunk(struct item *i, uint64_t *unique_capacity,
+ struct zlib_ctrl *zc)
{
struct fio_rb_node **p, *parent;
struct chunk *c;
- int diff;
+ int ret, diff;
p = &rb_root.rb_node;
parent = NULL;
c = rb_entry(parent, struct chunk, rb_node);
diff = memcmp(i->hash, c->hash, sizeof(i->hash));
- if (diff < 0)
+ if (diff < 0) {
p = &(*p)->rb_left;
- else if (diff > 0)
+ } else if (diff > 0) {
p = &(*p)->rb_right;
- else {
- int ret;
-
+ } else {
if (!collision_check)
goto add;
memcpy(c->hash, i->hash, sizeof(i->hash));
rb_link_node(&c->rb_node, parent, p);
rb_insert_color(&c->rb_node, &rb_root);
+ if (compression) {
+ ret = account_unique_capacity(i->offset, unique_capacity, zc);
+ if (ret)
+ return ret;
+ }
add:
add_item(c, i);
+ return 0;
}
-static void insert_chunks(struct item *items, unsigned int nitems,
- uint64_t *ndupes)
+static int insert_chunks(struct item *items, unsigned int nitems,
+ uint64_t *ndupes, uint64_t *unique_capacity,
+ struct zlib_ctrl *zc)
{
- int i;
+ int i, ret = 0;
fio_sem_down(rb_lock);
s = sizeof(items[i].hash) / sizeof(uint32_t);
r = bloom_set(bloom, items[i].hash, s);
*ndupes += r;
- } else
- insert_chunk(&items[i]);
+ } else {
+ ret = insert_chunk(&items[i], unique_capacity, zc);
+ if (ret)
+ break;
+ }
}
fio_sem_up(rb_lock);
+ return ret;
}
static void crc_buf(void *buf, uint32_t *hash)
off_t offset;
int nitems = 0;
uint64_t ndupes = 0;
+ uint64_t unique_capacity = 0;
struct item *items;
+ int ret;
offset = thread->cur_offset;
- nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size));
+ nblocks = read_blocks(thread->fd, buf, offset,
+ min(thread->size, (uint64_t) chunk_size));
if (!nblocks)
return 1;
nitems++;
}
- insert_chunks(items, nitems, &ndupes);
+ ret = insert_chunks(items, nitems, &ndupes, &unique_capacity, &thread->zc);
free(items);
- thread->items += nitems;
- thread->dupes += ndupes;
- return 0;
+ if (!ret) {
+ thread->items += nitems;
+ thread->dupes += ndupes;
+ thread->unique_capacity += unique_capacity;
+ return 0;
+ }
+
+ return ret;
+}
+
+/*
+ * Prepare the worker's zlib state: initialise a deflate stream at the default
+ * compression level, then allocate a blocksize-byte input buffer and an
+ * output buffer of deflateBound() bytes. If deflateInit() fails the function
+ * returns before allocating any buffer.
+ */
+static void thread_init_zlib_control(struct worker_thread *thread)
+{
+	struct zlib_ctrl *zc = &thread->zc;
+	size_t out_len;
+
+	zc->stream.zalloc = Z_NULL;
+	zc->stream.zfree = Z_NULL;
+	zc->stream.opaque = Z_NULL;
+
+	if (deflateInit(&zc->stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+		return;
+
+	out_len = deflateBound(&zc->stream, blocksize);
+	zc->buf_in = fio_memalign(blocksize, blocksize, false);
+	zc->buf_out = fio_memalign(blocksize, out_len, false);
}
static void *thread_fn(void *data)
void *buf;
buf = fio_memalign(blocksize, chunk_size, false);
+ thread_init_zlib_control(thread);
do {
if (get_work(&thread->cur_offset, &thread->size)) {
printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
last_nitems = nitems;
fio_gettime(&last_tv, NULL);
- } else
+ } else {
printf("%3.2f%% done\r", perc);
+ }
fflush(stdout);
usleep(250000);
};
}
static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
- uint64_t *nextents, uint64_t *nchunks)
+ uint64_t *nextents, uint64_t *nchunks,
+ uint64_t *unique_capacity)
{
struct worker_thread *threads;
unsigned long nitems, total_items;
nitems = 0;
*nextents = 0;
*nchunks = 1;
+ *unique_capacity = 0;
for (i = 0; i < num_threads; i++) {
void *ret;
pthread_join(threads[i].thread, &ret);
nitems += threads[i].items;
*nchunks += threads[i].dupes;
+ *unique_capacity += threads[i].unique_capacity;
}
printf("Threads(%u): %lu items processed\n", num_threads, nitems);
}
static int dedupe_check(const char *filename, uint64_t *nextents,
- uint64_t *nchunks)
+ uint64_t *nchunks, uint64_t *unique_capacity)
{
uint64_t dev_size;
struct stat sb;
bloom = bloom_new(bloom_entries);
}
- printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
+ printf("Will check <%s>, size <%llu>, using %u threads\n", filename,
+ (unsigned long long) dev_size, num_threads);
- return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+ return run_dedupe_threads(&file, dev_size, nextents, nchunks,
+ unique_capacity);
err:
if (file.fd != -1)
close(file.fd);
struct flist_head *n;
struct extent *e;
- printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count);
+ printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1],
+ c->hash[2], c->hash[3], (unsigned long) c->count);
flist_for_each(n, &c->extent_list[0]) {
e = flist_entry(n, struct extent, list);
printf("\toffset %llu\n", (unsigned long long) e->offset);
}
}
-static void show_stat(uint64_t nextents, uint64_t nchunks)
+static const char *capacity_unit[] = {"b","KB", "MB", "GB", "TB", "PB", "EB"};
+
+/*
+ * Scale a byte count down by factors of 1024 until it is below 1024.
+ * Returns the scaled (truncated) value and stores the matching entry of
+ * capacity_unit[] in *unit_out.
+ */
+static uint64_t bytes_to_human_readable_unit(uint64_t n, const char **unit_out)
+{
+	uint8_t idx;
+
+	for (idx = 0; n >= 1024; idx++)
+		n >>= 10;
+
+	*unit_out = capacity_unit[idx];
+	return n;
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks, uint64_t ndupextents,
+ uint64_t unique_capacity)
{
double perc, ratio;
+ const char *unit;
+ uint64_t uc_human;
- printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+ printf("Extents=%lu, Unique extents=%lu", (unsigned long) nextents,
+ (unsigned long) nchunks);
+ if (!bloom)
+ printf(" Duplicated extents=%lu", (unsigned long) ndupextents);
+ printf("\n");
if (nchunks) {
ratio = (double) nextents / (double) nchunks;
printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
- } else
+ } else {
printf("De-dupe ratio: 1:infinite\n");
+ }
+
+ if (ndupextents) {
+ printf("De-dupe working set at least: %3.2f%%\n",
+ 100.0 * (double) ndupextents / (double) nextents);
+ }
perc = 1.00 - ((double) nchunks / (double) nextents);
perc *= 100.0;
printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
+
+ if (compression) {
+ uc_human = bytes_to_human_readable_unit(unique_capacity, &unit);
+ printf("Unique capacity %lu%s\n", (unsigned long) uc_human, unit);
+ }
}
-static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks, uint64_t *ndupextents)
{
struct fio_rb_node *n;
-
- *nchunks = *nextents = 0;
+ *nchunks = *nextents = *ndupextents = 0;
n = rb_first(&rb_root);
if (!n)
c = rb_entry(n, struct chunk, rb_node);
(*nchunks)++;
*nextents += c->count;
+ *ndupextents += (c->count > 1);
if (dump_output)
show_chunk(c);
log_err("\t-c\tFull collision check\n");
log_err("\t-B\tUse probabilistic bloom filter\n");
log_err("\t-p\tPrint progress indicator\n");
+ log_err("\t-C\tCalculate compressible size\n");
return 1;
}
int main(int argc, char *argv[])
{
- uint64_t nextents = 0, nchunks = 0;
+ uint64_t nextents = 0, nchunks = 0, ndupextents = 0, unique_capacity;
int c, ret;
arch_init(argv);
debug_init();
- while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+ while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:C:")) != -1) {
switch (c) {
case 'b':
blocksize = atoi(optarg);
case 'B':
use_bloom = atoi(optarg);
break;
+ case 'C':
+ compression = atoi(optarg);
+ break;
case '?':
default:
return usage(argv);
}
}
- if (collision_check || dump_output)
+ if (collision_check || dump_output || compression)
use_bloom = 0;
if (!num_threads)
- num_threads = cpus_online();
+ num_threads = cpus_configured();
if (argc == optind)
return usage(argv);
rb_root = RB_ROOT;
rb_lock = fio_sem_init(FIO_SEM_UNLOCKED);
- ret = dedupe_check(argv[optind], &nextents, &nchunks);
+ ret = dedupe_check(argv[optind], &nextents, &nchunks, &unique_capacity);
if (!ret) {
if (!bloom)
- iter_rb_tree(&nextents, &nchunks);
+ iter_rb_tree(&nextents, &nchunks, &ndupextents);
- show_stat(nextents, nchunks);
+ show_stat(nextents, nchunks, ndupextents, unique_capacity);
}
fio_sem_remove(rb_lock);
--- /dev/null
+#!/usr/bin/env python3
+"""
+fiotestcommon.py
+
+This contains constant definitions, helpers, and a Requirements class that can
+be used to help with running fio tests.
+"""
+
+import os
+import locale
+import logging
+import platform
+import subprocess
+import multiprocessing
+
+
+SUCCESS_DEFAULT = {
+ 'zero_return': True,
+ 'stderr_empty': True,
+ 'timeout': 600,
+ }
+SUCCESS_NONZERO = {
+ 'zero_return': False,
+ 'stderr_empty': False,
+ 'timeout': 600,
+ }
+SUCCESS_STDERR = {
+ 'zero_return': True,
+ 'stderr_empty': False,
+ 'timeout': 600,
+ }
+
+
+def get_file(filename):
+ """Safely read a file."""
+ file_data = ''
+ success = True
+
+ try:
+ with open(filename, "r", encoding=locale.getpreferredencoding()) as output_file:
+ file_data = output_file.read()
+ except OSError:
+ success = False
+
+ return file_data, success
+
+
+class Requirements():
+ """Requirements consists of multiple run environment characteristics.
+ These are to determine if a particular test can be run"""
+
+ # Class-level flags. __init__ overwrites them after probing the environment
+ # and the classmethods further down report them.
+ _linux = False
+ _libaio = False
+ _io_uring = False
+ _zbd = False
+ _root = False
+ _zoned_nullb = False
+ _not_macos = False
+ _not_windows = False
+ _unittests = False
+ _cpucount4 = False
+ _nvmecdev = False
+
+ def __init__(self, fio_root, args):
+ """Probe the environment and record the result in the class-level flags.
+
+ fio_root: directory searched for config-host.h and the unittests binary
+ args: object whose nvmecdev attribute is stored as the nvmecdev flag
+ """
+ Requirements._not_macos = platform.system() != "Darwin"
+ Requirements._not_windows = platform.system() != "Windows"
+ Requirements._linux = platform.system() == "Linux"
+
+ if Requirements._linux:
+ # Build-time features are detected by searching config-host.h for
+ # CONFIG_* names.
+ config_file = os.path.join(fio_root, "config-host.h")
+ contents, success = get_file(config_file)
+ if not success:
+ print(f"Unable to open {config_file} to check requirements")
+ # NOTE(review): zbd is reported as available when config-host.h
+ # cannot be read -- presumably deliberate; confirm.
+ Requirements._zbd = True
+ else:
+ Requirements._zbd = "CONFIG_HAS_BLKZONED" in contents
+ Requirements._libaio = "CONFIG_LIBAIO" in contents
+
+ # io_uring support is inferred from the presence of the
+ # io_uring_setup symbol name in /proc/kallsyms.
+ contents, success = get_file("/proc/kallsyms")
+ if not success:
+ print("Unable to open '/proc/kallsyms' to probe for io_uring support")
+ else:
+ Requirements._io_uring = "io_uring_setup" in contents
+
+ Requirements._root = os.geteuid() == 0
+ if Requirements._zbd and Requirements._root:
+ try:
+ # Try to load null_blk, then look for its 'zoned' parameter.
+ subprocess.run(["modprobe", "null_blk"],
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if os.path.exists("/sys/module/null_blk/parameters/zoned"):
+ Requirements._zoned_nullb = True
+ except Exception:
+ # Best effort: any failure leaves _zoned_nullb at False.
+ pass
+
+ if platform.system() == "Windows":
+ utest_exe = "unittest.exe"
+ else:
+ utest_exe = "unittest"
+ unittest_path = os.path.join(fio_root, "unittests", utest_exe)
+ Requirements._unittests = os.path.exists(unittest_path)
+
+ Requirements._cpucount4 = multiprocessing.cpu_count() >= 4
+ Requirements._nvmecdev = args.nvmecdev
+
+ # Log the state of every requirement at debug level.
+ req_list = [
+ Requirements.linux,
+ Requirements.libaio,
+ Requirements.io_uring,
+ Requirements.zbd,
+ Requirements.root,
+ Requirements.zoned_nullb,
+ Requirements.not_macos,
+ Requirements.not_windows,
+ Requirements.unittests,
+ Requirements.cpucount4,
+ Requirements.nvmecdev,
+ ]
+ for req in req_list:
+ value, desc = req()
+ logging.debug("Requirements: Requirement '%s' met? %s", desc, value)
+
+ # Each classmethod below returns a (flag, description) tuple.
+
+ @classmethod
+ def linux(cls):
+ """Are we running on Linux?"""
+ return Requirements._linux, "Linux required"
+
+ @classmethod
+ def libaio(cls):
+ """Is libaio available?"""
+ return Requirements._libaio, "libaio required"
+
+ @classmethod
+ def io_uring(cls):
+ """Is io_uring available?"""
+ return Requirements._io_uring, "io_uring required"
+
+ @classmethod
+ def zbd(cls):
+ """Is ZBD support available?"""
+ return Requirements._zbd, "Zoned block device support required"
+
+ @classmethod
+ def root(cls):
+ """Are we running as root?"""
+ return Requirements._root, "root required"
+
+ @classmethod
+ def zoned_nullb(cls):
+ """Are zoned null block devices available?"""
+ return Requirements._zoned_nullb, "Zoned null block device support required"
+
+ @classmethod
+ def not_macos(cls):
+ """Are we running on a platform other than macOS?"""
+ return Requirements._not_macos, "platform other than macOS required"
+
+ @classmethod
+ def not_windows(cls):
+ """Are we running on a platform other than Windows?"""
+ return Requirements._not_windows, "platform other than Windows required"
+
+ @classmethod
+ def unittests(cls):
+ """Were unittests built?"""
+ return Requirements._unittests, "Unittests support required"
+
+ @classmethod
+ def cpucount4(cls):
+ """Do we have at least 4 CPUs?"""
+ return Requirements._cpucount4, "4+ CPUs required"
+
+ @classmethod
+ def nvmecdev(cls):
+ """Do we have an NVMe character device to test?"""
+ return Requirements._nvmecdev, "NVMe character device test target required"
--- /dev/null
+#!/usr/bin/env python3
+"""
+fiotestlib.py
+
+This library contains FioTest objects that provide convenient means to run
+different sorts of fio tests.
+
+It also contains a test runner that runs an array of dictionary objects
+describing fio tests.
+"""
+
+import os
+import sys
+import json
+import locale
+import logging
+import platform
+import traceback
+import subprocess
+from pathlib import Path
+from fiotestcommon import get_file, SUCCESS_DEFAULT
+
+
+class FioTest():
+ """Base for all fio tests."""
+
+ def __init__(self, exe_path, success, testnum, artifact_root):
+ self.success = success
+ self.testnum = testnum
+ self.output = {}
+ self.passed = True
+ self.failure_reason = ''
+ self.parameters = None
+ self.paths = {
+ 'exe': exe_path,
+ 'artifacts': artifact_root,
+ 'test_dir': os.path.join(artifact_root, \
+ f"{testnum:04d}"),
+ }
+ self.filenames = {
+ 'cmd': os.path.join(self.paths['test_dir'], \
+ f"{os.path.basename(self.paths['exe'])}.command"),
+ 'stdout': os.path.join(self.paths['test_dir'], \
+ f"{os.path.basename(self.paths['exe'])}.stdout"),
+ 'stderr': os.path.join(self.paths['test_dir'], \
+ f"{os.path.basename(self.paths['exe'])}.stderr"),
+ 'exitcode': os.path.join(self.paths['test_dir'], \
+ f"{os.path.basename(self.paths['exe'])}.exitcode"),
+ }
+
+ def setup(self, parameters):
+ """Setup instance variables for test."""
+
+ self.parameters = parameters
+ if not os.path.exists(self.paths['test_dir']):
+ os.mkdir(self.paths['test_dir'])
+
+ def run(self):
+ """Run the test."""
+
+ raise NotImplementedError()
+
+ def check_result(self):
+ """Check test results."""
+
+ raise NotImplementedError()
+
+
+class FioExeTest(FioTest):
+ """Test consists of an executable binary or script"""
+
+ def run(self):
+ """Execute the binary or script described by this instance."""
+
+ command = [self.paths['exe']] + self.parameters
+ with open(self.filenames['cmd'], "w+",
+ encoding=locale.getpreferredencoding()) as command_file:
+ command_file.write(" \\\n ".join(command))
+
+ try:
+ with open(self.filenames['stdout'], "w+",
+ encoding=locale.getpreferredencoding()) as stdout_file, \
+ open(self.filenames['stderr'], "w+",
+ encoding=locale.getpreferredencoding()) as stderr_file, \
+ open(self.filenames['exitcode'], "w+",
+ encoding=locale.getpreferredencoding()) as exitcode_file:
+ proc = None
+ # Avoid using subprocess.run() here because when a timeout occurs,
+ # fio will be stopped with SIGKILL. This does not give fio a
+ # chance to clean up and means that child processes may continue
+ # running and submitting IO.
+ proc = subprocess.Popen(command,
+ stdout=stdout_file,
+ stderr=stderr_file,
+ cwd=self.paths['test_dir'],
+ universal_newlines=True)
+ proc.communicate(timeout=self.success['timeout'])
+ exitcode_file.write(f'{proc.returncode}\n')
+ logging.debug("Test %d: return code: %d", self.testnum, proc.returncode)
+ self.output['proc'] = proc
+ except subprocess.TimeoutExpired:
+ proc.terminate()
+ proc.communicate()
+ assert proc.poll()
+ self.output['failure'] = 'timeout'
+ except Exception:
+ if proc:
+ if not proc.poll():
+ proc.terminate()
+ proc.communicate()
+ self.output['failure'] = 'exception'
+ self.output['exc_info'] = sys.exc_info()
+
+ def check_result(self):
+ """Check results of test run."""
+
+ if 'proc' not in self.output:
+ if self.output['failure'] == 'timeout':
+ self.failure_reason = f"{self.failure_reason} timeout,"
+ else:
+ assert self.output['failure'] == 'exception'
+ self.failure_reason = f'{self.failure_reason} exception: ' + \
+ f'{self.output["exc_info"][0]}, {self.output["exc_info"][1]}'
+
+ self.passed = False
+ return
+
+ if 'zero_return' in self.success:
+ if self.success['zero_return']:
+ if self.output['proc'].returncode != 0:
+ self.passed = False
+ self.failure_reason = f"{self.failure_reason} non-zero return code,"
+ else:
+ if self.output['proc'].returncode == 0:
+ self.failure_reason = f"{self.failure_reason} zero return code,"
+ self.passed = False
+
+ stderr_size = os.path.getsize(self.filenames['stderr'])
+ if 'stderr_empty' in self.success:
+ if self.success['stderr_empty']:
+ if stderr_size != 0:
+ self.failure_reason = f"{self.failure_reason} stderr not empty,"
+ self.passed = False
+ else:
+ if stderr_size == 0:
+ self.failure_reason = f"{self.failure_reason} stderr empty,"
+ self.passed = False
+
+
+class FioJobFileTest(FioExeTest):
+ """Test consists of a fio job with options in a job file."""
+
+ def __init__(self, fio_path, fio_job, success, testnum, artifact_root,
+ fio_pre_job=None, fio_pre_success=None,
+ output_format="normal"):
+ """Construct a FioJobFileTest which is a FioExeTest consisting of a
+ single fio job file with an optional setup step.
+
+ fio_path: location of fio executable
+ fio_job: location of fio job file
+ success: Definition of test success
+ testnum: test ID
+ artifact_root: root directory for artifacts
+ fio_pre_job: fio job for preconditioning
+ fio_pre_success: Definition of test success for fio precon job
+ output_format: normal (default), json, jsonplus, or terse
+ """
+
+ self.fio_job = fio_job
+ self.fio_pre_job = fio_pre_job
+ self.fio_pre_success = fio_pre_success if fio_pre_success else success
+ self.output_format = output_format
+ self.precon_failed = False
+ self.json_data = None
+
+ super().__init__(fio_path, success, testnum, artifact_root)
+
+ def setup(self, parameters):
+ """Setup instance variables for fio job test."""
+
+ self.filenames['fio_output'] = f"{os.path.basename(self.fio_job)}.output"
+ fio_args = [
+ "--max-jobs=16",
+ f"--output-format={self.output_format}",
+ f"--output={self.filenames['fio_output']}",
+ self.fio_job,
+ ]
+ if parameters:
+ fio_args += parameters
+
+ super().setup(fio_args)
+
+ # Update the filenames from the default
+ self.filenames['cmd'] = os.path.join(self.paths['test_dir'],
+ f"{os.path.basename(self.fio_job)}.command")
+ self.filenames['stdout'] = os.path.join(self.paths['test_dir'],
+ f"{os.path.basename(self.fio_job)}.stdout")
+ self.filenames['stderr'] = os.path.join(self.paths['test_dir'],
+ f"{os.path.basename(self.fio_job)}.stderr")
+ self.filenames['exitcode'] = os.path.join(self.paths['test_dir'],
+ f"{os.path.basename(self.fio_job)}.exitcode")
+
+ def run_pre_job(self):
+ """Run fio job precondition step."""
+
+ precon = FioJobFileTest(self.paths['exe'], self.fio_pre_job,
+ self.fio_pre_success,
+ self.testnum,
+ self.paths['artifacts'],
+ output_format=self.output_format)
+ precon.setup(None)
+ precon.run()
+ precon.check_result()
+ self.precon_failed = not precon.passed
+ self.failure_reason = precon.failure_reason
+
+ def run(self):
+ """Run fio job test."""
+
+ if self.fio_pre_job:
+ self.run_pre_job()
+
+ if not self.precon_failed:
+ super().run()
+ else:
+ logging.debug("Test %d: precondition step failed", self.testnum)
+
+ def get_file_fail(self, filename):
+ """Safely read a file and fail the test upon error."""
+ file_data = None
+
+ try:
+ with open(filename, "r", encoding=locale.getpreferredencoding()) as output_file:
+ file_data = output_file.read()
+ except OSError:
+ self.failure_reason += f" unable to read file {filename}"
+ self.passed = False
+
+ return file_data
+
+ def check_result(self):
+ """Check fio job results."""
+
+ if self.precon_failed:
+ self.passed = False
+ self.failure_reason = f"{self.failure_reason} precondition step failed,"
+ return
+
+ super().check_result()
+
+ if not self.passed:
+ return
+
+ if 'json' not in self.output_format:
+ return
+
+ file_data = self.get_file_fail(os.path.join(self.paths['test_dir'],
+ self.filenames['fio_output']))
+ if not file_data:
+ return
+
+ #
+ # Sometimes fio informational messages are included at the top of the
+ # JSON output, especially under Windows. Try to decode output as JSON
+ # data, skipping everything until the first {
+ #
+ lines = file_data.splitlines()
+ file_data = '\n'.join(lines[lines.index("{"):])
+ try:
+ self.json_data = json.loads(file_data)
+ except json.JSONDecodeError:
+ self.failure_reason = f"{self.failure_reason} unable to decode JSON data,"
+ self.passed = False
+
+
+class FioJobCmdTest(FioExeTest):
+ """This runs a fio job with options specified on the command line."""
+
+ def __init__(self, fio_path, success, testnum, artifact_root, fio_opts, basename=None):
+ """Store the fio options and derive this test's artifact file names.
+
+ basename defaults to the basename of fio_path; together with the
+ three-digit test number it forms the stem of every artifact file name.
+ """
+
+ self.basename = basename if basename else os.path.basename(fio_path)
+ self.fio_opts = fio_opts
+ self.json_data = None
+ self.iops_log_lines = None
+
+ super().__init__(fio_path, success, testnum, artifact_root)
+
+ # Replace the default file names set by the base class.
+ filename_stub = os.path.join(self.paths['test_dir'], f"{self.basename}{self.testnum:03d}")
+ self.filenames['cmd'] = f"{filename_stub}.command"
+ self.filenames['stdout'] = f"{filename_stub}.stdout"
+ self.filenames['stderr'] = f"{filename_stub}.stderr"
+ self.filenames['output'] = os.path.abspath(f"{filename_stub}.output")
+ self.filenames['exitcode'] = f"{filename_stub}.exitcode"
+ self.filenames['iopslog'] = os.path.abspath(f"{filename_stub}")
+
+ def run(self):
+ """Run the command, then load the JSON output and the IOPS log when the
+ options ask for them."""
+ super().run()
+
+ if 'output-format' in self.fio_opts and 'json' in \
+ self.fio_opts['output-format']:
+ if not self.get_json():
+ print('Unable to decode JSON data')
+ self.passed = False
+
+ if any('--write_iops_log=' in param for param in self.parameters):
+ self.get_iops_log()
+
+ def get_iops_log(self):
+ """Read IOPS log from the first job."""
+
+ # The whole file is stored as one string in self.iops_log_lines.
+ log_filename = self.filenames['iopslog'] + "_iops.1.log"
+ with open(log_filename, 'r', encoding=locale.getpreferredencoding()) as iops_file:
+ self.iops_log_lines = iops_file.read()
+
+ def get_json(self):
+ """Convert fio JSON output into a python JSON object
+
+ Returns True once decoding succeeds, False if every attempt fails.
+ """
+
+ filename = self.filenames['output']
+ with open(filename, 'r', encoding=locale.getpreferredencoding()) as file:
+ file_data = file.read()
+
+ #
+ # Sometimes fio informational messages are included at the top of the
+ # JSON output, especially under Windows. Try to decode output as JSON
+ # data, lopping off up to the first four lines
+ #
+ lines = file_data.splitlines()
+ # i runs from 0 to 4: the first attempt drops no lines, the last drops four.
+ for i in range(5):
+ file_data = '\n'.join(lines[i:])
+ try:
+ self.json_data = json.loads(file_data)
+ except json.JSONDecodeError:
+ continue
+ else:
+ return True
+
+ return False
+
+ @staticmethod
+ def check_empty(job):
+ """
+ Make sure JSON data is empty.
+
+ Some data structures should be empty. This function makes sure that they are.
+
+ job JSON object that we need to check for emptiness
+ """
+
+ return job['total_ios'] == 0 and \
+ job['slat_ns']['N'] == 0 and \
+ job['clat_ns']['N'] == 0 and \
+ job['lat_ns']['N'] == 0
+
+ def check_all_ddirs(self, ddir_nonzero, job):
+ """
+ Iterate over the data directions and check whether each is
+ appropriately empty or not.
+
+ ddir_nonzero container of data directions expected to have data
+ job JSON object indexed by 'read', 'write' and 'trim'
+
+ Returns True when every direction matches the expectation.
+ """
+
+ retval = True
+ ddirlist = ['read', 'write', 'trim']
+
+ for ddir in ddirlist:
+ if ddir in ddir_nonzero:
+ if self.check_empty(job[ddir]):
+ print(f"Unexpected zero {ddir} data found in output")
+ retval = False
+ else:
+ if not self.check_empty(job[ddir]):
+ print(f"Unexpected {ddir} data found in output")
+ retval = False
+
+ return retval
+
+
+def run_fio_tests(test_list, test_env, args):
+ """
+ Run tests as specified in test_list.
+
+ test_list iterable of dictionaries, each describing one test
+ test_env dictionary read for fio_root, fio_path, artifact_root,
+ basename and pass_through
+ args object read for skip, run_only, skip_req and nvmecdev
+
+ Prints one result line per test plus a summary and returns a
+ (passed, failed, skipped) tuple of counts.
+ """
+
+ passed = 0
+ failed = 0
+ skipped = 0
+
+ for config in test_list:
+ # Skips requested on the command line or forced by the test config.
+ if (args.skip and config['test_id'] in args.skip) or \
+ (args.run_only and config['test_id'] not in args.run_only) or \
+ ('force_skip' in config and config['force_skip']):
+ skipped = skipped + 1
+ print(f"Test {config['test_id']} SKIPPED (User request or override)")
+ continue
+
+ # Build the test object. Subclasses are checked before FioExeTest
+ # because they would also match the FioExeTest branch.
+ if issubclass(config['test_class'], FioJobFileTest):
+ if config['pre_job']:
+ fio_pre_job = os.path.join(test_env['fio_root'], 't', 'jobs',
+ config['pre_job'])
+ else:
+ fio_pre_job = None
+ if config['pre_success']:
+ fio_pre_success = config['pre_success']
+ else:
+ fio_pre_success = None
+ if 'output_format' in config:
+ output_format = config['output_format']
+ else:
+ output_format = 'normal'
+ test = config['test_class'](
+ test_env['fio_path'],
+ os.path.join(test_env['fio_root'], 't', 'jobs', config['job']),
+ config['success'],
+ config['test_id'],
+ test_env['artifact_root'],
+ fio_pre_job=fio_pre_job,
+ fio_pre_success=fio_pre_success,
+ output_format=output_format)
+ desc = config['job']
+ parameters = config['parameters'] if 'parameters' in config else None
+ elif issubclass(config['test_class'], FioJobCmdTest):
+ if not 'success' in config:
+ config['success'] = SUCCESS_DEFAULT
+ test = config['test_class'](test_env['fio_path'],
+ config['success'],
+ config['test_id'],
+ test_env['artifact_root'],
+ config['fio_opts'],
+ test_env['basename'])
+ desc = config['test_id']
+ # The whole config dictionary is handed to setup() for this test type.
+ parameters = config
+ elif issubclass(config['test_class'], FioExeTest):
+ exe_path = os.path.join(test_env['fio_root'], config['exe'])
+ parameters = []
+ if config['parameters']:
+ parameters = [p.format(fio_path=test_env['fio_path'], nvmecdev=args.nvmecdev)
+ for p in config['parameters']]
+ # On Windows, Python scripts are run through python.exe.
+ if Path(exe_path).suffix == '.py' and platform.system() == "Windows":
+ parameters.insert(0, exe_path)
+ exe_path = "python.exe"
+ if config['test_id'] in test_env['pass_through']:
+ parameters += test_env['pass_through'][config['test_id']].split()
+ test = config['test_class'](
+ exe_path,
+ config['success'],
+ config['test_id'],
+ test_env['artifact_root'])
+ desc = config['exe']
+ else:
+ print(f"Test {config['test_id']} FAILED: unable to process test config")
+ failed = failed + 1
+ continue
+
+ # Each requirement is a callable returning (met, reason); checking
+ # stops at the first one that is not met.
+ if 'requirements' in config and not args.skip_req:
+ reqs_met = True
+ for req in config['requirements']:
+ reqs_met, reason = req()
+ logging.debug("Test %d: Requirement '%s' met? %s", config['test_id'], reason,
+ reqs_met)
+ if not reqs_met:
+ break
+ if not reqs_met:
+ print(f"Test {config['test_id']} SKIPPED ({reason}) {desc}")
+ skipped = skipped + 1
+ continue
+
+ try:
+ test.setup(parameters)
+ test.run()
+ test.check_result()
+ except KeyboardInterrupt:
+ break
+ except Exception as e:
+ # Any other exception marks the test as failed instead of
+ # aborting the run.
+ test.passed = False
+ test.failure_reason += str(e)
+ logging.debug("Test %d exception:\n%s\n", config['test_id'], traceback.format_exc())
+ if test.passed:
+ result = "PASSED"
+ passed = passed + 1
+ else:
+ result = f"FAILED: {test.failure_reason}"
+ failed = failed + 1
+ # Log the captured stderr and stdout of the failed test.
+ contents, _ = get_file(test.filenames['stderr'])
+ logging.debug("Test %d: stderr:\n%s", config['test_id'], contents)
+ contents, _ = get_file(test.filenames['stdout'])
+ logging.debug("Test %d: stdout:\n%s", config['test_id'], contents)
+ print(f"Test {config['test_id']} {result} {desc}")
+
+ print(f"{passed} test(s) passed, {failed} failed, {skipped} skipped")
+
+ return passed, failed, skipped
--- /dev/null
+#include "fio.h"
+
+static int initialized = 0;
+
+const char *const fakeargv[] = {(char *) "fuzz",
+ (char *) "--output", (char *) "/dev/null",
+ (char *) "--parse-only",
+ 0};
+
+/*
+ * Fuzzer entry point: hand one input to fio's job-file parser.
+ *
+ * The last input byte is passed as the final argument of parse_jobs_ini();
+ * the bytes before it become the NUL-terminated job description. Option and
+ * command-line setup is guarded by the 'initialized' flag so it runs once.
+ * Inputs shorter than two bytes are ignored.
+ *
+ * NOTE(review): libFuzzer documents return values other than 0 and -1 as
+ * reserved; the two "return 1" paths are kept unchanged here -- confirm.
+ */
+int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size)
+{
+	size_t ini_len;
+	char *ini;
+
+	if (size < 2)
+		return 0;
+
+	if (!initialized) {
+		if (fio_init_options()) {
+			printf("Failed fio_init_options\n");
+			return 1;
+		}
+
+		parse_cmd_line(4, (char **) fakeargv, 0);
+		sinit();
+
+		initialized = 1;
+	}
+
+	ini_len = size - 1;
+	ini = malloc(size);
+	if (!ini) {
+		printf("Failed malloc\n");
+		return 1;
+	}
+
+	/* copy everything except the trailing type byte, then terminate */
+	memcpy(ini, data, ini_len);
+	ini[ini_len] = 0;
+
+	parse_jobs_ini(ini, 1, 0, data[ini_len]);
+	free(ini);
+	return 0;
+}
--- /dev/null
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+/*
+ * Standalone driver: read the file named by argv[1] into memory and pass it
+ * to LLVMFuzzerTestOneInput().
+ *
+ * Returns 1 when the argument count is wrong, 2 on any file or allocation
+ * failure, and 0 after the fuzz target has been called.
+ */
+int main(int argc, char** argv)
+{
+	uint8_t *data = NULL;
+	int status = 2;
+	size_t size;
+	FILE *fp;
+
+	if (argc != 2)
+		return 1;
+
+	fp = fopen(argv[1], "rb");
+	if (fp == NULL)
+		return 2;
+
+	/* find the file size by seeking to the end, then rewind */
+	if (fseek(fp, 0L, SEEK_END) != 0)
+		goto out;
+	size = ftell(fp);
+	if (size == (size_t) -1)
+		goto out;
+	if (fseek(fp, 0L, SEEK_SET) != 0)
+		goto out;
+
+	data = malloc(size);
+	if (data == NULL)
+		goto out;
+	if (fread(data, size, 1, fp) != 1)
+		goto out;
+
+	/* launch fuzzer */
+	LLVMFuzzerTestOneInput(data, size);
+	status = 0;
+out:
+	/* single cleanup path; free(NULL) is a no-op */
+	free(data);
+	fclose(fp);
+	return status;
+}
nranges /= block_size;
if (dist_type == TYPE_ZIPF)
- zipf_init(&zs, nranges, dist_val, 1);
+ zipf_init(&zs, nranges, dist_val, -1, 1);
else if (dist_type == TYPE_PARETO)
- pareto_init(&zs, nranges, dist_val, 1);
+ pareto_init(&zs, nranges, dist_val, -1, 1);
else
- gauss_init(&gs, nranges, dist_val, 1);
+ gauss_init(&gs, nranges, dist_val, -1, 1);
hash_bits = 0;
hash_size = nranges;
#include <stddef.h>
#include <signal.h>
#include <inttypes.h>
+#include <math.h>
+
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
+
+#ifdef CONFIG_LIBNUMA
+#include <numa.h>
+#endif
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>
+#include <libgen.h>
#include "../arch/arch.h"
+#include "../os/os.h"
#include "../lib/types.h"
+#include "../lib/roundup.h"
+#include "../lib/rand.h"
+#include "../minmax.h"
#include "../os/linux/io_uring.h"
-
-#define min(a, b) ((a < b) ? (a) : (b))
+#include "../engines/nvme.h"
struct io_sq_ring {
unsigned *head;
struct file {
unsigned long max_blocks;
+ unsigned long max_size;
+ unsigned long cur_off;
unsigned pending_ios;
+ unsigned int nsid; /* nsid field required for nvme-passthrough */
+ unsigned int lba_shift; /* lba_shift field required for nvme-passthrough */
int real_fd;
int fixed_fd;
+ int fileno;
};
+#define PLAT_BITS 6
+#define PLAT_VAL (1 << PLAT_BITS)
+#define PLAT_GROUP_NR 29
+#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
+
struct submitter {
pthread_t thread;
int ring_fd;
+ int enter_ring_fd;
+ int index;
struct io_sq_ring sq_ring;
struct io_uring_sqe *sqes;
struct io_cq_ring cq_ring;
int inflight;
+ int tid;
unsigned long reaps;
unsigned long done;
unsigned long calls;
+ unsigned long io_errors;
volatile int finish;
__s32 *fds;
+ struct taus258_state rand_state;
+
+ unsigned long *clock_batch;
+ int clock_index;
+ unsigned long *plat;
+
+#ifdef CONFIG_LIBAIO
+ io_context_t aio_ctx;
+#endif
+
+ int numa_node;
+ int per_file_depth;
+ const char *filename;
+
struct file files[MAX_FDS];
unsigned nr_files;
unsigned cur_file;
static struct submitter *submitter;
static volatile int finish;
+static int stats_running;
+static unsigned long max_iops;
+static long t_io_uring_page_size;
static int depth = DEPTH;
static int batch_submit = BATCH_SUBMIT;
static int sq_thread_poll = 0; /* use kernel submission/poller thread */
static int sq_thread_cpu = -1; /* pin above thread to this CPU */
static int do_nop = 0; /* no-op SQ ring commands */
+static int nthreads = 1;
+static int stats = 0; /* generate IO stats */
+static int aio = 0; /* use libaio */
+static int runtime = 0; /* runtime */
+static int random_io = 1; /* random or sequential IO */
+static int register_ring = 1; /* register ring */
+static int use_sync = 0; /* use preadv2 */
+static int numa_placement = 0; /* set to node of device */
+static int pt = 0; /* passthrough I/O or not */
+
+static unsigned long tsc_rate;
+
+#define TSC_RATE_FILE "tsc-rate"
static int vectored = 1;
+static float plist[] = { 1.0, 5.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0,
+ 80.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.95, 99.99 };
+static int plist_len = 17;
+
+/*
+ * Issue an NVMe admin Identify command on fd through the
+ * NVME_IOCTL_ADMIN_CMD ioctl. cns is placed in cdw10 and csi, shifted by
+ * NVME_IDENTIFY_CSI_SHIFT, in cdw11; the command asks for
+ * NVME_IDENTIFY_DATA_SIZE bytes to be transferred into data.
+ * Returns whatever ioctl() returns.
+ */
+static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
+ enum nvme_csi csi, void *data)
+{
+ struct nvme_passthru_cmd cmd = {
+ .opcode = nvme_admin_identify,
+ .nsid = nsid,
+ .addr = (__u64)(uintptr_t)data,
+ .data_len = NVME_IDENTIFY_DATA_SIZE,
+ .cdw10 = cns,
+ .cdw11 = csi << NVME_IDENTIFY_CSI_SHIFT,
+ .timeout_ms = NVME_DEFAULT_IOCTL_TIMEOUT,
+ };
+
+ return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
+}
+
+/*
+ * Fetch the namespace id, LBA data size and namespace size (in LBAs) of the
+ * NVMe namespace open on fd.
+ *
+ * Returns 0 on success and fills *nsid, *lba_sz and *nlba. On failure the
+ * descriptor is closed and a non-zero value is returned: -errno if the
+ * namespace id cannot be fetched, otherwise the nvme_identify() result.
+ */
+static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba)
+{
+	struct nvme_id_ns ns;
+	int namespace_id;
+	int err;
+
+	namespace_id = ioctl(fd, NVME_IOCTL_ID);
+	if (namespace_id < 0) {
+		/* save errno first: fprintf() and close() may overwrite it */
+		err = -errno;
+		fprintf(stderr, "error failed to fetch namespace-id\n");
+		close(fd);
+		return err;
+	}
+
+	/*
+	 * Identify namespace to get namespace-id, namespace size in LBA's
+	 * and LBA data size.
+	 */
+	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
+				NVME_CSI_NVM, &ns);
+	if (err) {
+		fprintf(stderr, "error failed to fetch identify namespace\n");
+		close(fd);
+		return err;
+	}
+
+	*nsid = namespace_id;
+	/* low nibble of flbas selects the LBA format in use */
+	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
+	*nlba = ns.nsze;
+
+	return 0;
+}
+
+/*
+ * Convert a TSC cycle count to nanoseconds using tsc_rate (in Hz). When no
+ * TSC rate is known the cycle count is handed back untouched.
+ */
+static unsigned long cycles_to_nsec(unsigned long cycles)
+{
+	uint64_t scaled;
+
+	if (tsc_rate) {
+		scaled = cycles * 1000000000ULL;
+		return scaled / tsc_rate;
+	}
+
+	return cycles;
+}
+
+/*
+ * Map a latency histogram bucket index back to a representative value,
+ * passed through cycles_to_nsec(). idx must be below PLAT_NR.
+ */
+static unsigned long plat_idx_to_val(unsigned int idx)
+{
+ unsigned int error_bits;
+ unsigned long k, base;
+
+ assert(idx < PLAT_NR);
+
+ /* Indexes below 2 * PLAT_VAL map one-to-one to sample values: no
+ * bits were discarded for them, so the index is the value itself */
+ if (idx < (PLAT_VAL << 1))
+ return cycles_to_nsec(idx);
+
+ /* Find the group and compute the minimum value of that group */
+ error_bits = (idx >> PLAT_BITS) - 1;
+ base = ((unsigned long) 1) << (error_bits + PLAT_BITS);
+
+ /* Find its bucket number of the group */
+ k = idx % PLAT_VAL;
+
+ /* Return the mean of the range of the bucket */
+ return cycles_to_nsec(base + ((k + 0.5) * (1 << error_bits)));
+}
+
+/*
+ * Walk the latency histogram io_u_plat (PLAT_NR buckets, nr samples in
+ * total) and compute one value for every percentile listed in plist[].
+ *
+ * On success *output points to a newly allocated array of plist_len values
+ * that the caller must free(), *minv and *maxv hold the smallest and largest
+ * value stored, and plist_len is returned. If the allocation fails *output
+ * is set to NULL and 0 is returned. Entries that could not be computed
+ * (reported on stderr) are left at 0.
+ */
+unsigned int calculate_clat_percentiles(unsigned long *io_u_plat,
+		unsigned long nr, unsigned long **output,
+		unsigned long *maxv, unsigned long *minv)
+{
+	unsigned long sum = 0;
+	unsigned int len = plist_len, i, j = 0;
+	unsigned long *ovals;
+	bool is_last = false;
+
+	*minv = -1UL;
+	*maxv = 0;
+	/* never leave the caller's pointer indeterminate on the error path */
+	*output = NULL;
+
+	/* zeroed so slots are defined even if the walk below ends early */
+	ovals = calloc(len, sizeof(*ovals));
+	if (!ovals)
+		return 0;
+
+	/*
+	 * Calculate bucket values, note down max and min values
+	 */
+	for (i = 0; i < PLAT_NR && !is_last; i++) {
+		sum += io_u_plat[i];
+		while (sum >= ((long double) plist[j] / 100.0 * nr)) {
+			assert(plist[j] <= 100.0);
+
+			ovals[j] = plat_idx_to_val(i);
+			if (ovals[j] < *minv)
+				*minv = ovals[j];
+			if (ovals[j] > *maxv)
+				*maxv = ovals[j];
+
+			is_last = (j == len - 1) != 0;
+			if (is_last)
+				break;
+
+			j++;
+		}
+	}
+
+	if (!is_last)
+		fprintf(stderr, "error calculating latency percentiles\n");
+
+	*output = ovals;
+	return len;
+}
+
+/*
+ * Print the completion latency percentiles of histogram io_u_plat (nr
+ * samples) to stdout. precision is the number of decimals shown for each
+ * percentile label. Values are printed in TSC ticks when no tsc_rate is
+ * known, otherwise in nsec or, for large values, usec.
+ */
+static void show_clat_percentiles(unsigned long *io_u_plat, unsigned long nr,
+				  unsigned int precision)
+{
+	unsigned int divisor, len, j;
+	unsigned long minv, maxv;
+	/* NULL so the free() at 'out' is safe when the calculation fails */
+	unsigned long *ovals = NULL;
+	int per_line, scale_down, time_width, i;
+	bool is_last;
+	char fmt[32];
+
+	len = calculate_clat_percentiles(io_u_plat, nr, &ovals, &maxv, &minv);
+	if (!len || !ovals)
+		goto out;
+
+	if (!tsc_rate) {
+		scale_down = 0;
+		divisor = 1;
+		printf(" percentiles (tsc ticks):\n |");
+	} else if (minv > 2000 && maxv > 99999) {
+		scale_down = 1;
+		divisor = 1000;
+		printf(" percentiles (usec):\n |");
+	} else {
+		scale_down = 0;
+		divisor = 1;
+		printf(" percentiles (nsec):\n |");
+	}
+
+	/* log10(0) is -inf; keep the minimum width in that case */
+	time_width = 5;
+	if (maxv / divisor)
+		time_width = max(5, (int) (log10(maxv / divisor) + 1));
+	/* values are unsigned long, so the conversion must be %lu */
+	snprintf(fmt, sizeof(fmt), " %%%u.%ufth=[%%%dlu]%%c", precision + 3,
+			precision, time_width);
+	/* fmt will be something like " %5.2fth=[%4lu]%c" */
+	per_line = (80 - 7) / (precision + 10 + time_width);
+
+	for (j = 0; j < len; j++) {
+		/* for formatting */
+		if (j != 0 && (j % per_line) == 0)
+			printf(" |");
+
+		/* end of the list */
+		is_last = (j == len - 1) != 0;
+
+		/* round up to the next unit once per scale step */
+		for (i = 0; i < scale_down; i++)
+			ovals[j] = (ovals[j] + 999) / 1000;
+
+		printf(fmt, plist[j], ovals[j], is_last ? '\n' : ',');
+
+		if (is_last)
+			break;
+
+		if ((j % per_line) == per_line - 1) /* for formatting */
+			printf("\n");
+	}
+
+out:
+	free(ovals);
+}
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+/*
+ * Map a latency sample to its histogram bucket index. Small samples index
+ * the table directly; larger ones keep only their top PLAT_BITS + 1
+ * significant bits. The result is clamped to PLAT_NR - 1.
+ */
+static unsigned int plat_val_to_idx(unsigned long val)
+{
+	unsigned int msb, error_bits, base, offset, idx;
+
+	/*
+	 * Find MSB starting from bit 0. val is an unsigned long, so use the
+	 * builtin for that type: the "ll" variant counts the leading zeros
+	 * of a 64-bit value, which does not match sizeof(val) where long is
+	 * 32 bits wide.
+	 */
+	if (val == 0)
+		msb = 0;
+	else
+		msb = (sizeof(val) * 8) - __builtin_clzl(val) - 1;
+
+	/*
+	 * MSB <= PLAT_BITS, cannot be rounded off. Use
+	 * all bits of the sample as index
+	 */
+	if (msb <= PLAT_BITS)
+		return val;
+
+	/* Compute the number of error bits to discard*/
+	error_bits = msb - PLAT_BITS;
+
+	/* Compute the number of buckets before the group */
+	base = (error_bits + 1) << PLAT_BITS;
+
+	/*
+	 * Discard the error bits and apply the mask to find the
+	 * index for the buckets in the group
+	 */
+	offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+	/* Make sure the index does not exceed (array size - 1) */
+	idx = (base + offset) < (PLAT_NR - 1) ?
+		(base + offset) : (PLAT_NR - 1);
+
+	return idx;
+}
+#endif
+
+/*
+ * Credit nr completions to the latency histogram of submitter s, using the
+ * time elapsed since the timestamp stored in clock_batch[clock_index].
+ * Nothing is recorded once the submitter is finishing or for clock index 0.
+ * Compiles to a no-op without ARCH_HAVE_CPU_CLOCK.
+ */
+static void add_stat(struct submitter *s, int clock_index, int nr)
+{
+#ifdef ARCH_HAVE_CPU_CLOCK
+	unsigned long elapsed;
+
+	if (s->finish || !clock_index)
+		return;
+
+	elapsed = get_cpu_clock();
+	elapsed -= s->clock_batch[clock_index];
+	s->plat[plat_val_to_idx(elapsed)] += nr;
+#endif
+}
+
static int io_uring_register_buffers(struct submitter *s)
{
if (do_nop)
return 0;
return syscall(__NR_io_uring_register, s->ring_fd,
- IORING_REGISTER_BUFFERS, s->iovecs, depth);
+ IORING_REGISTER_BUFFERS, s->iovecs, roundup_pow2(depth));
}
static int io_uring_register_files(struct submitter *s)
+/*
+ * Create an io_uring instance with 'entries' SQ entries and a CQ ring
+ * clamped to the same size. The optional COOP_TASKRUN, SINGLE_ISSUER and
+ * DEFER_TASKRUN flags are requested first and dropped one at a time if the
+ * kernel rejects the setup with EINVAL.
+ *
+ * Returns the ring file descriptor (>= 0) on success, or the failing
+ * syscall() result with errno set.
+ */
static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
- return syscall(__NR_io_uring_setup, entries, p);
+	int ret;
+
+	/*
+	 * Clamp CQ ring size at our SQ ring size, we don't need more entries
+	 * than that.
+	 */
+	p->flags |= IORING_SETUP_CQSIZE;
+	p->cq_entries = entries;
+
+	p->flags |= IORING_SETUP_COOP_TASKRUN;
+	p->flags |= IORING_SETUP_SINGLE_ISSUER;
+	p->flags |= IORING_SETUP_DEFER_TASKRUN;
+retry:
+	ret = syscall(__NR_io_uring_setup, entries, p);
+	/*
+	 * Any non-negative result is the new ring fd. Return it right away:
+	 * errno is not cleared by a successful call, so checking it below
+	 * after a successful retry would act on a stale EINVAL, create
+	 * further rings and leak the one just set up.
+	 */
+	if (ret >= 0)
+		return ret;
+
+	if (errno == EINVAL && p->flags & IORING_SETUP_COOP_TASKRUN) {
+		p->flags &= ~IORING_SETUP_COOP_TASKRUN;
+		goto retry;
+	}
+	if (errno == EINVAL && p->flags & IORING_SETUP_SINGLE_ISSUER) {
+		p->flags &= ~IORING_SETUP_SINGLE_ISSUER;
+		goto retry;
+	}
+	if (errno == EINVAL && p->flags & IORING_SETUP_DEFER_TASKRUN) {
+		p->flags &= ~IORING_SETUP_DEFER_TASKRUN;
+		goto retry;
+	}
+
+	return ret;
}
static void io_uring_probe(int fd)
struct io_uring_probe *p;
int ret;
- p = malloc(sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
+ p = calloc(1, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
if (!p)
return;
- memset(p, 0, sizeof(*p) + 256 * sizeof(struct io_uring_probe_op));
ret = syscall(__NR_io_uring_register, fd, IORING_REGISTER_PROBE, p, 256);
if (ret < 0)
goto out;
+/*
+ * Thin wrapper around the io_uring_enter system call for submitter s.
+ * The call is made on s->enter_ring_fd; when register_ring is set,
+ * IORING_ENTER_REGISTERED_RING is added to flags. With FIO_ARCH_HAS_SYSCALL
+ * the call goes through __do_syscall6(), otherwise through syscall().
+ */
static int io_uring_enter(struct submitter *s, unsigned int to_submit,
unsigned int min_complete, unsigned int flags)
{
- return syscall(__NR_io_uring_enter, s->ring_fd, to_submit, min_complete,
- flags, NULL, 0);
+ if (register_ring)
+ flags |= IORING_ENTER_REGISTERED_RING;
+#ifdef FIO_ARCH_HAS_SYSCALL
+ return __do_syscall6(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
+ min_complete, flags, NULL, 0);
+#else
+ return syscall(__NR_io_uring_enter, s->enter_ring_fd, to_submit,
+ min_complete, flags, NULL, 0);
+#endif
}
-#ifndef CONFIG_HAVE_GETTID
-static int gettid(void)
+/*
+ * Pick the byte offset of the next IO on file f.
+ * With random_io set, a block below f->max_blocks is drawn from the
+ * submitter's random state and scaled by bs. Otherwise f->cur_off is
+ * returned and advanced by bs, wrapping to 0 when another bs-sized IO
+ * would no longer fit below f->max_size.
+ * NOTE(review): the random path divides by f->max_blocks; presumably callers
+ * guarantee it is non-zero - confirm.
+ */
+static unsigned long long get_offset(struct submitter *s, struct file *f)
{
- return syscall(__NR_gettid);
-}
-#endif
+ unsigned long long offset;
+ long r;
-static unsigned file_depth(struct submitter *s)
-{
- return (depth + s->nr_files - 1) / s->nr_files;
+ if (random_io) {
+ unsigned long long block;
+
+ r = __rand64(&s->rand_state);
+ block = r % f->max_blocks;
+ offset = block * (unsigned long long) bs;
+ } else {
+ offset = f->cur_off;
+ f->cur_off += bs;
+ if (f->cur_off + bs > f->max_size)
+ f->cur_off = 0;
+ }
+
+ return offset;
}
-static void init_io(struct submitter *s, unsigned index)
+static struct file *get_next_file(struct submitter *s)
{
- struct io_uring_sqe *sqe = &s->sqes[index];
- unsigned long offset;
struct file *f;
- long r;
-
- if (do_nop) {
- sqe->opcode = IORING_OP_NOP;
- return;
- }
if (s->nr_files == 1) {
f = &s->files[0];
} else {
f = &s->files[s->cur_file];
- if (f->pending_ios >= file_depth(s)) {
+ if (f->pending_ios >= s->per_file_depth) {
s->cur_file++;
if (s->cur_file == s->nr_files)
s->cur_file = 0;
f = &s->files[s->cur_file];
}
}
+
f->pending_ios++;
+ return f;
+}
+
+static void init_io(struct submitter *s, unsigned index)
+{
+ struct io_uring_sqe *sqe = &s->sqes[index];
+ struct file *f;
+
+ if (do_nop) {
+ sqe->opcode = IORING_OP_NOP;
+ return;
+ }
- r = lrand48();
- offset = (r % (f->max_blocks - 1)) * bs;
+ f = get_next_file(s);
if (register_files) {
sqe->flags = IOSQE_FIXED_FILE;
sqe->buf_index = 0;
}
sqe->ioprio = 0;
- sqe->off = offset;
- sqe->user_data = (unsigned long) f;
+ sqe->off = get_offset(s, f);
+ sqe->user_data = (unsigned long) f->fileno;
+ if (stats && stats_running)
+ sqe->user_data |= ((uint64_t)s->clock_index << 32);
+}
+
+/*
+ * Prepare SQE slot 'index' of submitter s as an NVMe passthrough command
+ * (IORING_OP_URING_CMD / NVME_URING_CMD_IO) for the next file.
+ * SQEs are indexed with index << 1 here, i.e. each slot spans two
+ * struct io_uring_sqe. user_data carries the file number in the low 32 bits
+ * and, when stats are enabled, the current clock index in the high 32 bits.
+ * NOTE(review): unlike init_io(), the clock index is not gated on
+ * stats_running - confirm this is intended.
+ */
+static void init_io_pt(struct submitter *s, unsigned index)
+{
+	struct io_uring_sqe *sqe = &s->sqes[index << 1];
+	/* same width as get_offset()'s return type, avoids truncation */
+	unsigned long long offset;
+	struct file *f;
+	struct nvme_uring_cmd *cmd;
+	unsigned long long slba;
+	unsigned long long nlb;
+
+	f = get_next_file(s);
+
+	offset = get_offset(s, f);
+
+	if (register_files) {
+		sqe->fd = f->fixed_fd;
+		sqe->flags = IOSQE_FIXED_FILE;
+	} else {
+		sqe->fd = f->real_fd;
+		sqe->flags = 0;
+	}
+	sqe->opcode = IORING_OP_URING_CMD;
+	sqe->user_data = (unsigned long) f->fileno;
+	if (stats)
+		sqe->user_data |= ((__u64) s->clock_index << 32ULL);
+	sqe->cmd_op = NVME_URING_CMD_IO;
+	/* convert byte offset / block size into LBA units */
+	slba = offset >> f->lba_shift;
+	nlb = (bs >> f->lba_shift) - 1;
+	cmd = (struct nvme_uring_cmd *)&sqe->cmd;
+	/* cdw10 and cdw11 represent starting slba*/
+	cmd->cdw10 = slba & 0xffffffff;
+	cmd->cdw11 = slba >> 32;
+	/* cdw12 represent number of lba to be read*/
+	cmd->cdw12 = nlb;
+	cmd->addr = (unsigned long) s->iovecs[index].iov_base;
+	cmd->data_len = bs;
+	if (fixedbufs) {
+		sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
+		sqe->buf_index = index;
+	}
+	cmd->nsid = f->nsid;
+	cmd->opcode = 2;
}
-static int prep_more_ios(struct submitter *s, int max_ios)
+static int prep_more_ios_uring(struct submitter *s, int max_ios)
{
struct io_sq_ring *ring = &s->sq_ring;
- unsigned index, tail, next_tail, prepped = 0;
+ unsigned head, index, tail, next_tail, prepped = 0;
+
+ if (sq_thread_poll)
+ head = atomic_load_acquire(ring->head);
+ else
+ head = *ring->head;
next_tail = tail = *ring->tail;
do {
next_tail++;
- read_barrier();
- if (next_tail == *ring->head)
+ if (next_tail == head)
break;
index = tail & sq_ring_mask;
- init_io(s, index);
- ring->array[index] = index;
+ if (pt)
+ init_io_pt(s, index);
+ else
+ init_io(s, index);
prepped++;
tail = next_tail;
} while (prepped < max_ios);
- if (*ring->tail != tail) {
- *ring->tail = tail;
- write_barrier();
- }
+ if (prepped)
+ atomic_store_release(ring->tail, tail);
return prepped;
}
if (fstat(f->real_fd, &st) < 0)
return -1;
- if (S_ISBLK(st.st_mode)) {
+ if (pt) {
+ __u64 nlba;
+ __u32 lbs;
+ int ret;
+
+ if (!S_ISCHR(st.st_mode)) {
+ fprintf(stderr, "passthrough works with only nvme-ns "
+ "generic devices (/dev/ngXnY)\n");
+ return -1;
+ }
+ ret = nvme_get_info(f->real_fd, &f->nsid, &lbs, &nlba);
+ if (ret)
+ return -1;
+ if ((bs % lbs) != 0) {
+ printf("error: bs:%d should be a multiple logical_block_size:%d\n",
+ bs, lbs);
+ return -1;
+ }
+ f->max_blocks = nlba;
+ f->max_size = nlba;
+ f->lba_shift = ilog2(lbs);
+ return 0;
+ } else if (S_ISBLK(st.st_mode)) {
unsigned long long bytes;
if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
return -1;
f->max_blocks = bytes / bs;
+ f->max_size = bytes;
return 0;
} else if (S_ISREG(st.st_mode)) {
f->max_blocks = st.st_size / bs;
+ f->max_size = st.st_size;
return 0;
}
return -1;
}
-static int reap_events(struct submitter *s)
+static int reap_events_uring(struct submitter *s)
{
struct io_cq_ring *ring = &s->cq_ring;
struct io_uring_cqe *cqe;
unsigned head, reaped = 0;
+ int last_idx = -1, stat_nr = 0;
head = *ring->head;
do {
struct file *f;
- read_barrier();
- if (head == *ring->tail)
+ if (head == atomic_load_acquire(ring->tail))
break;
cqe = &ring->cqes[head & cq_ring_mask];
if (!do_nop) {
- f = (struct file *) (uintptr_t) cqe->user_data;
+ int fileno = cqe->user_data & 0xffffffff;
+
+ f = &s->files[fileno];
f->pending_ios--;
if (cqe->res != bs) {
- printf("io: unexpected ret=%d\n", cqe->res);
- if (polled && cqe->res == -EOPNOTSUPP)
- printf("Your filesystem/driver/kernel doesn't support polled IO\n");
- return -1;
+ if (cqe->res == -ENODATA || cqe->res == -EIO) {
+ s->io_errors++;
+ } else {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EOPNOTSUPP)
+ printf("Your filesystem/driver/kernel doesn't support polled IO\n");
+ return -1;
+ }
+ }
+ }
+ if (stats) {
+ int clock_index = cqe->user_data >> 32;
+
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
}
+ stat_nr++;
}
reaped++;
head++;
} while (1);
- s->inflight -= reaped;
- *ring->head = head;
- write_barrier();
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
return reaped;
}
-static void *submitter_fn(void *data)
+static int reap_events_uring_pt(struct submitter *s)
{
- struct submitter *s = data;
- struct io_sq_ring *ring = &s->sq_ring;
- int ret, prepped;
-
- printf("submitter=%d\n", gettid());
-
- srand48(pthread_self());
+ struct io_cq_ring *ring = &s->cq_ring;
+ struct io_uring_cqe *cqe;
+ unsigned head, reaped = 0;
+ int last_idx = -1, stat_nr = 0;
+ unsigned index;
+ int fileno;
- prepped = 0;
+ head = *ring->head;
do {
- int to_wait, to_submit, this_reap, to_prep;
+ struct file *f;
- if (!prepped && s->inflight < depth) {
- to_prep = min(depth - s->inflight, batch_submit);
- prepped = prep_more_ios(s, to_prep);
+ if (head == atomic_load_acquire(ring->tail))
+ break;
+ index = head & cq_ring_mask;
+ cqe = &ring->cqes[index << 1];
+ fileno = cqe->user_data & 0xffffffff;
+ f = &s->files[fileno];
+ f->pending_ios--;
+
+ if (cqe->res != 0) {
+ printf("io: unexpected ret=%d\n", cqe->res);
+ if (polled && cqe->res == -EINVAL)
+ printf("passthrough doesn't support polled IO\n");
+ return -1;
}
- s->inflight += prepped;
-submit_more:
- to_submit = prepped;
-submit:
- if (to_submit && (s->inflight + to_submit <= depth))
- to_wait = 0;
- else
- to_wait = min(s->inflight + to_submit, batch_complete);
+ if (stats) {
+ int clock_index = cqe->user_data >> 32;
+
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ head++;
+ } while (1);
- /*
- * Only need to call io_uring_enter if we're not using SQ thread
- * poll, or if IORING_SQ_NEED_WAKEUP is set.
- */
- if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) {
- unsigned flags = 0;
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
- if (to_wait)
- flags = IORING_ENTER_GETEVENTS;
- if ((*ring->flags & IORING_SQ_NEED_WAKEUP))
- flags |= IORING_ENTER_SQ_WAKEUP;
- ret = io_uring_enter(s, to_submit, to_wait, flags);
- s->calls++;
- }
+ if (reaped) {
+ s->inflight -= reaped;
+ atomic_store_release(ring->head, head);
+ }
+ return reaped;
+}
- /*
- * For non SQ thread poll, we already got the events we needed
- * through the io_uring_enter() above. For SQ thread poll, we
- * need to loop here until we find enough events.
- */
- this_reap = 0;
- do {
- int r;
- r = reap_events(s);
- if (r == -1) {
- s->finish = 1;
- break;
- } else if (r > 0)
- this_reap += r;
- } while (sq_thread_poll && this_reap < to_wait);
- s->reaps += this_reap;
+/*
+ * With CONFIG_LIBNUMA, prefer memory from s->numa_node and restrict the
+ * submitter thread (s->tid) to that node's CPUs. Does nothing when
+ * s->numa_node is -1 or libnuma support is not compiled in.
+ */
+static void set_affinity(struct submitter *s)
+{
+#ifdef CONFIG_LIBNUMA
+	struct bitmask *mask;
- if (ret >= 0) {
- if (!ret) {
- to_submit = 0;
- if (s->inflight)
- goto submit;
- continue;
- } else if (ret < to_submit) {
- int diff = to_submit - ret;
+	if (s->numa_node == -1)
+		return;
- s->done += ret;
- prepped -= diff;
- goto submit_more;
- }
- s->done += ret;
- prepped = 0;
- continue;
- } else if (ret < 0) {
- if (errno == EAGAIN) {
- if (s->finish)
- break;
- if (this_reap)
- goto submit;
- to_submit = 0;
- goto submit;
- }
- printf("io_submit: %s\n", strerror(errno));
- break;
- }
- } while (!s->finish);
+	numa_set_preferred(s->numa_node);
- finish = 1;
- return NULL;
+	mask = numa_allocate_cpumask();
+	numa_node_to_cpus(s->numa_node, mask);
+	numa_sched_setaffinity(s->tid, mask);
+	/* the mask is only needed for the call above, don't leak it */
+	numa_free_cpumask(mask);
+#endif
}
-static void sig_int(int sig)
+/*
+ * Look up the NUMA node of the device 'name' in sysfs and store it in
+ * s->numa_node. The nvme-generic class is consulted in passthrough mode,
+ * /sys/block otherwise. Without CONFIG_LIBNUMA the node is set to -1.
+ * Returns 0 on success, -1 if the sysfs attribute cannot be opened or read.
+ */
+static int detect_node(struct submitter *s, char *name)
{
- printf("Exiting on signal %d\n", sig);
- submitter->finish = 1;
- finish = 1;
+#ifdef CONFIG_LIBNUMA
+	const char *base = basename(name);
+	char str[128];
+	int ret, fd, node;
+
+	/* bounded formatting: a long device name must not overrun str */
+	if (pt)
+		ret = snprintf(str, sizeof(str),
+			"/sys/class/nvme-generic/%s/device/numa_node", base);
+	else
+		ret = snprintf(str, sizeof(str),
+			"/sys/block/%s/device/numa_node", base);
+	if (ret < 0 || (size_t) ret >= sizeof(str))
+		return -1;
+	fd = open(str, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	/* leave room for the terminator atoi() relies on */
+	ret = read(fd, str, sizeof(str) - 1);
+	if (ret < 0) {
+		close(fd);
+		return -1;
+	}
+	str[ret] = '\0';
+	node = atoi(str);
+	s->numa_node = node;
+	close(fd);
+#else
+	s->numa_node = -1;
+#endif
+	return 0;
}
-static void arm_sig_int(void)
+/*
+ * Set up the libaio context of submitter s. Options that legacy aio cannot
+ * honour (polled IO, SQPOLL, no-op requests, registered files/buffers) are
+ * reported on stderr and switched off. Returns the io_queue_init() result,
+ * or -1 with errno set to EINVAL when built without CONFIG_LIBAIO.
+ */
+static int setup_aio(struct submitter *s)
{
- struct sigaction act;
-
- memset(&act, 0, sizeof(act));
- act.sa_handler = sig_int;
- act.sa_flags = SA_RESTART;
- sigaction(SIGINT, &act, NULL);
+#ifdef CONFIG_LIBAIO
+	if (polled) {
+		fprintf(stderr, "aio does not support polled IO\n");
+		polled = 0;
+	}
+	if (sq_thread_poll) {
+		fprintf(stderr, "aio does not support SQPOLL IO\n");
+		sq_thread_poll = 0;
+	}
+	if (do_nop) {
+		/* was a copy of the polled IO message */
+		fprintf(stderr, "aio does not support no-op requests\n");
+		do_nop = 0;
+	}
+	if (fixedbufs || register_files) {
+		fprintf(stderr, "aio does not support registered files or buffers\n");
+		fixedbufs = register_files = 0;
+	}
+
+	s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files;
+	return io_queue_init(roundup_pow2(depth), &s->aio_ctx);
+#else
+	fprintf(stderr, "Legacy AIO not available on this system/build\n");
+	errno = EINVAL;
+	return -1;
+#endif
}
static int setup_ring(struct submitter *s)
struct io_sq_ring *sring = &s->sq_ring;
struct io_cq_ring *cring = &s->cq_ring;
struct io_uring_params p;
- int ret, fd;
+ int ret, fd, i;
void *ptr;
+ size_t len;
memset(&p, 0, sizeof(p));
p.sq_thread_cpu = sq_thread_cpu;
}
}
+ if (pt) {
+ p.flags |= IORING_SETUP_SQE128;
+ p.flags |= IORING_SETUP_CQE32;
+ }
fd = io_uring_setup(depth, &p);
if (fd < 0) {
perror("io_uring_setup");
return 1;
}
- s->ring_fd = fd;
+ s->ring_fd = s->enter_ring_fd = fd;
io_uring_probe(fd);
if (fixedbufs) {
+ struct rlimit rlim;
+
+ rlim.rlim_cur = RLIM_INFINITY;
+ rlim.rlim_max = RLIM_INFINITY;
+ /* ignore potential error, not needed on newer kernels */
+ setrlimit(RLIMIT_MEMLOCK, &rlim);
+
ret = io_uring_register_buffers(s);
if (ret < 0) {
perror("io_uring_register_buffers");
ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQ_RING);
- printf("sq_ring ptr = 0x%p\n", ptr);
sring->head = ptr + p.sq_off.head;
sring->tail = ptr + p.sq_off.tail;
sring->ring_mask = ptr + p.sq_off.ring_mask;
sring->array = ptr + p.sq_off.array;
sq_ring_mask = *sring->ring_mask;
- s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
+ if (p.flags & IORING_SETUP_SQE128)
+ len = 2 * p.sq_entries * sizeof(struct io_uring_sqe);
+ else
+ len = p.sq_entries * sizeof(struct io_uring_sqe);
+ s->sqes = mmap(0, len,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_SQES);
- printf("sqes ptr = 0x%p\n", s->sqes);
- ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
+ if (p.flags & IORING_SETUP_CQE32) {
+ len = p.cq_off.cqes +
+ 2 * p.cq_entries * sizeof(struct io_uring_cqe);
+ } else {
+ len = p.cq_off.cqes +
+ p.cq_entries * sizeof(struct io_uring_cqe);
+ }
+ ptr = mmap(0, len,
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
IORING_OFF_CQ_RING);
- printf("cq_ring ptr = 0x%p\n", ptr);
cring->head = ptr + p.cq_off.head;
cring->tail = ptr + p.cq_off.tail;
cring->ring_mask = ptr + p.cq_off.ring_mask;
cring->ring_entries = ptr + p.cq_off.ring_entries;
cring->cqes = ptr + p.cq_off.cqes;
cq_ring_mask = *cring->ring_mask;
+
+ for (i = 0; i < p.sq_entries; i++)
+ sring->array[i] = i;
+
+ s->per_file_depth = INT_MAX;
+ if (s->nr_files)
+ s->per_file_depth = (depth + s->nr_files - 1) / s->nr_files;
return 0;
}
-static void file_depths(char *buf)
+/*
+ * Allocate an IO buffer of 'size' bytes for submitter s. With libnuma and a
+ * known node the memory comes from numa_alloc_onnode(); otherwise it is
+ * page-aligned memory from posix_memalign(). Returns NULL on failure.
+ */
+static void *allocate_mem(struct submitter *s, int size)
{
- struct submitter *s = submitter;
- char *p;
- int i;
+	void *buf;
- buf[0] = '\0';
- p = buf;
- for (i = 0; i < s->nr_files; i++) {
- struct file *f = &s->files[i];
+#ifdef CONFIG_LIBNUMA
+	if (s->numa_node != -1)
+		return numa_alloc_onnode(size, s->numa_node);
+#endif
+
+	/* honour the requested size, as the NUMA path above does */
+	if (posix_memalign(&buf, t_io_uring_page_size, size)) {
+		printf("failed alloc\n");
+		return NULL;
+	}
+
+	return buf;
+}
+
+/*
+ * Per-thread initialisation of submitter s: record the thread id, apply NUMA
+ * affinity, seed the random state, number the files, allocate the IO
+ * buffers, set up the selected engine (preadv2, io_uring or aio) and, when
+ * stats are enabled, the latency bookkeeping.
+ *
+ * Returns the number of clock batch slots (0 when stats are off) on
+ * success, or -1 on failure.
+ */
+static int submitter_init(struct submitter *s)
+{
+	int i, nr_batch, err;
+	static int init_printed;
+	char engine[80];
+
+	s->tid = gettid();
+	printf("submitter=%d, tid=%d, file=%s, nfiles=%d, node=%d\n", s->index, s->tid,
+		s->filename, s->nr_files, s->numa_node);
+
+	set_affinity(s);
+
+	__init_rand64(&s->rand_state, s->tid);
+	srand48(s->tid);
+
+	for (i = 0; i < MAX_FDS; i++)
+		s->files[i].fileno = i;
+
+	for (i = 0; i < roundup_pow2(depth); i++) {
+		void *mem;
+
+		mem = allocate_mem(s, bs);
+		if (!mem)
+			return -1;
+		s->iovecs[i].iov_base = mem;
+		s->iovecs[i].iov_len = bs;
+	}
+
+	if (use_sync) {
+		snprintf(engine, sizeof(engine), "Engine=preadv2\n");
+		err = 0;
+	} else if (!aio) {
+		err = setup_ring(s);
+		if (!err)
+			snprintf(engine, sizeof(engine),
+				"Engine=io_uring, sq_ring=%d, cq_ring=%d\n",
+				*s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+	} else {
+		snprintf(engine, sizeof(engine), "Engine=aio\n");
+		err = setup_aio(s);
+	}
+	if (err) {
+		printf("queue setup failed: %s, %d\n", strerror(errno), err);
+		return -1;
+	}
+
+	if (!init_printed) {
+		printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d, QD=%d\n", polled, fixedbufs, register_files, buffered, depth);
+		printf("%s", engine);
+		init_printed = 1;
+	}
+
+	if (stats) {
+		/* power of two so the clock index can wrap with a mask */
+		nr_batch = roundup_pow2(depth / batch_submit);
+		if (nr_batch < 2)
+			nr_batch = 2;
+		s->clock_batch = calloc(nr_batch, sizeof(unsigned long));
+		s->plat = calloc(PLAT_NR, sizeof(unsigned long));
+		if (!s->clock_batch || !s->plat) {
+			printf("failed alloc\n");
+			free(s->clock_batch);
+			free(s->plat);
+			s->clock_batch = NULL;
+			s->plat = NULL;
+			return -1;
+		}
+		/* index 0 is reserved: add_stat() ignores it */
+		s->clock_index = 1;
+	} else {
+		s->clock_batch = NULL;
+		s->plat = NULL;
+		nr_batch = 0;
+	}
+	/* perform the expensive command initialization part for passthrough here
+	 * rather than in the fast path. Only the io_uring engine maps s->sqes,
+	 * so skip this when preadv2 or aio was selected.
+	 */
+	if (pt && !use_sync && !aio) {
+		for (i = 0; i < roundup_pow2(depth); i++) {
+			struct io_uring_sqe *sqe = &s->sqes[i << 1];
+
+			memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd));
+		}
+	}
+	return nr_batch;
+}
+
+#ifdef CONFIG_LIBAIO
+/*
+ * Fill the first max_ios entries of iocbs with pread requests, one per call
+ * to get_next_file(), using the submitter's buffers in order. Each iocb's
+ * data field carries the file number in the low 32 bits and, while stats
+ * are running, the current clock index in the high 32 bits.
+ * Returns the number of iocbs prepared.
+ */
+static int prep_more_ios_aio(struct submitter *s, int max_ios, struct iocb *iocbs)
+{
+	unsigned idx;
+
+	for (idx = 0; idx < max_ios; idx++) {
+		struct file *f = get_next_file(s);
+		uint64_t tag;
+
+		io_prep_pread(&iocbs[idx], f->real_fd, s->iovecs[idx].iov_base,
+				s->iovecs[idx].iov_len, get_offset(s, f));
+
+		tag = f->fileno;
+		if (stats && stats_running)
+			tag |= (((uint64_t) s->clock_index) << 32);
+		iocbs[idx].data = (void *) (uintptr_t) tag;
+	}
+
+	return idx;
+}
+
+/*
+ * Process evs completed aio events for submitter s.
+ * For each event the owning file (low 32 bits of the event data) gets its
+ * pending count decremented. A result other than bs is counted in
+ * s->io_errors when it is -ENODATA or -EIO; any other unexpected result is
+ * printed and makes the function return -1 immediately, before s->inflight
+ * and s->done are adjusted. Successful events are batched by clock index
+ * (high 32 bits of the event data) and passed to add_stat() when stats are
+ * enabled.
+ * Returns the number of events reaped, or -1 as described above.
+ */
+static int reap_events_aio(struct submitter *s, struct io_event *events, int evs)
+{
+ int last_idx = -1, stat_nr = 0;
+ int reaped = 0;
+
+ while (evs) {
+ uint64_t data = (uintptr_t) events[reaped].data;
+ struct file *f = &s->files[data & 0xffffffff];
+
+ f->pending_ios--;
+ if (events[reaped].res != bs) {
+ if (events[reaped].res == -ENODATA ||
+ events[reaped].res == -EIO) {
+ s->io_errors++;
+ } else {
+ printf("io: unexpected ret=%ld\n", events[reaped].res);
+ return -1;
+ }
+ } else if (stats) {
+ int clock_index = data >> 32;
+
+ /* flush the running count whenever the clock index changes */
+ if (last_idx != clock_index) {
+ if (last_idx != -1) {
+ add_stat(s, last_idx, stat_nr);
+ stat_nr = 0;
+ }
+ last_idx = clock_index;
+ }
+ stat_nr++;
+ }
+ reaped++;
+ evs--;
+ }
+
+ if (stat_nr)
+ add_stat(s, last_idx, stat_nr);
+
+ s->inflight -= reaped;
+ s->done += reaped;
+ return reaped;
+}
+
+/*
+ * Thread body for the legacy aio engine. After submitter_init() it loops
+ * preparing up to batch_submit reads, submitting them with io_submit() and
+ * waiting for completions with io_getevents(), until s->finish is set or a
+ * submission error occurs. Sets the global finish flag on exit and always
+ * returns NULL.
+ */
+static void *submitter_aio_fn(void *data)
+{
+	struct submitter *s = data;
+	int i, ret, prepped;
+	struct iocb **iocbsptr;
+	struct iocb *iocbs;
+	struct io_event *events;
+#ifdef ARCH_HAVE_CPU_CLOCK
+	int nr_batch;
+#endif
+
+	ret = submitter_init(s);
+	if (ret < 0)
+		goto done;
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+	nr_batch = ret;
+#endif
+
+	iocbsptr = calloc(depth, sizeof(struct iocb *));
+	iocbs = calloc(depth, sizeof(struct iocb));
+	events = calloc(depth, sizeof(struct io_event));
+	if (!iocbsptr || !iocbs || !events) {
+		printf("failed alloc\n");
+		goto out_free;
+	}
+
+	for (i = 0; i < depth; i++)
+		iocbsptr[i] = &iocbs[i];
+
+	prepped = 0;
+	do {
+		int to_wait, to_submit, to_prep;
+
+		if (!prepped && s->inflight < depth) {
+			to_prep = min(depth - s->inflight, batch_submit);
+			prepped = prep_more_ios_aio(s, to_prep, iocbs);
+#ifdef ARCH_HAVE_CPU_CLOCK
+			if (prepped && stats) {
+				s->clock_batch[s->clock_index] = get_cpu_clock();
+				s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+			}
+#endif
+		}
+		s->inflight += prepped;
+		to_submit = prepped;
+
+		if (to_submit && (s->inflight + to_submit <= depth))
+			to_wait = 0;
+		else
+			to_wait = min(s->inflight + to_submit, batch_complete);
+
+		ret = io_submit(s->aio_ctx, to_submit, iocbsptr);
+		s->calls++;
+		if (ret < 0) {
+			perror("io_submit");
+			break;
+		} else if (ret != to_submit) {
+			printf("submitted %d, wanted %d\n", ret, to_submit);
+			break;
+		}
+		prepped = 0;
+
+		while (to_wait) {
+			int r;
+
+			s->calls++;
+			r = io_getevents(s->aio_ctx, to_wait, to_wait, events, NULL);
+			if (r < 0) {
+				perror("io_getevents");
+				break;
+			} else if (r != to_wait) {
+				printf("r=%d, wait=%d\n", r, to_wait);
+				break;
+			}
+			r = reap_events_aio(s, events, r);
+			if (r < 0) {
+				/*
+				 * Unexpected IO result: stop like the io_uring
+				 * path does, rather than folding -1 into the
+				 * counters and growing to_wait.
+				 */
+				s->finish = 1;
+				break;
+			}
+			s->reaps += r;
+			to_wait -= r;
+		}
+	} while (!s->finish);
+
+out_free:
+	free(iocbsptr);
+	free(iocbs);
+	free(events);
+done:
+	finish = 1;
+	return NULL;
+}
+#endif
+
+/*
+ * Drop the registered ring fd of submitter s (IORING_UNREGISTER_RING_FDS),
+ * identified by the index stored in s->enter_ring_fd. The result of the
+ * call is ignored.
+ */
+static void io_uring_unregister_ring(struct submitter *s)
+{
+	struct io_uring_rsrc_update update;
+
+	memset(&update, 0, sizeof(update));
+	update.offset = s->enter_ring_fd;
+
+	syscall(__NR_io_uring_register, s->ring_fd,
+		IORING_UNREGISTER_RING_FDS, &update, 1);
+}
+
+/*
+ * Register the ring fd of submitter s with the kernel
+ * (IORING_REGISTER_RING_FDS). On success the index handed back in the
+ * update's offset field becomes s->enter_ring_fd and 0 is returned. On
+ * failure the global register_ring option is cleared and -1 is returned.
+ */
+static int io_uring_register_ring(struct submitter *s)
+{
+	struct io_uring_rsrc_update update = {
+		.data = s->ring_fd,
+		.offset = -1U,
+	};
+
+	if (syscall(__NR_io_uring_register, s->ring_fd,
+		    IORING_REGISTER_RING_FDS, &update, 1) != 1) {
+		register_ring = 0;
+		return -1;
+	}
+
+	s->enter_ring_fd = update.offset;
+	return 0;
+}
+
+/*
+ * Thread body for the io_uring engine.
+ * After submitter_init() and optional ring fd registration it loops:
+ * prepare up to batch_submit SQEs, call io_uring_enter() (skipped in SQ
+ * thread poll mode unless the kernel thread asks for a wakeup), reap
+ * completions, then decide from the submit result whether to resubmit the
+ * remainder, wait for more completions or prepare a new batch. The loop
+ * ends when s->finish is set, a reap reports -1, or io_uring_enter() fails
+ * with anything other than EAGAIN. Sets the global finish flag on exit and
+ * always returns NULL.
+ */
+static void *submitter_uring_fn(void *data)
+{
+ struct submitter *s = data;
+ struct io_sq_ring *ring = &s->sq_ring;
+ int ret, prepped;
+#ifdef ARCH_HAVE_CPU_CLOCK
+ int nr_batch;
+#endif
+
+ ret = submitter_init(s);
+ if (ret < 0)
+ goto done;
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+ nr_batch = ret;
+#endif
+
+ if (register_ring)
+ io_uring_register_ring(s);
+
+ prepped = 0;
+ do {
+ int to_wait, to_submit, this_reap, to_prep;
+ unsigned ring_flags = 0;
+
+ if (!prepped && s->inflight < depth) {
+ to_prep = min(depth - s->inflight, batch_submit);
+ prepped = prep_more_ios_uring(s, to_prep);
+#ifdef ARCH_HAVE_CPU_CLOCK
+ /* timestamp the batch; nr_batch - 1 is used as a wrap mask */
+ if (prepped && stats) {
+ s->clock_batch[s->clock_index] = get_cpu_clock();
+ s->clock_index = (s->clock_index + 1) & (nr_batch - 1);
+ }
+#endif
+ }
+ s->inflight += prepped;
+submit_more:
+ to_submit = prepped;
+submit:
+ if (to_submit && (s->inflight + to_submit <= depth))
+ to_wait = 0;
+ else
+ to_wait = min(s->inflight + to_submit, batch_complete);
+
+ /*
+ * Only need to call io_uring_enter if we're not using SQ thread
+ * poll, or if IORING_SQ_NEED_WAKEUP is set.
+ */
+ if (sq_thread_poll)
+ ring_flags = atomic_load_acquire(ring->flags);
+ if (!sq_thread_poll || ring_flags & IORING_SQ_NEED_WAKEUP) {
+ unsigned flags = 0;
+
+ if (to_wait)
+ flags = IORING_ENTER_GETEVENTS;
+ if (ring_flags & IORING_SQ_NEED_WAKEUP)
+ flags |= IORING_ENTER_SQ_WAKEUP;
+ ret = io_uring_enter(s, to_submit, to_wait, flags);
+ s->calls++;
+ } else {
+ /* for SQPOLL, we submitted it all effectively */
+ ret = to_submit;
+ }
+
+ /*
+ * For non SQ thread poll, we already got the events we needed
+ * through the io_uring_enter() above. For SQ thread poll, we
+ * need to loop here until we find enough events.
+ */
+ this_reap = 0;
+ do {
+ int r;
+
+ if (pt)
+ r = reap_events_uring_pt(s);
+ else
+ r = reap_events_uring(s);
+ if (r == -1) {
+ s->finish = 1;
+ break;
+ } else if (r > 0)
+ this_reap += r;
+ } while (sq_thread_poll && this_reap < to_wait);
+ s->reaps += this_reap;
+
+ if (ret >= 0) {
+ if (!ret) {
+ /* nothing submitted: keep waiting while IO is in flight */
+ to_submit = 0;
+ if (s->inflight)
+ goto submit;
+ continue;
+ } else if (ret < to_submit) {
+ /* partial submit: account for it and retry the remainder */
+ int diff = to_submit - ret;
+
+ s->done += ret;
+ prepped -= diff;
+ goto submit_more;
+ }
+ s->done += ret;
+ prepped = 0;
+ continue;
+ } else if (ret < 0) {
+ if (errno == EAGAIN) {
+ if (s->finish)
+ break;
+ if (this_reap)
+ goto submit;
+ to_submit = 0;
+ goto submit;
+ }
+ printf("io_submit: %s\n", strerror(errno));
+ break;
+ }
+ } while (!s->finish);
+
+ if (register_ring)
+ io_uring_unregister_ring(s);
+
+done:
+ finish = 1;
+ return NULL;
+}
+
+#ifdef CONFIG_PWRITEV2
+static void *submitter_sync_fn(void *data)
+{
+ struct submitter *s = data;
+ int ret;
+
+ if (submitter_init(s) < 0)
+ goto done;
+
+ do {
+ uint64_t offset;
+ struct file *f;
+
+ f = get_next_file(s);
+
+#ifdef ARCH_HAVE_CPU_CLOCK
+ if (stats)
+ s->clock_batch[s->clock_index] = get_cpu_clock();
+#endif
+
+ s->inflight++;
+ s->calls++;
- if (i + 1 == s->nr_files)
- p += sprintf(p, "%d", f->pending_ios);
+ offset = get_offset(s, f);
+ if (polled)
+ ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, RWF_HIPRI);
else
- p += sprintf(p, "%d, ", f->pending_ios);
+ ret = preadv2(f->real_fd, &s->iovecs[0], 1, offset, 0);
+
+ if (ret < 0) {
+ perror("preadv2");
+ break;
+ } else if (ret != bs) {
+ break;
+ }
+
+ s->done++;
+ s->inflight--;
+ f->pending_ios--;
+ if (stats)
+ add_stat(s, s->clock_index, 1);
+ } while (!s->finish);
+
+done:
+ finish = 1;
+ return NULL;
+}
+#else
+static void *submitter_sync_fn(void *data)
+{
+ finish = 1;
+ return NULL;
+}
+#endif
+
+/*
+ * Return the submitter at position 'offset' in the global submitter array.
+ * Each element is a struct submitter followed by its iovec array.
+ * submitter_init() fills roundup_pow2(depth) iovecs per submitter, so the
+ * stride must cover that many entries; a stride based on plain depth lets
+ * one submitter's trailing iovecs overlap the next submitter when depth is
+ * not a power of two.
+ * NOTE(review): assumes the array is allocated with the same per-element
+ * size - confirm against the allocation in main().
+ */
+static struct submitter *get_submitter(int offset)
+{
+	size_t stride;
+
+	stride = sizeof(*submitter) + roundup_pow2(depth) * sizeof(struct iovec);
+
+	/* byte arithmetic on char *, void * arithmetic is a GNU extension */
+	return (struct submitter *) ((char *) submitter + offset * stride);
+}
+
+static void do_finish(const char *reason)
+{
+ int j;
+
+ printf("Exiting on %s\n", reason);
+ for (j = 0; j < nthreads; j++) {
+ struct submitter *s = get_submitter(j);
+ s->finish = 1;
}
+ if (max_iops > 1000000) {
+ double miops = (double) max_iops / 1000000.0;
+ printf("Maximum IOPS=%.2fM\n", miops);
+ } else if (max_iops > 100000) {
+ double kiops = (double) max_iops / 1000.0;
+ printf("Maximum IOPS=%.2fK\n", kiops);
+ } else {
+ printf("Maximum IOPS=%lu\n", max_iops);
+ }
+ finish = 1;
}
-static void usage(char *argv)
+static void sig_int(int sig)
{
+ do_finish("signal");
+}
+
+/*
+ * Install sig_int() as the SIGINT handler with SA_RESTART; on WIN32 builds
+ * the same handler is installed for SIGBREAK as well.
+ */
+static void arm_sig_int(void)
+{
+	struct sigaction act = {
+		.sa_handler = sig_int,
+		.sa_flags = SA_RESTART,
+	};
+
+	sigaction(SIGINT, &act, NULL);
+
+	/* Windows uses SIGBREAK as a quit signal from other applications */
+#ifdef WIN32
+	sigaction(SIGBREAK, &act, NULL);
+#endif
+}
+
+static void usage(char *argv, int status)
+{
+ char runtime_str[16];
+ snprintf(runtime_str, sizeof(runtime_str), "%d", runtime);
printf("%s [options] -- [filenames]\n"
- " -d <int> : IO Depth, default %d\n"
- " -s <int> : Batch submit, default %d\n"
- " -c <int> : Batch complete, default %d\n"
- " -b <int> : Block size, default %d\n"
- " -p <bool> : Polled IO, default %d\n",
- argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled);
- exit(0);
+ " -d <int> : IO Depth, default %d\n"
+ " -s <int> : Batch submit, default %d\n"
+ " -c <int> : Batch complete, default %d\n"
+ " -b <int> : Block size, default %d\n"
+ " -p <bool> : Polled IO, default %d\n"
+ " -B <bool> : Fixed buffers, default %d\n"
+ " -F <bool> : Register files, default %d\n"
+ " -n <int> : Number of threads, default %d\n"
+ " -O <bool> : Use O_DIRECT, default %d\n"
+ " -N <bool> : Perform just no-op requests, default %d\n"
+ " -t <bool> : Track IO latencies, default %d\n"
+ " -T <int> : TSC rate in HZ\n"
+ " -r <int> : Runtime in seconds, default %s\n"
+ " -R <bool> : Use random IO, default %d\n"
+ " -a <bool> : Use legacy aio, default %d\n"
+ " -S <bool> : Use sync IO (preadv2), default %d\n"
+ " -X <bool> : Use registered ring %d\n"
+ " -P <bool> : Automatically place on device home node %d\n"
+ " -u <bool> : Use nvme-passthrough I/O, default %d\n",
+ argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
+ fixedbufs, register_files, nthreads, !buffered, do_nop,
+ stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
+ use_sync, register_ring, numa_placement, pt);
+ exit(status);
+}
+
+/*
+ * Load a previously saved TSC rate from TSC_RATE_FILE into tsc_rate, unless
+ * a rate has already been set. A missing, unreadable or empty file leaves
+ * tsc_rate untouched.
+ */
+static void read_tsc_rate(void)
+{
+	char buffer[32];
+	int fd, ret;
+
+	if (tsc_rate)
+		return;
+
+	fd = open(TSC_RATE_FILE, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	/* keep one byte for the terminator strtoul() needs */
+	ret = read(fd, buffer, sizeof(buffer) - 1);
+	close(fd);
+	if (ret <= 0)
+		return;
+	buffer[ret] = '\0';
+
+	tsc_rate = strtoul(buffer, NULL, 10);
+	printf("Using TSC rate %luHz\n", tsc_rate);
+}
+
+/*
+ * Save the current tsc_rate as decimal text in TSC_RATE_FILE. An existing
+ * file is left alone; a failed write is reported through perror().
+ */
+static void write_tsc_rate(void)
+{
+	char text[32];
+	struct stat sb;
+	int fd, len;
+
+	/* already saved earlier */
+	if (stat(TSC_RATE_FILE, &sb) == 0)
+		return;
+
+	fd = open(TSC_RATE_FILE, O_WRONLY | O_CREAT, 0644);
+	if (fd < 0)
+		return;
+
+	len = snprintf(text, sizeof(text), "%lu", tsc_rate);
+	if (write(fd, text, len) < 0)
+		perror("write");
+	close(fd);
}
int main(int argc, char *argv[])
{
struct submitter *s;
- unsigned long done, calls, reap;
- int err, i, flags, fd, opt;
- char *fdepths;
+ unsigned long done, calls, reap, io_errors;
+ int i, j, flags, fd, opt, threads_per_f, threads_rem = 0, nfiles;
+ struct file f;
void *ret;
- if (!do_nop && argc < 2) {
- printf("%s: filename [options]\n", argv[0]);
- return 1;
- }
+ if (!do_nop && argc < 2)
+ usage(argv[0], 1);
- while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:h?")) != -1) {
+ while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:u:h?")) != -1) {
switch (opt) {
+ case 'a':
+ aio = !!atoi(optarg);
+ break;
case 'd':
depth = atoi(optarg);
break;
case 's':
batch_submit = atoi(optarg);
+ if (!batch_submit)
+ batch_submit = 1;
break;
case 'c':
batch_complete = atoi(optarg);
+ if (!batch_complete)
+ batch_complete = 1;
break;
case 'b':
bs = atoi(optarg);
case 'F':
register_files = !!atoi(optarg);
break;
+ case 'n':
+ nthreads = atoi(optarg);
+ if (!nthreads) {
+ printf("Threads must be non-zero\n");
+ usage(argv[0], 1);
+ }
+ break;
+ case 'N':
+ do_nop = !!atoi(optarg);
+ break;
+ case 'O':
+ buffered = !atoi(optarg);
+ break;
+ case 't':
+#ifndef ARCH_HAVE_CPU_CLOCK
+ fprintf(stderr, "Stats not supported on this CPU\n");
+ return 1;
+#endif
+ stats = !!atoi(optarg);
+ break;
+ case 'T':
+#ifndef ARCH_HAVE_CPU_CLOCK
+ fprintf(stderr, "Stats not supported on this CPU\n");
+ return 1;
+#endif
+ tsc_rate = strtoul(optarg, NULL, 10);
+ write_tsc_rate();
+ break;
+ case 'r':
+ runtime = atoi(optarg);
+ break;
+ case 'R':
+ random_io = !!atoi(optarg);
+ break;
+ case 'X':
+ register_ring = !!atoi(optarg);
+ break;
+ case 'S':
+#ifdef CONFIG_PWRITEV2
+ use_sync = !!atoi(optarg);
+#else
+ fprintf(stderr, "preadv2 not supported\n");
+ exit(1);
+#endif
+ break;
+ case 'P':
+ numa_placement = !!atoi(optarg);
+ break;
+ case 'u':
+ pt = !!atoi(optarg);
+ break;
case 'h':
case '?':
default:
- usage(argv[0]);
+ usage(argv[0], 0);
break;
}
}
- submitter = malloc(sizeof(*submitter) + depth * sizeof(struct iovec));
- memset(submitter, 0, sizeof(*submitter) + depth * sizeof(struct iovec));
- s = submitter;
+ if (stats)
+ read_tsc_rate();
+
+ if (batch_complete > depth)
+ batch_complete = depth;
+ if (batch_submit > depth)
+ batch_submit = depth;
+
+ submitter = calloc(nthreads, sizeof(*submitter) +
+ roundup_pow2(depth) * sizeof(struct iovec));
+ for (j = 0; j < nthreads; j++) {
+ s = get_submitter(j);
+ s->numa_node = -1;
+ s->index = j;
+ s->done = s->calls = s->reaps = s->io_errors = 0;
+ }
flags = O_RDONLY | O_NOATIME;
if (!buffered)
flags |= O_DIRECT;
+ j = 0;
i = optind;
+ nfiles = argc - i;
+ if (!do_nop) {
+ if (!nfiles) {
+ printf("No files specified\n");
+ usage(argv[0], 1);
+ }
+ threads_per_f = nthreads / nfiles;
+ /* make sure each thread gets assigned files */
+ if (threads_per_f == 0) {
+ threads_per_f = 1;
+ } else {
+ threads_rem = nthreads - threads_per_f * nfiles;
+ }
+ }
while (!do_nop && i < argc) {
- struct file *f;
+ int k, limit;
+
+ memset(&f, 0, sizeof(f));
- if (s->nr_files == MAX_FDS) {
- printf("Max number of files (%d) reached\n", MAX_FDS);
- break;
- }
fd = open(argv[i], flags);
if (fd < 0) {
perror("open");
return 1;
}
-
- f = &s->files[s->nr_files];
- f->real_fd = fd;
- if (get_file_size(f)) {
+ f.real_fd = fd;
+ if (get_file_size(&f)) {
printf("failed getting size of device/file\n");
return 1;
}
- if (f->max_blocks <= 1) {
+ if (f.max_blocks <= 1) {
printf("Zero file/device size?\n");
return 1;
}
- f->max_blocks--;
-
- printf("Added file %s\n", argv[i]);
- s->nr_files++;
- i++;
- }
+ f.max_blocks--;
- if (fixedbufs) {
- struct rlimit rlim;
+ limit = threads_per_f;
+ limit += threads_rem > 0 ? 1 : 0;
+ for (k = 0; k < limit; k++) {
+ s = get_submitter((j + k) % nthreads);
- rlim.rlim_cur = RLIM_INFINITY;
- rlim.rlim_max = RLIM_INFINITY;
- if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) {
- perror("setrlimit");
- return 1;
- }
- }
+ if (s->nr_files == MAX_FDS) {
+ printf("Max number of files (%d) reached\n", MAX_FDS);
+ break;
+ }
- arm_sig_int();
+ memcpy(&s->files[s->nr_files], &f, sizeof(f));
- for (i = 0; i < depth; i++) {
- void *buf;
+ if (numa_placement)
+ detect_node(s, argv[i]);
- if (posix_memalign(&buf, bs, bs)) {
- printf("failed alloc\n");
- return 1;
+ s->filename = argv[i];
+ s->nr_files++;
}
- s->iovecs[i].iov_base = buf;
- s->iovecs[i].iov_len = bs;
+ threads_rem--;
+ i++;
+ j += limit;
}
- err = setup_ring(s);
- if (err) {
- printf("ring setup failed: %s, %d\n", strerror(errno), err);
- return 1;
- }
- printf("polled=%d, fixedbufs=%d, register_files=%d, buffered=%d", polled, fixedbufs, register_files, buffered);
- printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", depth, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries);
+ arm_sig_int();
- pthread_create(&s->thread, NULL, submitter_fn, s);
+ t_io_uring_page_size = sysconf(_SC_PAGESIZE);
+ if (t_io_uring_page_size < 0)
+ t_io_uring_page_size = 4096;
+
+ for (j = 0; j < nthreads; j++) {
+ s = get_submitter(j);
+ if (use_sync)
+ pthread_create(&s->thread, NULL, submitter_sync_fn, s);
+ else if (!aio)
+ pthread_create(&s->thread, NULL, submitter_uring_fn, s);
+#ifdef CONFIG_LIBAIO
+ else
+ pthread_create(&s->thread, NULL, submitter_aio_fn, s);
+#endif
+ }
- fdepths = malloc(8 * s->nr_files);
- reap = calls = done = 0;
+ reap = calls = done = io_errors = 0;
do {
unsigned long this_done = 0;
unsigned long this_reap = 0;
unsigned long this_call = 0;
+ unsigned long this_io_errors = 0;
unsigned long rpc = 0, ipc = 0;
+ unsigned long iops, bw;
sleep(1);
- this_done += s->done;
- this_call += s->calls;
- this_reap += s->reaps;
+ if (runtime && !--runtime)
+ do_finish("timeout");
+
+ /* don't print partial run, if interrupted by signal */
+ if (finish)
+ break;
+
+ /* one second into the run, enable stats */
+ if (stats)
+ stats_running = 1;
+
+ for (j = 0; j < nthreads; j++) {
+ s = get_submitter(j);
+ this_done += s->done;
+ this_call += s->calls;
+ this_reap += s->reaps;
+ this_io_errors += s->io_errors;
+ }
if (this_call - calls) {
rpc = (this_done - done) / (this_call - calls);
ipc = (this_reap - reap) / (this_call - calls);
} else
rpc = ipc = -1;
- file_depths(fdepths);
- printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n",
- this_done - done, rpc, ipc, s->inflight,
- fdepths);
+ iops = this_done - done;
+ iops -= this_io_errors - io_errors;
+ if (bs > 1048576)
+ bw = iops * (bs / 1048576);
+ else
+ bw = iops / (1048576 / bs);
+ if (iops > 1000000) {
+ double miops = (double) iops / 1000000.0;
+ printf("IOPS=%.2fM, ", miops);
+ } else if (iops > 100000) {
+ double kiops = (double) iops / 1000.0;
+ printf("IOPS=%.2fK, ", kiops);
+ } else {
+ printf("IOPS=%lu, ", iops);
+ }
+ max_iops = max(max_iops, iops);
+ if (!do_nop) {
+ if (bw > 2000) {
+ double bw_g = (double) bw / 1000.0;
+
+ printf("BW=%.2fGiB/s, ", bw_g);
+ } else {
+ printf("BW=%luMiB/s, ", bw);
+ }
+ }
+ printf("IOS/call=%ld/%ld\n", rpc, ipc);
done = this_done;
calls = this_call;
reap = this_reap;
+ io_errors = this_io_errors;
} while (!finish);
- pthread_join(s->thread, &ret);
- close(s->ring_fd);
- free(fdepths);
+ for (j = 0; j < nthreads; j++) {
+ s = get_submitter(j);
+ pthread_join(s->thread, &ret);
+ close(s->ring_fd);
+
+ if (s->io_errors)
+ printf("%d: %lu IO errors\n", s->tid, s->io_errors);
+
+ if (stats) {
+ unsigned long nr;
+
+ printf("%d: Latency percentiles:\n", s->tid);
+ for (i = 0, nr = 0; i < PLAT_NR; i++)
+ nr += s->plat[i];
+ show_clat_percentiles(s->plat, nr, 4);
+ free(s->clock_batch);
+ free(s->plat);
+ }
+ }
+
+ free(submitter);
return 0;
}
-# Expected result: fio reads 87040KB of data
+# Expected result: fio reads 87040KB of data:
+# the first read is at offset 0, the 2nd read is at offset 1.5m, the 3rd
+# read is at offset 3m, and so on; after the last read at offset 127m we
+# have only read 87,040K of data.
# Buggy result: fio reads the full 128MB of data
[foo]
size=128mb
thread
log_avg_msec=1000
write_iops_log=t0012.fio
+time_based
[flow1]
flow=1
thread
log_avg_msec=1000
write_iops_log=t0014.fio
+time_based
[flow1]
flow=1
--- /dev/null
+# Expected result: mean(slat) + mean(clat) = mean(lat)
+# Buggy result: equality does not hold
+
+[test]
+ioengine=libaio
+size=1M
+iodepth=16
--- /dev/null
+# Expected result: mean(slat) + mean(clat) = mean(lat)
+# Buggy result: equality does not hold
+
+[test]
+ioengine=null
+size=1M
+iodepth=16
--- /dev/null
+# Expected result: mean(slat) + mean(clat) = mean(lat)
+# Buggy result: equality does not hold
+# This is similar to t0015 and t0016 except that it uses posixaio which is
+# available on more platforms and does not have a commit hook
+
+[test]
+ioengine=posixaio
+size=1M
+iodepth=16
--- /dev/null
+# Expected result: job completes without error
+# Buggy result: job fails
+
+[test]
+ioengine=io_uring
+filesize=256K
+time_based
+runtime=3s
+rw=randrw
--- /dev/null
+# Expected result: offsets are accessed sequentially and all offsets are read
+# Buggy result: offsets are not accessed sequentially and one or more offsets are missed
+# run with --debug=io or logging to see which offsets are accessed
+
+[test]
+ioengine=null
+filesize=1M
+write_bw_log=test
+per_job_logs=0
+log_offset=1
--- /dev/null
+# Expected result: offsets are not accessed sequentially and all offsets are touched
+# Buggy result: offsets are accessed sequentially and one or more offsets are missed
+# run with --debug=io or logging to see which offsets are read
+
+[test]
+ioengine=null
+filesize=1M
+rw=randread
+write_bw_log=test
+per_job_logs=0
+log_offset=1
--- /dev/null
+# make sure the lfsr random generator actually does touch all the offsets
+#
+# Expected result: offsets are not accessed sequentially and all offsets are touched
+# Buggy result: offsets are accessed sequentially and one or more offsets are missed
+# run with --debug=io or logging to see which offsets are read
+
+[test]
+ioengine=null
+filesize=1M
+rw=randread
+write_bw_log=test
+per_job_logs=0
+log_offset=1
+norandommap=1
+random_generator=lfsr
--- /dev/null
+# make sure that when we enable norandommap we touch some offsets more than once
+#
+# Expected result: at least one offset is touched more than once
+# Buggy result: each offset is touched only once
+
+[test]
+ioengine=null
+filesize=1M
+rw=randread
+write_bw_log=test
+per_job_logs=0
+log_offset=1
+norandommap=1
--- /dev/null
+# randtrimwrite data direction tests
+[global]
+filesize=1M
+ioengine=null
+rw=randtrimwrite
+log_offset=1
+per_job_logs=0
+randrepeat=0
+write_bw_log
+
+# Expected result: trim issued to random offset followed by write to same offset
+# all offsets touched
+# block sizes match
+# Buggy result: something else
+[basic]
+
+# Expected result: trim issued to random offset followed by write to same offset
+# all offsets trimmed
+# block sizes 8k for both write and trim
+# Buggy result: something else
+[bs]
+bs=8k,8k,8k
+
+# Expected result: trim issued to random offset followed by write to same offset
+# all offsets trimmed
+# block sizes match
+# Buggy result: something else
+[bsrange]
+bsrange=512-4k
+
+# Expected result: trim issued to random offset followed by write to same offset
+# all offsets trimmed
+# block sizes match
+# Buggy result: something else
+[bssplit]
+bssplit=512/25:1k/:2k/:4k/
+
+# Expected result: trim issued to random offset followed by write to same offset
+# block sizes match
+# Buggy result: something else
+[basic_no_rm]
+norandommap=1
+
+# Expected result: trim issued to random offset followed by write to same offset
+# block sizes 8k for both write and trim
+# Buggy result: something else
+[bs_no_rm]
+bs=4k,4k,8k
+norandommap=1
+
+# Expected result: trim issued to random offset followed by write to same offset
+# block sizes match
+# Buggy result: something else
+[bsrange_no_rm]
+bsrange=512-4k
+norandommap=1
+
+# Expected result: trim issued to random offset followed by write to same offset
+# block sizes match
+# Buggy result: something else
+[bssplit_no_rm]
+bssplit=512/25:1k/:2k/:4k/
+norandommap=1
--- /dev/null
+# trimwrite data direction tests
+[global]
+filesize=1M
+ioengine=null
+rw=trimwrite
+log_offset=1
+per_job_logs=0
+randrepeat=0
+write_bw_log
+
+# Expected result: trim issued to sequential offsets followed by write to same offset
+# all offsets touched
+# block sizes match
+# Buggy result: something else
+[basic]
+
+# Expected result: trim issued to sequential offsets followed by write to same offset
+# all offsets trimmed
+# block sizes 8k for both write and trim
+# Buggy result: something else
+[bs]
+bs=8k,8k,8k
+
+# Expected result: trim issued to sequential offsets followed by write to same offset
+# all offsets trimmed
+# block sizes match
+# Buggy result: something else
+[bsrange]
+bsrange=512-4k
+
+# Expected result: trim issued to sequential offsets followed by write to same offset
+# all offsets trimmed
+# block sizes match
+# Buggy result: something else
+[bssplit]
+bssplit=512/25:1k/:2k/:4k/
--- /dev/null
+[job]
+filename=t0025file
+size=128k
+readwrite=write
+do_verify=1
+verify=md5
+experimental_verify=1
--- /dev/null
+[job1]
+filename=t0026file
+size=1M
+readwrite=randwrite
+loops=8
+do_verify=1
+verify=md5
+experimental_verify=1
+
+[job2]
+stonewall=1
+filename=t0026file
+size=1M
+readwrite=randrw
+time_based
+runtime=5
+do_verify=1
+verify=md5
+experimental_verify=1
--- /dev/null
+[global]
+filename=t0027file
+size=16k
+bs=16k
+
+[write_job]
+readwrite=write
+buffer_pattern='t0027.pattern'
+
+[read_job]
+stonewall=1
+readwrite=read
+verify=pattern
+verify_pattern='t0027.pattern'
--- /dev/null
+[test]
+size=16k
+readwrite=write
+buffer_pattern="abcd"-120xdeadface
+ioengine=null
--- /dev/null
+[global]
+filename=t0029file
+size=4k
+verify=md5
+
+[write]
+rw=write
+do_verify=0
+
+[read]
+stonewall=1
+rw=read
+loops=2
+do_verify=1
--- /dev/null
+# run with --bandwidth-log
+# broken behavior: seg fault
+# successful behavior: test runs to completion with 0 as the exit code
+
+[test]
+ioengine=null
+filesize=1T
+rw=read
+time_based
+runtime=2s
--- /dev/null
+[job]
+rw=write
+ioengine=libaio
+size=1mb
+time_based=1
+runtime=1
+filename=t0030file
+write_iolog=iolog
--- /dev/null
+[job]
+rw=read
+ioengine=libaio
+iodepth=128
+filename=t0030file
+read_iolog=iolog
+write_lat_log=lat_log
--- /dev/null
+# Expected results: max offset is ~1280K
+# Buggy result: max offset is ~640K
+#
+
+[global]
+ioengine=null
+size=1280K
+io_size=2560k
+bs=128K
+
+[test1]
+rw=rw
import argparse
import platform
import subprocess
+from collections import Counter
from pathlib import Path
"--output-format={output-format}".format(**self.test_options),
]
for opt in ['slat_percentiles', 'clat_percentiles', 'lat_percentiles',
- 'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs', 'cmdprio_percentage']:
+ 'unified_rw_reporting', 'fsync', 'fdatasync', 'numjobs',
+ 'cmdprio_percentage', 'bssplit', 'cmdprio_bssplit']:
if opt in self.test_options:
option = '--{0}={{{0}}}'.format(opt)
fio_args.append(option.format(**self.test_options))
file_data = file.read()
#
- # Read the first few lines and see if any of them begin with '3;fio-'
+ # Read the first few lines and see if any of them begin with '3;'
# If so, the line is probably terse output. Obviously, this only
# works for fio terse version 3 and it does not work for
# multi-line terse output
lines = file_data.splitlines()
for i in range(8):
file_data = lines[i]
- if file_data.startswith('3;fio-'):
+ if file_data.startswith('3;'):
self.terse_data = file_data.split(';')
return True
#
# Check only for the presence/absence of json+
# latency bins. Future work can check the
- # accurracy of the bin values and counts.
+ # accuracy of the bin values and counts.
#
# Because the latency percentiles are based on
# the bins, we can be confident that the bin
def check_nocmdprio_lat(self, job):
"""
- Make sure no high/low priority latencies appear.
+ Make sure no per priority latencies appear.
job JSON object to check
"""
for ddir in ['read', 'write', 'trim']:
if ddir in job:
- if 'lat_high_prio' in job[ddir] or 'lat_low_prio' in job[ddir] or \
- 'clat_high_prio' in job[ddir] or 'clat_low_prio' in job[ddir]:
- print("Unexpected high/low priority latencies found in %s output" % ddir)
+ if 'prios' in job[ddir]:
+ print("Unexpected per priority latencies found in %s output" % ddir)
return False
if self.debug:
- print("No high/low priority latencies found")
+ print("No per priority latencies found")
return True
return retval
def check_prio_latencies(self, jsondata, clat=True, plus=False):
- """Check consistency of high/low priority latencies.
+ """Check consistency of per priority latencies.
clat True if we should check clat data; other check lat data
plus True if we have json+ format data where additional checks can
"""
if clat:
- high = 'clat_high_prio'
- low = 'clat_low_prio'
- combined = 'clat_ns'
+ obj = combined = 'clat_ns'
else:
- high = 'lat_high_prio'
- low = 'lat_low_prio'
- combined = 'lat_ns'
+ obj = combined = 'lat_ns'
- if not high in jsondata or not low in jsondata or not combined in jsondata:
- print("Error identifying high/low priority latencies")
+ if not 'prios' in jsondata or not combined in jsondata:
+ print("Error identifying per priority latencies")
return False
- if jsondata[high]['N'] + jsondata[low]['N'] != jsondata[combined]['N']:
- print("High %d + low %d != combined sample size %d" % \
- (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+ sum_sample_size = sum([x[obj]['N'] for x in jsondata['prios']])
+ if sum_sample_size != jsondata[combined]['N']:
+ print("Per prio sample size sum %d != combined sample size %d" %
+ (sum_sample_size, jsondata[combined]['N']))
return False
elif self.debug:
- print("High %d + low %d == combined sample size %d" % \
- (jsondata[high]['N'], jsondata[low]['N'], jsondata[combined]['N']))
+ print("Per prio sample size sum %d == combined sample size %d" %
+ (sum_sample_size, jsondata[combined]['N']))
- if min(jsondata[high]['min'], jsondata[low]['min']) != jsondata[combined]['min']:
- print("Min of high %d, low %d min latencies does not match min %d from combined data" % \
- (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+ min_val = min([x[obj]['min'] for x in jsondata['prios']])
+ if min_val != jsondata[combined]['min']:
+ print("Min per prio min latency %d does not match min %d from combined data" %
+ (min_val, jsondata[combined]['min']))
return False
elif self.debug:
- print("Min of high %d, low %d min latencies matches min %d from combined data" % \
- (jsondata[high]['min'], jsondata[low]['min'], jsondata[combined]['min']))
+ print("Min per prio min latency %d matches min %d from combined data" %
+ (min_val, jsondata[combined]['min']))
- if max(jsondata[high]['max'], jsondata[low]['max']) != jsondata[combined]['max']:
- print("Max of high %d, low %d max latencies does not match max %d from combined data" % \
- (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+ max_val = max([x[obj]['max'] for x in jsondata['prios']])
+ if max_val != jsondata[combined]['max']:
+ print("Max per prio max latency %d does not match max %d from combined data" %
+ (max_val, jsondata[combined]['max']))
return False
elif self.debug:
- print("Max of high %d, low %d max latencies matches max %d from combined data" % \
- (jsondata[high]['max'], jsondata[low]['max'], jsondata[combined]['max']))
+ print("Max per prio max latency %d matches max %d from combined data" %
+ (max_val, jsondata[combined]['max']))
- weighted_avg = (jsondata[high]['mean'] * jsondata[high]['N'] + \
- jsondata[low]['mean'] * jsondata[low]['N']) / jsondata[combined]['N']
+ weighted_vals = [x[obj]['mean'] * x[obj]['N'] for x in jsondata['prios']]
+ weighted_avg = sum(weighted_vals) / jsondata[combined]['N']
delta = abs(weighted_avg - jsondata[combined]['mean'])
if (delta / jsondata[combined]['mean']) > 0.0001:
- print("Difference between weighted average %f of high, low means "
+ print("Difference between merged per prio weighted average %f mean "
"and actual mean %f exceeds 0.01%%" % (weighted_avg, jsondata[combined]['mean']))
return False
elif self.debug:
- print("Weighted average %f of high, low means matches actual mean %f" % \
- (weighted_avg, jsondata[combined]['mean']))
+ print("Merged per prio weighted average %f mean matches actual mean %f" %
+ (weighted_avg, jsondata[combined]['mean']))
if plus:
- if not self.check_jsonplus(jsondata[high]):
- return False
- if not self.check_jsonplus(jsondata[low]):
- return False
+ for prio in jsondata['prios']:
+ if not self.check_jsonplus(prio[obj]):
+ return False
- bins = {**jsondata[high]['bins'], **jsondata[low]['bins']}
- for duration in bins.keys():
- if duration in jsondata[high]['bins'] and duration in jsondata[low]['bins']:
- bins[duration] = jsondata[high]['bins'][duration] + \
- jsondata[low]['bins'][duration]
+ counter = Counter()
+ for prio in jsondata['prios']:
+ counter.update(prio[obj]['bins'])
+
+ bins = dict(counter)
if len(bins) != len(jsondata[combined]['bins']):
- print("Number of combined high/low bins does not match number of overall bins")
+ print("Number of merged bins %d does not match number of overall bins %d" %
+ (len(bins), len(jsondata[combined]['bins'])))
return False
elif self.debug:
- print("Number of bins from merged high/low data matches number of overall bins")
+ print("Number of merged bins %d matches number of overall bins %d" %
+ (len(bins), len(jsondata[combined]['bins'])))
for duration in bins.keys():
if bins[duration] != jsondata[combined]['bins'][duration]:
- print("Merged high/low count does not match overall count for duration %d" \
- % duration)
+ print("Merged per prio count does not match overall count for duration %d" %
+ duration)
return False
- print("Merged high/low priority latency data match combined latency data")
+ print("Merged per priority latency data match combined latency data")
return True
def check(self):
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, slat=False)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['write'], 1, slat=False, clat=False)
print("Unexpected write data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['trim'], 2, slat=False, tlat=False)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, plus=True)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, clat=False, tlat=False, plus=True)
job = self.json_data['jobs'][0]
retval = True
- if 'read' in job or 'write'in job or 'trim' in job:
+ if 'read' in job or 'write' in job or 'trim' in job:
print("Unexpected data direction found in fio output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['mixed'], 0, plus=True, unified=True)
print("Error checking fsync latency data")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['write'], 1, slat=False, plus=True)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, plus=True)
print("Unexpected trim data found in output")
retval = False
if not self.check_nocmdprio_lat(job):
- print("Unexpected high/low priority latencies found")
+ print("Unexpected per priority latencies found")
retval = False
retval &= self.check_latencies(job['read'], 0, slat=False, clat=False, plus=True)
job = self.json_data['jobs'][0]
retval = True
- if 'read' in job or 'write'in job or 'trim' in job:
+ if 'read' in job or 'write' in job or 'trim' in job:
print("Unexpected data direction found in fio output")
retval = False
return retval
+class Test021(FioLatTest):
+ """Test object for Test 21."""
+
+ def check(self):
+ """Check Test 21 output."""
+
+ job = self.json_data['jobs'][0]
+
+ retval = True
+ if not self.check_empty(job['trim']):
+ print("Unexpected trim data found in output")
+ retval = False
+
+ retval &= self.check_latencies(job['read'], 0, slat=False, tlat=False, plus=True)
+ retval &= self.check_latencies(job['write'], 1, slat=False, tlat=False, plus=True)
+ retval &= self.check_prio_latencies(job['read'], clat=True, plus=True)
+ retval &= self.check_prio_latencies(job['write'], clat=True, plus=True)
+
+ return retval
+
+
def parse_args():
"""Parse command-line arguments."""
# randread, null
# enable slat, clat, lat
# only clat and lat will appear because
- # because the null ioengine is syncrhonous
+ # the null ioengine is synchronous
"test_id": 1,
"runtime": 2,
"output-format": "json",
{
# randread, aio
# enable slat, clat, lat
- # all will appear because liaio is asynchronous
+ # all will appear because libaio is asynchronous
"test_id": 4,
"runtime": 5,
"output-format": "json+",
# randread, null
# enable slat, clat, lat
# only clat and lat will appear because
- # because the null ioengine is syncrhonous
- # same as Test 1 except
- # numjobs = 4 to test sum_thread_stats() changes
+ # the null ioengine is synchronous
+ # same as Test 1 except add numjobs = 4 to test
+ # sum_thread_stats() changes
"test_id": 12,
"runtime": 2,
"output-format": "json",
{
# randread, aio
# enable slat, clat, lat
- # all will appear because liaio is asynchronous
- # same as Test 4 except
- # numjobs = 4 to test sum_thread_stats() changes
+ # all will appear because libaio is asynchronous
+ # same as Test 4 except add numjobs = 4 to test
+ # sum_thread_stats() changes
"test_id": 13,
"runtime": 5,
"output-format": "json+",
{
# 50/50 r/w, aio, unified_rw_reporting
# enable slat, clat, lata
- # same as Test 8 except
- # numjobs = 4 to test sum_thread_stats() changes
+ # same as Test 8 except add numjobs = 4 to test
+ # sum_thread_stats() changes
"test_id": 14,
"runtime": 5,
"output-format": "json+",
{
# randread, aio
# enable slat, clat, lat
- # all will appear because liaio is asynchronous
+ # all will appear because libaio is asynchronous
# same as Test 4 except add cmdprio_percentage
"test_id": 15,
"runtime": 5,
{
# 50/50 r/w, aio, unified_rw_reporting
# enable slat, clat, lat
- # same as Test 19 except
- # add numjobs = 4 to test sum_thread_stats() changes
+ # same as Test 19 except add numjobs = 4 to test
+ # sum_thread_stats() changes
"test_id": 20,
"runtime": 5,
"output-format": "json+",
'numjobs': 4,
"test_obj": Test019,
},
+ {
+ # r/w, aio
+ # enable only clat
+ # test bssplit and cmdprio_bssplit
+ "test_id": 21,
+ "runtime": 5,
+ "output-format": "json+",
+ "slat_percentiles": 0,
+ "clat_percentiles": 1,
+ "lat_percentiles": 0,
+ "ioengine": aio,
+ 'rw': 'randrw',
+ 'bssplit': '64k/40:1024k/60',
+ 'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+ "test_obj": Test021,
+ },
+ {
+ # r/w, aio
+ # enable only clat
+ # same as Test 21 except add numjobs = 4 to test
+ # sum_thread_stats() changes
+ "test_id": 22,
+ "runtime": 5,
+ "output-format": "json+",
+ "slat_percentiles": 0,
+ "clat_percentiles": 1,
+ "lat_percentiles": 0,
+ "ioengine": aio,
+ 'rw': 'randrw',
+ 'bssplit': '64k/40:1024k/60',
+ 'cmdprio_bssplit': '64k/25/1/1:64k/75/3/2:1024k/0',
+ 'numjobs': 4,
+ "test_obj": Test021,
+ },
]
passed = 0
(args.run_only and test['test_id'] not in args.run_only):
skipped = skipped + 1
outcome = 'SKIPPED (User request)'
- elif (platform.system() != 'Linux' or os.geteuid() != 0) and 'cmdprio_percentage' in test:
+ elif (platform.system() != 'Linux' or os.geteuid() != 0) and \
+ ('cmdprio_percentage' in test or 'cmdprio_bssplit' in test):
skipped = skipped + 1
- outcome = 'SKIPPED (Linux root required for cmdprio_percentage tests)'
+ outcome = 'SKIPPED (Linux root required for cmdprio tests)'
else:
test_obj = test['test_obj'](artifact_root, test, args.debug)
status = test_obj.run_fio(fio)
switch (argc) {
case 5: if (strncmp(argv[4], "verify", 7) == 0)
verify = 1;
- fallthrough;
+ fio_fallthrough;
case 4: spin = atoi(argv[3]);
- fallthrough;
+ fio_fallthrough;
case 3: seed = atol(argv[2]);
- fallthrough;
+ fio_fallthrough;
case 2: numbers = strtol(argv[1], NULL, 16);
break;
default: usage();
/* Create verification table */
if (verify) {
v_size = numbers * sizeof(uint8_t);
- v = malloc(v_size);
- memset(v, 0, v_size);
+ v = calloc(1, v_size);
printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024);
}
v_start = v;
--- /dev/null
+#!/usr/bin/env python3
+#
+# log_compression.py
+#
+# Test log_compression and log_store_compressed. Uses null ioengine.
+# Previous bugs have caused output in per I/O log files to be missing
+# and/or out of order
+#
+# Expected result: 8000 log entries, offset starting at 0 and increasing by bs
+# Buggy result: Log entries out of order (usually without log_store_compressed)
+# and/or missing log entries (usually with log_store_compressed)
+#
+# USAGE
+# python log_compression.py [-f fio-executable]
+#
+# EXAMPLES
+# python t/log_compression.py
+# python t/log_compression.py -f ./fio
+#
+# REQUIREMENTS
+# Python 3.5+
+#
+# ===TEST MATRIX===
+#
+# With log_compression=10K
+# With log_store_compressed=1 and log_compression=10K
+
+import os
+import sys
+import platform
+import argparse
+import subprocess
+
+
+def parse_args():
+ """Parse command-line arguments."""
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-f', '--fio',
+ help='path to fio executable (e.g., ./fio)')
+ return parser.parse_args()
+
+
+def run_fio(fio,log_store_compressed):
+ fio_args = [
+ '--name=job',
+ '--ioengine=null',
+ '--filesize=1000M',
+ '--bs=128K',
+ '--rw=write',
+ '--iodepth=1',
+ '--write_bw_log=test',
+ '--per_job_logs=0',
+ '--log_offset=1',
+ '--log_compression=10K',
+ ]
+ if log_store_compressed:
+ fio_args.append('--log_store_compressed=1')
+
+ subprocess.check_output([fio] + fio_args)
+
+ if log_store_compressed:
+ fio_inflate_args = [
+ '--inflate-log=test_bw.log.fz'
+ ]
+ with open('test_bw.from_fz.log','wt') as f:
+ subprocess.check_call([fio]+fio_inflate_args,stdout=f)
+
+def check_log_file(log_store_compressed):
+    """Validate the bandwidth log written by run_fio().
+
+    Reads test_bw.from_fz.log when log_store_compressed is true, otherwise
+    test_bw.log. Returns True when the log holds exactly filesize/bs
+    non-blank lines and the fifth comma-separated field of each line starts
+    at 0 and grows by one block size per line; otherwise prints a diagnostic
+    and returns False.
+    """
+    if log_store_compressed:
+        filename = 'test_bw.from_fz.log'
+    else:
+        filename = 'test_bw.log'
+
+    with open(filename, 'rt') as log_file:
+        contents = log_file.read()
+
+    # Blank lines (e.g. the trailing newline) are not log entries.
+    entries = [entry for entry in contents.split('\n') if entry.strip()]
+
+    # These mirror --filesize=1000M and --bs=128K passed to fio.
+    block_size = 128 * 1024
+    expected_ios = (1000 * 1024 * 1024) // block_size
+
+    if len(entries) != expected_ios:
+        print('wrong number of ios ({}) in log; should be {}'.format(len(entries), expected_ios))
+        return False
+
+    for index, entry in enumerate(entries):
+        offset = int(entry.split(',')[4])
+        wanted = index * block_size
+        if offset != wanted:
+            print('wrong offset ({}) for io number {} in log; should be {}'.format(
+                offset, index, wanted))
+            return False
+
+    return True
+
+def main():
+    """Run both log compression variants and exit with the failure count."""
+    args = parse_args()
+
+    fio_path = args.fio
+    if not fio_path:
+        # Prefer the fio binary one directory above this script; otherwise
+        # rely on whatever 'fio' resolves to.
+        fio_path = os.path.join(os.path.dirname(__file__), '../fio')
+        if not os.path.exists(fio_path):
+            fio_path = 'fio'
+    print("fio path is", fio_path)
+
+    # Keyed by the boolean outcome of each test.
+    tally = {True: 0, False: 0}
+    for log_store_compressed in (False, True):
+        run_fio(fio_path, log_store_compressed)
+        passed = check_log_file(log_store_compressed)
+        verdict = 'PASSED' if passed else 'FAILED'
+        print('Test with log_store_compressed={} {}'.format(log_store_compressed,
+                                                            verdict))
+        tally[passed] += 1
+
+    print('{} tests passed, {} failed'.format(tally[True], tally[False]))
+
+    sys.exit(tally[False])
+
+if __name__ == '__main__':
+    main()
+
--- /dev/null
+#!/usr/bin/env python3
+"""
+# nvmept.py
+#
+# Test fio's io_uring_cmd ioengine with NVMe pass-through commands.
+#
+# USAGE
+# see python3 nvmept.py --help
+#
+# EXAMPLES
+# python3 t/nvmept.py --dut /dev/ng0n1
+# python3 t/nvmept.py --dut /dev/ng1n1 -f ./fio
+#
+# REQUIREMENTS
+# Python 3.6
+#
+"""
+import os
+import sys
+import time
+import argparse
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+
+
+class PassThruTest(FioJobCmdTest):
+    """
+    Test case for NVMe pass-through I/O through the io_uring_cmd ioengine.
+
+    Once the job has run, the first job in the JSON output is handed to
+    check_all_ddirs() together with the data directions implied by the rw
+    option, and the iodepth_level entry for depth 8 is checked.
+    """
+
+    def setup(self, parameters):
+        """Assemble the fio command line for this test.
+
+        The parameters argument is not consulted; the arguments are derived
+        from self.fio_opts and self.filenames.
+        """
+
+        opts = self.fio_opts
+        # Forwarded to fio only when the test definition supplies them.
+        optional = ('fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
+                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
+                    'time_based', 'runtime', 'verify', 'io_size')
+
+        fio_args = [
+            "--name=nvmept",
+            "--ioengine=io_uring_cmd",
+            "--cmd_type=nvme",
+            "--iodepth=8",
+            "--iodepth_batch=4",
+            "--iodepth_batch_complete=4",
+            f"--filename={opts['filename']}",
+            f"--rw={opts['rw']}",
+            f"--output={self.filenames['output']}",
+            f"--output-format={opts['output-format']}",
+        ]
+        fio_args.extend(f"--{name}={opts[name]}" for name in optional if name in opts)
+
+        super().setup(fio_args)
+
+
+    def check_result(self):
+        """Run the base class checks, then validate data directions and depth."""
+        super().check_result()
+
+        if 'rw' not in self.fio_opts or not self.passed:
+            return
+
+        job = self.json_data['jobs'][0]
+        rw = self.fio_opts['rw']
+
+        ddirs_by_rw = {
+            'read': ['read'],
+            'randread': ['read'],
+            'trim': ['trim'],
+            'randtrim': ['trim'],
+            'readwrite': ['read', 'write'],
+            'randrw': ['read', 'write'],
+            'trimwrite': ['trim', 'write'],
+            'randtrimwrite': ['trim', 'write'],
+        }
+
+        if rw in ('write', 'randwrite'):
+            # A verifying write job is expected to show read activity too.
+            ddirs = ['read', 'write'] if 'verify' in self.fio_opts else ['write']
+        else:
+            ddirs = ddirs_by_rw.get(rw)
+
+        if ddirs is None:
+            print(f"Unhandled rw value {self.fio_opts['rw']}")
+            self.passed = False
+        else:
+            self.passed = self.check_all_ddirs(ddirs, job)
+
+        # setup() always requests --iodepth=8.
+        if job['iodepth_level']['8'] < 95:
+            print("Did not achieve requested iodepth")
+            self.passed = False
+
+
+# Test definitions consumed by run_fio_tests(). main() adds a "filename"
+# entry (the device under test) to every fio_opts dict before the run.
+#
+# The time-based tests use the key "time_based" because that is the name
+# PassThruTest.setup() looks for; a key spelled "timebased" is never
+# forwarded to fio.
+TEST_LIST = [
+    {
+        "test_id": 1,
+        "fio_opts": {
+            "rw": 'read',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 2,
+        "fio_opts": {
+            "rw": 'randread',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 3,
+        "fio_opts": {
+            "rw": 'write',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 4,
+        "fio_opts": {
+            "rw": 'randwrite',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 5,
+        "fio_opts": {
+            "rw": 'trim',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 6,
+        "fio_opts": {
+            "rw": 'randtrim',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 7,
+        "fio_opts": {
+            "rw": 'write',
+            "io_size": 1024*1024,
+            "verify": "crc32c",
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 8,
+        "fio_opts": {
+            "rw": 'randwrite',
+            "io_size": 1024*1024,
+            "verify": "crc32c",
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 9,
+        "fio_opts": {
+            "rw": 'readwrite',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 10,
+        "fio_opts": {
+            "rw": 'randrw',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 11,
+        "fio_opts": {
+            "rw": 'trimwrite',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 12,
+        "fio_opts": {
+            "rw": 'randtrimwrite',
+            "time_based": 1,
+            "runtime": 3,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 13,
+        "fio_opts": {
+            "rw": 'randread',
+            "time_based": 1,
+            "runtime": 3,
+            "fixedbufs": 1,
+            "nonvectored": 1,
+            "force_async": 1,
+            "registerfiles": 1,
+            "sqthread_poll": 1,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        "test_id": 14,
+        "fio_opts": {
+            "rw": 'randwrite',
+            "time_based": 1,
+            "runtime": 3,
+            "fixedbufs": 1,
+            "nonvectored": 1,
+            "force_async": 1,
+            "registerfiles": 1,
+            "sqthread_poll": 1,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+    {
+        # We can't enable fixedbufs because for trim-only
+        # workloads fio actually does not allocate any buffers
+        "test_id": 15,
+        "fio_opts": {
+            "rw": 'randtrim',
+            "time_based": 1,
+            "runtime": 3,
+            "fixedbufs": 0,
+            "nonvectored": 1,
+            "force_async": 1,
+            "registerfiles": 1,
+            "sqthread_poll": 1,
+            "output-format": "json",
+            },
+        "test_class": PassThruTest,
+    },
+]
+
+def parse_args():
+    """Parse command-line arguments.
+
+    Returns the argparse namespace. --dut is mandatory; --skip and --run-only
+    each accept one or more integer test IDs.
+    """
+
+    parser = argparse.ArgumentParser()
+    # The help text names the fio binary (it previously read "file executable").
+    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+    parser.add_argument('-s', '--skip', nargs='+', type=int,
+                        help='list of test(s) to skip')
+    parser.add_argument('-o', '--run-only', nargs='+', type=int,
+                        help='list of test(s) to run, skipping all others')
+    parser.add_argument('--dut', help='target NVMe character device to test '
+                        '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """Entry point: run every NVMe pass-through test against the chosen device."""
+
+    args = parse_args()
+
+    # Use the caller's artifact directory, or a timestamped one by default.
+    artifact_root = args.artifact_root or \
+        f"nvmept-test-{time.strftime('%Y%m%d-%H%M%S')}"
+    os.mkdir(artifact_root)
+    print(f"Artifact directory is {artifact_root}")
+
+    fio_path = str(Path(args.fio).absolute()) if args.fio else 'fio'
+    print(f"fio path is {fio_path}")
+
+    # Every test targets the device named on the command line.
+    for entry in TEST_LIST:
+        entry['fio_opts']['filename'] = args.dut
+
+    script_path = Path(__file__).absolute()
+    test_env = {
+        'fio_path': fio_path,
+        'fio_root': str(script_path.parent.parent),
+        'artifact_root': artifact_root,
+        'basename': 'nvmept',
+    }
+
+    _passed, failed, _skipped = run_fio_tests(TEST_LIST, test_env, args)
+    sys.exit(failed)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null
+#!/usr/bin/env python3
+#
+# Copyright 2024 Samsung Electronics Co., Ltd All Rights Reserved
+#
+# For conditions of distribution and use, see the accompanying COPYING file.
+#
+"""
+# nvmept_fdp.py
+#
+# Test fio's io_uring_cmd ioengine with NVMe pass-through FDP write commands.
+#
+# USAGE
+# see python3 nvmept_fdp.py --help
+#
+# EXAMPLES
+# python3 t/nvmept_fdp.py --dut /dev/ng0n1
+# python3 t/nvmept_fdp.py --dut /dev/ng1n1 -f ./fio
+#
+# REQUIREMENTS
+# Python 3.6
+# Device formatted with LBA data size 4096 bytes
+# Device with at least five placement IDs
+#
+# WARNING
+# This is a destructive test
+"""
+import os
+import sys
+import json
+import time
+import locale
+import logging
+import argparse
+import subprocess
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+from fiotestcommon import SUCCESS_NONZERO
+
+
+class FDPTest(FioJobCmdTest):
+    """
+    Base class for the FDP pass-through tests.
+
+    Builds an io_uring_cmd/NVMe fio command from self.fio_opts, validates the
+    JSON output per data direction, and after every result check updates all
+    reclaim unit handles and re-checks that their RUAMW values agree.
+    """
+
+    def setup(self, parameters):
+        """Assemble the fio command line; the parameters argument is unused."""
+
+        opts = self.fio_opts
+        # Forwarded to fio only when the test definition supplies them.
+        optional = ('fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
+                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
+                    'time_based', 'runtime', 'verify', 'io_size', 'num_range',
+                    'iodepth', 'iodepth_batch', 'iodepth_batch_complete',
+                    'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat',
+                    'buffer_pattern', 'verify_pattern', 'offset', 'fdp',
+                    'fdp_pli', 'fdp_pli_select', 'dataplacement', 'plid_select',
+                    'plids', 'number_ios')
+
+        fio_args = [
+            "--name=nvmept-fdp",
+            "--ioengine=io_uring_cmd",
+            "--cmd_type=nvme",
+            "--randrepeat=0",
+            f"--filename={opts['filename']}",
+            f"--rw={opts['rw']}",
+            f"--output={self.filenames['output']}",
+            f"--output-format={opts['output-format']}",
+        ]
+        fio_args.extend(f"--{name}={opts[name]}" for name in optional if name in opts)
+
+        super().setup(fio_args)
+
+
+    def check_result(self):
+        """Run _check_result(), then always reset and re-check the device."""
+        try:
+            self._check_result()
+        finally:
+            # Runs even when the check raised, so the next test starts from
+            # freshly updated reclaim unit handles.
+            dut = self.fio_opts['filename']
+            if not update_all_ruhs(dut):
+                logging.error("Could not reset device")
+            if not check_all_ruhs(dut):
+                logging.error("Reclaim units have inconsistent RUAMW values")
+
+
+    def _check_result(self):
+        """Validate fio's JSON output for the data directions implied by rw."""
+
+        super().check_result()
+
+        if 'rw' not in self.fio_opts:
+            return
+        if not self.passed:
+            return
+        if 'json' not in self.fio_opts['output-format']:
+            return
+
+        job = self.json_data['jobs'][0]
+        rw = self.fio_opts['rw']
+
+        ddirs_by_rw = {
+            'read': ['read'],
+            'randread': ['read'],
+            'trim': ['trim'],
+            'randtrim': ['trim'],
+            'readwrite': ['read', 'write'],
+            'randrw': ['read', 'write'],
+            'trimwrite': ['trim', 'write'],
+            'randtrimwrite': ['trim', 'write'],
+        }
+
+        if rw in ('write', 'randwrite'):
+            # A verifying write job is expected to show read activity too.
+            ddirs = ['read', 'write'] if 'verify' in self.fio_opts else ['write']
+        else:
+            ddirs = ddirs_by_rw.get(rw)
+
+        if ddirs is None:
+            logging.error("Unhandled rw value %s", self.fio_opts['rw'])
+            self.passed = False
+        else:
+            self.passed = self.check_all_ddirs(ddirs, job)
+
+        if 'iodepth' not in self.fio_opts:
+            return
+
+        # We will need to figure something out if any test uses an iodepth
+        # different from 8
+        if job['iodepth_level']['8'] < 95:
+            logging.error("Did not achieve requested iodepth")
+            self.passed = False
+        else:
+            logging.debug("iodepth 8 target met %s", job['iodepth_level']['8'])
+
+
+class FDPMultiplePLIDTest(FDPTest):
+    """
+    Write to multiple placement IDs.
+
+    After the job runs, the per-handle RUAMW values reported by the device are
+    compared with what the chosen placement ID selection strategy
+    (roundrobin or random) should have produced.
+    """
+
+    def setup(self, parameters):
+        """Resolve a templated number_ios value, then build the fio command.
+
+        A string number_ios such as "2*{nruhsd}+3" has {nruhsd} and
+        {max_ruamw} substituted from the module-level device parameters and is
+        then evaluated to a number.
+        """
+        mapping = {
+            'nruhsd': FIO_FDP_NUMBER_PLIDS,
+            'max_ruamw': FIO_FDP_MAX_RUAMW,
+        }
+        number_ios = self.fio_opts.get('number_ios')
+        if isinstance(number_ios, str):
+            # NOTE(review): eval() executes arbitrary Python. It is only fed
+            # expressions from this file's TEST_LIST; never route external
+            # input through it.
+            self.fio_opts['number_ios'] = eval(number_ios.format(**mapping))
+
+        super().setup(parameters)
+
+    def _check_result(self):
+        """Check RUAMW accounting for the selected PLIDs, then the fio output."""
+        # str() lets a single PLID given as an integer be handled like the
+        # comma-separated string form.
+        if 'fdp_pli' in self.fio_opts:
+            plid_list = str(self.fio_opts['fdp_pli']).split(',')
+        elif 'plids' in self.fio_opts:
+            plid_list = str(self.fio_opts['plids']).split(',')
+        else:
+            plid_list = list(range(FIO_FDP_NUMBER_PLIDS))
+
+        plid_list = sorted(int(i) for i in plid_list)
+        logging.debug("plid_list: %s", str(plid_list))
+
+        fdp_status = get_fdp_status(self.fio_opts['filename'])
+
+        select = "roundrobin"
+        if 'fdp_pli_select' in self.fio_opts:
+            select = self.fio_opts['fdp_pli_select']
+        elif 'plid_select' in self.fio_opts:
+            select = self.fio_opts['plid_select']
+
+        if not fdp_status:
+            # get_fdp_status() returns False on failure; without a status
+            # there is nothing to compare, so fail instead of indexing into it.
+            logging.error("Could not obtain FDP status; unable to verify RUAMW accounting")
+            self.passed = False
+        elif select == "roundrobin":
+            self._check_robin(plid_list, fdp_status)
+        elif select == "random":
+            self._check_random(plid_list, fdp_status)
+        else:
+            logging.error("Unknown plid selection strategy %s", select)
+            self.passed = False
+
+        super()._check_result()
+
+    def _check_robin(self, plid_list, fdp_status):
+        """
+        With round robin we can know exactly how many writes each PLID will
+        receive.
+        """
+        ruamw = [FIO_FDP_MAX_RUAMW] * FIO_FDP_NUMBER_PLIDS
+
+        # divmod keeps the arithmetic in integers; the earlier float division
+        # could lose precision for very large I/O counts.
+        whole, remainder = divmod(self.fio_opts['number_ios'], len(plid_list))
+        logging.debug("PLIDs in the list should receive %d writes; %d PLIDs will receive one extra",
+                      whole, remainder)
+
+        for plid in plid_list:
+            ruamw[plid] -= whole
+            if remainder:
+                ruamw[plid] -= 1
+                remainder -= 1
+        logging.debug("Expected ruamw values: %s", str(ruamw))
+
+        for idx, ruhs in enumerate(fdp_status['ruhss']):
+            if idx >= len(ruamw):
+                # The device listed more handles than we hold expectations
+                # for; stop rather than index past the end of ruamw.
+                break
+
+            if ruhs['ruamw'] != ruamw[idx]:
+                logging.error("RUAMW mismatch with idx %d, pid %d, expected %d, observed %d", idx,
+                              ruhs['pid'], ruamw[idx], ruhs['ruamw'])
+                self.passed = False
+                break
+
+            logging.debug("RUAMW match with idx %d, pid %d: ruamw=%d", idx, ruhs['pid'], ruamw[idx])
+
+    def _check_random(self, plid_list, fdp_status):
+        """
+        With random selection, a set of PLIDs will receive all the write
+        operations and the remainder will be untouched.
+        """
+
+        total_ruamw = 0
+        for plid in plid_list:
+            total_ruamw += fdp_status['ruhss'][plid]['ruamw']
+
+        expected = len(plid_list) * FIO_FDP_MAX_RUAMW - self.fio_opts['number_ios']
+        if total_ruamw != expected:
+            logging.error("Expected total ruamw %d for plids %s, observed %d", expected,
+                          str(plid_list), total_ruamw)
+            self.passed = False
+        else:
+            logging.debug("Observed expected total ruamw %d for plids %s", expected, str(plid_list))
+
+        # Handles outside the selected set must not have been written to.
+        for idx, ruhs in enumerate(fdp_status['ruhss']):
+            if idx in plid_list:
+                continue
+            if ruhs['ruamw'] != FIO_FDP_MAX_RUAMW:
+                logging.error("Unexpected ruamw %d for idx %d, pid %d, expected %d", ruhs['ruamw'],
+                              idx, ruhs['pid'], FIO_FDP_MAX_RUAMW)
+                self.passed = False
+            else:
+                logging.debug("Observed expected ruamw %d for idx %d, pid %d", ruhs['ruamw'], idx,
+                              ruhs['pid'])
+
+
+class FDPSinglePLIDTest(FDPTest):
+    """
+    Write to a single placement ID only.
+    """
+
+    def _check_result(self):
+        """Check that the chosen PLID's RUAMW dropped by number_ios.
+
+        The PLID comes from the plids option, else fdp_pli, else 0. The test
+        is marked failed when the observed RUAMW is not FIO_FDP_MAX_RUAMW
+        minus number_ios, or when the FDP status cannot be obtained.
+        """
+        if 'plids' in self.fio_opts:
+            plid = self.fio_opts['plids']
+        elif 'fdp_pli' in self.fio_opts:
+            plid = self.fio_opts['fdp_pli']
+        else:
+            plid = 0
+        # The value is used as a list index below, so accept "3" as well as 3.
+        plid = int(plid)
+
+        fdp_status = get_fdp_status(self.fio_opts['filename'])
+        if not fdp_status:
+            # get_fdp_status() returns False on failure; fail the test
+            # instead of raising TypeError by indexing into it.
+            logging.error("Could not obtain FDP status; unable to verify RUAMW accounting")
+            self.passed = False
+        else:
+            ruamw = fdp_status['ruhss'][plid]['ruamw']
+            lba_count = self.fio_opts['number_ios']
+
+            if FIO_FDP_MAX_RUAMW - lba_count != ruamw:
+                logging.error("FDP accounting mismatch for plid %d; expected ruamw %d, observed %d",
+                              plid, FIO_FDP_MAX_RUAMW - lba_count, ruamw)
+                self.passed = False
+            else:
+                logging.debug("FDP accounting as expected for plid %d; ruamw = %d", plid, ruamw)
+
+        super()._check_result()
+
+
+class FDPReadTest(FDPTest):
+ """
+ Read workload test. The test is marked failed when the value returned by
+ check_all_ruhs() after the job differs from FIO_FDP_MAX_RUAMW.
+ """
+
+ def _check_result(self):
+ """Compare the device's common RUAMW value with FIO_FDP_MAX_RUAMW."""
+ # check_all_ruhs() returns False when the handles disagree with each
+ # other; that also compares unequal to any nonzero FIO_FDP_MAX_RUAMW.
+ ruamw = check_all_ruhs(self.fio_opts['filename'])
+
+ if ruamw != FIO_FDP_MAX_RUAMW:
+ logging.error("Read workload affected FDP ruamw")
+ self.passed = False
+ else:
+ logging.debug("Read workload did not disturb FDP ruamw")
+ super()._check_result()
+
+
+def get_fdp_status(dut):
+    """
+    Run the nvme-cli command to obtain FDP status and return result as a JSON
+    object.
+
+    dut is the device path handed to nvme-cli. Returns the decoded JSON on
+    success, or False (after logging an error) when the command exits
+    non-zero or its output is not valid JSON.
+    """
+
+    # Build argv directly so a device path containing spaces stays a single
+    # argument.
+    cmd = ["sudo", "nvme", "fdp", "status", "--output-format=json", dut]
+    cmd_result = subprocess.run(cmd, capture_output=True, check=False,
+                                encoding=locale.getpreferredencoding())
+
+    if cmd_result.returncode != 0:
+        logging.error("Error obtaining device %s FDP status: %s", dut, cmd_result.stderr)
+        return False
+
+    try:
+        return json.loads(cmd_result.stdout)
+    except json.JSONDecodeError as error:
+        # Callers already treat False as "status unavailable".
+        logging.error("Error parsing device %s FDP status: %s", dut, error)
+        return False
+
+
+def update_ruh(dut, plid):
+    """
+    Update reclaim unit handles with specified ID(s). This tells the device to
+    point the RUH to a new (empty) reclaim unit.
+
+    plid is a single ID or a list of IDs (integers or strings). Returns True
+    when the nvme-cli command exits with status 0, otherwise logs an error
+    and returns False.
+    """
+
+    # str() each element: str.join() raises TypeError on integer IDs.
+    ids = ','.join(str(pid) for pid in plid) if isinstance(plid, list) else plid
+    # NOTE(review): unlike get_fdp_status() this command is not run through
+    # sudo -- confirm whether that difference is intentional.
+    cmd = ["nvme", "fdp", "update", f"--pids={ids}", dut]
+    cmd_result = subprocess.run(cmd, capture_output=True, check=False,
+                                encoding=locale.getpreferredencoding())
+
+    if cmd_result.returncode != 0:
+        logging.error("Error updating RUH %s ID(s) %s", dut, ids)
+        return False
+
+    return True
+
+
+def update_all_ruhs(dut):
+    """
+    Update all reclaim unit handles on the device.
+
+    Returns True when every handle listed in the FDP status was updated, and
+    False when the status could not be obtained or any update failed.
+    """
+
+    fdp_status = get_fdp_status(dut)
+    if not fdp_status:
+        # get_fdp_status() has already logged the reason.
+        return False
+
+    for ruhs in fdp_status['ruhss']:
+        if not update_ruh(dut, ruhs['pid']):
+            return False
+
+    return True
+
+
+def check_all_ruhs(dut):
+    """
+    Check that all RUHs have the same value for reclaim unit available media
+    writes (RUAMW). Return the RUAMW value.
+
+    Returns False when the FDP status cannot be obtained, when it lists no
+    handles, or when the handles report differing RUAMW values.
+    """
+
+    fdp_status = get_fdp_status(dut)
+    if not fdp_status:
+        # get_fdp_status() has already logged the reason.
+        return False
+
+    ruh_status = fdp_status['ruhss']
+    if not ruh_status:
+        logging.error("Device %s reported no reclaim unit handles", dut)
+        return False
+
+    # The first handle is the reference every other handle must match.
+    ruamw = ruh_status[0]['ruamw']
+    for ruhs in ruh_status:
+        if ruhs['ruamw'] != ruamw:
+            logging.error("RUAMW mismatch: found %d, expected %d", ruhs['ruamw'], ruamw)
+            return False
+
+    return ruamw
+
+
+# Test definitions handed to run_fio_tests(). main() adds a "filename" entry
+# (the device under test) to every fio_opts dict before the run.
+#
+# A string number_ios value is a template: FDPMultiplePLIDTest.setup()
+# substitutes {nruhsd} and {max_ruamw} and evaluates the resulting expression.
+TEST_LIST = [
+ # Write one LBA to one PLID using both the old and new sets of options
+ ## omit fdp_pli_select/plid_select
+ {
+ "test_id": 1,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 3,
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 2,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 3,
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ ## fdp_pli_select/plid_select=roundrobin
+ {
+ "test_id": 3,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 3,
+ "fdp_pli_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 4,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 3,
+ "plid_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ ## fdp_pli_select/plid_select=random
+ {
+ "test_id": 5,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 3,
+ "fdp_pli_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 6,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 1,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 3,
+ "plid_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ # Write four LBAs to one PLID using both the old and new sets of options
+ ## omit fdp_pli_select/plid_select
+ {
+ "test_id": 7,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 1,
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 8,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 1,
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ ## fdp_pli_select/plid_select=roundrobin
+ {
+ "test_id": 9,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 1,
+ "fdp_pli_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 10,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 1,
+ "plid_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ ## fdp_pli_select/plid_select=random
+ {
+ "test_id": 11,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 1,
+ "fdp_pli_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ {
+ "test_id": 12,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 4,
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": 1,
+ "plid_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ # Just a regular write without FDP directive--should land on plid 0
+ {
+ "test_id": 13,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": 19,
+ "verify": "crc32c",
+ "output-format": "json",
+ },
+ "test_class": FDPSinglePLIDTest,
+ },
+ # Read workload
+ {
+ "test_id": 14,
+ "fio_opts": {
+ "rw": 'randread',
+ "bs": 4096,
+ "number_ios": 19,
+ "output-format": "json",
+ },
+ "test_class": FDPReadTest,
+ },
+ # write to multiple PLIDs using round robin to select PLIDs
+ ## write to all PLIDs using old and new sets of options
+ {
+ "test_id": 100,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "2*{nruhsd}+3",
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ {
+ "test_id": 101,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "2*{nruhsd}+3",
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plid_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ ## write to a subset of PLIDs using old and new sets of options
+ {
+ "test_id": 102,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{nruhsd}+1",
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": "1,3",
+ "fdp_pli_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ {
+ "test_id": 103,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{nruhsd}+1",
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": "1,3",
+ "plid_select": "roundrobin",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ # write to multiple PLIDs using random selection of PLIDs
+ ## write to all PLIDs using old and new sets of options
+ {
+ "test_id": 200,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{max_ruamw}-1",
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ {
+ "test_id": 201,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{max_ruamw}-1",
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plid_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ ## write to a subset of PLIDs using old and new sets of options
+ {
+ "test_id": 202,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{max_ruamw}-1",
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": "1,3,4",
+ "fdp_pli_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ {
+ "test_id": 203,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "number_ios": "{max_ruamw}-1",
+ "verify": "crc32c",
+ "dataplacement": "fdp",
+ "plids": "1,3,4",
+ "plid_select": "random",
+ "output-format": "json",
+ },
+ "test_class": FDPMultiplePLIDTest,
+ },
+ # Specify invalid options fdp=1 and dataplacement=none
+ # NOTE(review): SUCCESS_NONZERO presumably means a non-zero fio exit status
+ # counts as a pass -- confirm against fiotestcommon.
+ {
+ "test_id": 300,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 4096,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 3,
+ "output-format": "normal",
+ "dataplacement": "none",
+ },
+ "test_class": FDPTest,
+ "success": SUCCESS_NONZERO,
+ },
+ # Specify invalid options fdp=1 and dataplacement=streams
+ {
+ "test_id": 301,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 4096,
+ "verify": "crc32c",
+ "fdp": 1,
+ "fdp_pli": 3,
+ "output-format": "normal",
+ "dataplacement": "streams",
+ },
+ "test_class": FDPTest,
+ "success": SUCCESS_NONZERO,
+ },
+]
+
+def parse_args():
+    """Parse command-line arguments.
+
+    Returns the argparse namespace. --dut is mandatory; --skip and --run-only
+    each accept one or more integer test IDs; --debug is a boolean flag.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
+    # The help text names the fio binary (it previously read "file executable").
+    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+    parser.add_argument('-s', '--skip', nargs='+', type=int,
+                        help='list of test(s) to skip')
+    parser.add_argument('-o', '--run-only', nargs='+', type=int,
+                        help='list of test(s) to run, skipping all others')
+    parser.add_argument('--dut', help='target NVMe character device to test '
+                        '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
+    args = parser.parse_args()
+
+    return args
+
+
+# Device-wide FDP parameters read by the test classes above. Both hold a
+# placeholder until main() overwrites them: FIO_FDP_MAX_RUAMW with the value
+# returned by check_all_ruhs() and FIO_FDP_NUMBER_PLIDS with the 'nruhsd'
+# field of the FDP status.
+FIO_FDP_MAX_RUAMW = 0
+FIO_FDP_NUMBER_PLIDS = 0
+
+def main():
+    """Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands.
+
+    Exits with -1 when the device's FDP status cannot be read or when its
+    reclaim unit handles do not report one common non-zero RUAMW value after
+    being updated; otherwise exits with the number of failed tests.
+    """
+    global FIO_FDP_MAX_RUAMW
+    global FIO_FDP_NUMBER_PLIDS
+
+    args = parse_args()
+
+    if args.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
+    artifact_root = args.artifact_root if args.artifact_root else \
+        f"nvmept-fdp-test-{time.strftime('%Y%m%d-%H%M%S')}"
+    os.mkdir(artifact_root)
+    print(f"Artifact directory is {artifact_root}")
+
+    if args.fio:
+        fio_path = str(Path(args.fio).absolute())
+    else:
+        fio_path = 'fio'
+    print(f"fio path is {fio_path}")
+
+    # Every test targets the device named on the command line.
+    for test in TEST_LIST:
+        test['fio_opts']['filename'] = args.dut
+
+    fdp_status = get_fdp_status(args.dut)
+    if not fdp_status:
+        # get_fdp_status() returns False (and logs why) on failure; without
+        # the status the expected values below cannot be established.
+        sys.exit(-1)
+    FIO_FDP_NUMBER_PLIDS = fdp_status['nruhsd']
+    update_all_ruhs(args.dut)
+    FIO_FDP_MAX_RUAMW = check_all_ruhs(args.dut)
+    if not FIO_FDP_MAX_RUAMW:
+        sys.exit(-1)
+
+    test_env = {
+        'fio_path': fio_path,
+        'fio_root': str(Path(__file__).absolute().parent.parent),
+        'artifact_root': artifact_root,
+        'basename': 'nvmept-fdp',
+    }
+
+    _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
+    sys.exit(failed)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null
+#!/usr/bin/env python3
+"""
+# nvmept_pi.py
+#
+# Test fio's io_uring_cmd ioengine support for DIF/DIX end-to-end data
+# protection.
+#
+# USAGE
+# see python3 nvmept_pi.py --help
+#
+# EXAMPLES (THIS IS A DESTRUCTIVE TEST!!)
+# python3 t/nvmept_pi.py --dut /dev/ng0n1 -f ./fio
+# python3 t/nvmept_pi.py --dut /dev/ng0n1 -f ./fio --lbaf 1
+#
+# REQUIREMENTS
+# Python 3.6
+#
+"""
+import os
+import sys
+import json
+import time
+import locale
+import logging
+import argparse
+import itertools
+import subprocess
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+from fiotestcommon import SUCCESS_NONZERO
+
+# Value used for the number_ios fio option by the tests in TEST_LIST below.
+NUMBER_IOS = 8192
+# Bounds used for the bs_low/bs_high fields of the tests in TEST_LIST.
+# NOTE(review): the unit is not visible here (presumably a count of logical
+# blocks rather than bytes) -- confirm where bs_low/bs_high are consumed.
+BS_LOW = 1
+BS_HIGH = 16
+
+class DifDixTest(FioJobCmdTest):
+    """
+    Test case for NVMe DIF/DIX end-to-end data protection.
+    """
+
+    def setup(self, parameters):
+        """Assemble the fio command line; the parameters argument is unused."""
+
+        opts = self.fio_opts
+        engine = opts['ioengine']
+
+        fio_args = [
+            "--name=nvmept_pi",
+            f"--ioengine={engine}",
+            f"--filename={opts['filename']}",
+            f"--rw={opts['rw']}",
+            f"--bsrange={opts['bsrange']}",
+            f"--output={self.filenames['output']}",
+            f"--md_per_io_size={opts['md_per_io_size']}",
+            f"--pi_act={opts['pi_act']}",
+            f"--pi_chk={opts['pi_chk']}",
+            f"--apptag={opts['apptag']}",
+            f"--apptag_mask={opts['apptag_mask']}",
+        ]
+
+        # Forwarded to fio only when the test definition supplies them.
+        optional = ('fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
+                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
+                    'time_based', 'runtime', 'verify', 'io_size', 'offset', 'number_ios',
+                    'output-format')
+        fio_args += [f"--{name}={opts[name]}" for name in optional if name in opts]
+
+        # Extra arguments that depend on the selected ioengine; any other
+        # engine gets none.
+        engine_extras = {
+            'io_uring_cmd': ['--cmd_type=nvme'],
+            'xnvme': ['--thread=1', '--xnvme_async=io_uring_cmd'],
+        }
+        fio_args += engine_extras.get(engine, [])
+
+        super().setup(fio_args)
+
+
+TEST_LIST = [
+#
+# Write data with pi_act=1 and then read the data back (with both
+# pi_act=[0,1]).
+#
+ {
+ # Write workload with variable IO sizes
+ # pi_act=1
+ "test_id": 101,
+ "fio_opts": {
+ "rw": 'write',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ "pi_act": 1,
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with fixed small IO size
+ # pi_act=0
+ "test_id": 102,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_LOW,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with fixed small IO size
+ # pi_act=1
+ "test_id": 103,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_LOW,
+ "test_class": DifDixTest,
+ },
+ {
+ # Write workload with fixed large IO size
+ # Precondition for read workloads to follow
+ # pi_act=1
+ "test_id": 104,
+ "fio_opts": {
+ "rw": 'write',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ "pi_act": 1,
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_HIGH,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ "test_id": 105,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ "test_id": 106,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+#
+# Write data with pi_act=0 and then read the data back (with both
+# pi_act=[0,1]).
+#
+ {
+ # Write workload with variable IO sizes
+ # pi_act=0
+ "test_id": 201,
+ "fio_opts": {
+ "rw": 'write',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ "pi_act": 0,
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with fixed small IO size
+ # pi_act=0
+ "test_id": 202,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_LOW,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with fixed small IO size
+ # pi_act=1
+ "test_id": 203,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_LOW,
+ "test_class": DifDixTest,
+ },
+ {
+ # Write workload with fixed large IO sizes
+ # pi_act=0
+ "test_id": 204,
+ "fio_opts": {
+ "rw": 'write',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ "pi_act": 0,
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_HIGH,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ "test_id": 205,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ "test_id": 206,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x8888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+#
+# Test apptag errors.
+#
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # trigger an apptag error
+ "test_id": 301,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "success": SUCCESS_NONZERO,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # trigger an apptag error
+ "test_id": 302,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "success": SUCCESS_NONZERO,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # trigger an apptag error
+ # same as above but with pi_chk=APPTAG only
+ "test_id": 303,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "success": SUCCESS_NONZERO,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # trigger an apptag error
+ # same as above but with pi_chk=APPTAG only
+ "test_id": 304,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "success": SUCCESS_NONZERO,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the Guard PI and reftag, so there should be no error
+ "test_id": 305,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the Guard PI and reftag, so there should be no error
+ "test_id": 306,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the Guard PI, so there should be no error
+ "test_id": 307,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "GUARD",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the Guard PI, so there should be no error
+ "test_id": 308,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "GUARD",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the reftag, so there should be no error
+ # This case will be skipped when the device is formatted with Type 3 PI
+ # since Type 3 PI ignores the reftag
+ "test_id": 309,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "skip": "type3",
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # this case would trigger an apptag error, but pi_chk says to check
+ # only the reftag, so there should be no error
+ # This case will be skipped when the device is formatted with Type 3 PI
+ # since Type 3 PI ignores the reftag
+ "test_id": 310,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "skip": "type3",
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # use apptag mask to ignore apptag mismatch
+ "test_id": 311,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # use apptag mask to ignore apptag mismatch
+ "test_id": 312,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # use apptag mask to ignore apptag mismatch
+ "test_id": 313,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0xF888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # use apptag mask to ignore apptag mismatch
+ "test_id": 314,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0xF888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "test_class": DifDixTest,
+ },
+ {
+ # Write workload with fixed large IO sizes
+ # Set apptag=0xFFFF to disable all checking for Type 1 and 2
+ # pi_act=1
+ "test_id": 315,
+ "fio_opts": {
+ "rw": 'write',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "apptag": "0xFFFF",
+ "apptag_mask": "0xFFFF",
+ "pi_act": 1,
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_HIGH,
+ "bs_high": BS_HIGH,
+ "skip": "type3",
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # Data was written with apptag=0xFFFF
+ # Reading the data back should disable all checking for Type 1 and 2
+ "test_id": 316,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 0,
+ "apptag": "0x0101",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "skip": "type3",
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=1
+ # Data was written with apptag=0xFFFF
+ # Reading the data back should disable all checking for Type 1 and 2
+ "test_id": 317,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "output-format": "json",
+ "pi_act": 1,
+ "apptag": "0x0000",
+ "apptag_mask": "0xFFFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "skip": "type3",
+ "test_class": DifDixTest,
+ },
+#
+# Error cases related to block size and metadata size
+#
+ {
+ # Use a min block size that is not a multiple of lba/elba size to
+ # trigger an error.
+ "test_id": 401,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW+0.5,
+ "bs_high": BS_HIGH,
+ "success": SUCCESS_NONZERO,
+ "test_class": DifDixTest,
+ },
+ {
+ # Use metadata size that is too small
+ "test_id": 402,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "mdsize_adjustment": -1,
+ "success": SUCCESS_NONZERO,
+ "skip": "elba",
+ "test_class": DifDixTest,
+ },
+ {
+ # Read workload with variable IO sizes
+ # pi_act=0
+ # Should still work even if metadata size is too large
+ "test_id": 403,
+ "fio_opts": {
+ "rw": 'read',
+ "number_ios": NUMBER_IOS,
+ "pi_act": 0,
+ "apptag": "0x8888",
+ "apptag_mask": "0x0FFF",
+ },
+ "pi_chk": "APPTAG,GUARD,REFTAG",
+ "bs_low": BS_LOW,
+ "bs_high": BS_HIGH,
+ "mdsize_adjustment": 1,
+ "test_class": DifDixTest,
+ },
+]
+
+
+def parse_args():
+ """Parse command-line arguments."""
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
+ parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)')
+ parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+ parser.add_argument('-s', '--skip', nargs='+', type=int,
+ help='list of test(s) to skip')
+ parser.add_argument('-o', '--run-only', nargs='+', type=int,
+ help='list of test(s) to run, skipping all others')
+ parser.add_argument('--dut', help='target NVMe character device to test '
+ '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
+ parser.add_argument('-l', '--lbaf', nargs='+', type=int,
+ help='list of lba formats to test')
+ parser.add_argument('-i', '--ioengine', default='io_uring_cmd')
+ args = parser.parse_args()
+
+ return args
+
+
+def get_lbafs(args):
+ """
+ Determine which LBA formats to use. Use either the ones specified on the
+ command line or if none are specified query the device and use all lba
+ formats with metadata.
+ """
+ lbaf_list = []
+ id_ns_cmd = f"sudo nvme id-ns --output-format=json {args.dut}".split(' ')
+ id_ns_output = subprocess.check_output(id_ns_cmd)
+ lbafs = json.loads(id_ns_output)['lbafs']
+ if args.lbaf:
+ for lbaf in args.lbaf:
+ lbaf_list.append({'lbaf': lbaf, 'ds': 2 ** lbafs[lbaf]['ds'],
+ 'ms': lbafs[lbaf]['ms'], })
+ if lbafs[lbaf]['ms'] == 0:
+ print(f'Error: lbaf {lbaf} has metadata size zero')
+ sys.exit(1)
+ else:
+ for lbaf_num, lbaf in enumerate(lbafs):
+ if lbaf['ms'] != 0:
+ lbaf_list.append({'lbaf': lbaf_num, 'ds': 2 ** lbaf['ds'],
+ 'ms': lbaf['ms'], })
+
+ return lbaf_list
+
+
+def get_guard_pi(lbaf_list, args):
+ """
+ Find out how many bits of guard protection information are associated with
+ each lbaf to be used. If this is not available assume 16-bit guard pi.
+ Also record the bytes of protection information associated with the number
+ of guard PI bits.
+ """
+ nvm_id_ns_cmd = f"sudo nvme nvm-id-ns --output-format=json {args.dut}".split(' ')
+ try:
+ nvm_id_ns_output = subprocess.check_output(nvm_id_ns_cmd)
+ except subprocess.CalledProcessError:
+ print(f"Non-zero return code from {' '.join(nvm_id_ns_cmd)}; " \
+ "assuming all lbafs use 16b Guard Protection Information")
+ for lbaf in lbaf_list:
+ lbaf['guard_pi_bits'] = 16
+ else:
+ elbafs = json.loads(nvm_id_ns_output)['elbafs']
+ for elbaf_num, elbaf in enumerate(elbafs):
+ for lbaf in lbaf_list:
+ if lbaf['lbaf'] == elbaf_num:
+ lbaf['guard_pi_bits'] = 16 << elbaf['pif']
+
+ # For 16b Guard Protection Information, the PI requires 8 bytes
+ # For 32b and 64b Guard PI, the PI requires 16 bytes
+ for lbaf in lbaf_list:
+ if lbaf['guard_pi_bits'] == 16:
+ lbaf['pi_bytes'] = 8
+ else:
+ lbaf['pi_bytes'] = 16
+
+
+def get_capabilities(args):
+ """
+ Determine what end-to-end data protection features the device supports.
+ """
+ caps = { 'pil': [], 'pitype': [], 'elba': [] }
+ id_ns_cmd = f"sudo nvme id-ns --output-format=json {args.dut}".split(' ')
+ id_ns_output = subprocess.check_output(id_ns_cmd)
+ id_ns_json = json.loads(id_ns_output)
+
+ mc = id_ns_json['mc']
+ if mc & 1:
+ caps['elba'].append(1)
+ if mc & 2:
+ caps['elba'].append(0)
+
+ dpc = id_ns_json['dpc']
+ if dpc & 1:
+ caps['pitype'].append(1)
+ if dpc & 2:
+ caps['pitype'].append(2)
+ if dpc & 4:
+ caps['pitype'].append(3)
+ if dpc & 8:
+ caps['pil'].append(1)
+ if dpc & 16:
+ caps['pil'].append(0)
+
+ for _, value in caps.items():
+ if len(value) == 0:
+ logging.error("One or more end-to-end data protection features unsupported: %s", caps)
+ sys.exit(-1)
+
+ return caps
+
+
+def format_device(args, lbaf, pitype, pil, elba):
+ """
+ Format device using specified lba format with specified pitype, pil, and
+ elba values.
+ """
+
+ format_cmd = f"sudo nvme format {args.dut} --lbaf={lbaf['lbaf']} " \
+ f"--pi={pitype} --pil={pil} --ms={elba} --force"
+ logging.debug("Format command: %s", format_cmd)
+ format_cmd = format_cmd.split(' ')
+ format_cmd_result = subprocess.run(format_cmd, capture_output=True, check=False,
+ encoding=locale.getpreferredencoding())
+
+ # Sometimes nvme-cli may format the device successfully but fail to
+ # rescan the namespaces after the format. Continue if this happens but
+ # abort if some other error occurs.
+ if format_cmd_result.returncode != 0:
+ if 'failed to rescan namespaces' not in format_cmd_result.stderr \
+ or 'Success formatting namespace' not in format_cmd_result.stdout:
+ logging.error(format_cmd_result.stdout)
+ logging.error(format_cmd_result.stderr)
+ print("Unable to format device; skipping this configuration")
+ return False
+
+ logging.debug(format_cmd_result.stdout)
+ return True
+
+
+def difdix_test(test_env, args, lbaf, pitype, elba):
+ """
+ Adjust test arguments based on values of lbaf, pitype, and elba. Then run
+ the tests.
+
+ Every entry of TEST_LIST is updated in place: 'force_skip' is reset and
+ the fio options 'bsrange', 'md_per_io_size' and 'pi_chk' are recomputed
+ for the current device format.
+
+ test_env and args are handed to run_fio_tests() unchanged.
+ lbaf is a dict; its 'ds', 'ms' and 'pi_bytes' values are read here.
+ pitype is only compared against 3 (Type 3 protection information).
+ elba is tested for truth to choose between the extended LBA and the
+ separate metadata buffer handling.
+
+ Returns whatever run_fio_tests() returns; main() unpacks it as
+ (passed, failed, skipped).
+ """
+ for test in TEST_LIST:
+ # Clear any skip decision left over from a previous device format
+ test['force_skip'] = False
+
+ blocksize = lbaf['ds']
+ # Set fio blocksize parameter at runtime
+ # If we formatted the device in extended LBA mode (e.g., 520-byte
+ # sectors), we usually need to add the lba data size and metadata size
+ # together for fio's bs parameter. However, if pi_act == 1 and the
+ # device is formatted so that the metadata is the same size as the PI,
+ # then the device will take care of everything and the application
+ # should just use regular power of 2 lba data size even when the device
+ # is in extended lba mode.
+ if elba:
+ if not test['fio_opts']['pi_act'] or lbaf['ms'] != lbaf['pi_bytes']:
+ blocksize += lbaf['ms']
+ test['fio_opts']['md_per_io_size'] = 0
+ else:
+ # If we are using a separate buffer for metadata, fio doesn't need to
+ # do anything when pi_act==1 and protection information size is equal to
+ # metadata size since the device is taking care of it all. If either of
+ # the two conditions do not hold, then we do need to allocate a
+ # separate metadata buffer.
+ if test['fio_opts']['pi_act'] and lbaf['ms'] == lbaf['pi_bytes']:
+ test['fio_opts']['md_per_io_size'] = 0
+ else:
+ test['fio_opts']['md_per_io_size'] = lbaf['ms'] * test['bs_high']
+
+ # bs_low and bs_high are multipliers applied to the block size computed
+ # above, not byte counts
+ test['fio_opts']['bsrange'] = f"{blocksize * test['bs_low']}-{blocksize * test['bs_high']}"
+ # Optional per-test offset added to the metadata buffer size (used by
+ # the metadata size test cases)
+ if 'mdsize_adjustment' in test:
+ test['fio_opts']['md_per_io_size'] += test['mdsize_adjustment']
+
+ # Set fio pi_chk parameter at runtime. If the device is formatted
+ # with Type 3 protection information, this means that the reference
+ # tag is not checked and I/O commands may throw an error if they
+ # are submitted with the REFTAG bit set in pi_chk. Make sure fio
+ # does not set pi_chk's REFTAG bit if the device is formatted with
+ # Type 3 PI.
+ if 'pi_chk' in test:
+ if pitype == 3 and 'REFTAG' in test['pi_chk']:
+ test['fio_opts']['pi_chk'] = test['pi_chk'].replace('REFTAG','')
+ logging.debug("Type 3 PI: dropping REFTAG bit")
+ else:
+ test['fio_opts']['pi_chk'] = test['pi_chk']
+
+ # 'skip' names the device configurations ("type3", "elba") under which
+ # this test case is marked to be skipped
+ if 'skip' in test:
+ if pitype == 3 and 'type3' in test['skip']:
+ test['force_skip'] = True
+ logging.debug("Type 3 PI: skipping test case")
+ if elba and 'elba' in test['skip']:
+ test['force_skip'] = True
+ logging.debug("extended lba format: skipping test case")
+
+ logging.debug("Test %d: pi_act=%d, bsrange=%s, md_per_io_size=%d", test['test_id'],
+ test['fio_opts']['pi_act'], test['fio_opts']['bsrange'],
+ test['fio_opts']['md_per_io_size'])
+
+ return run_fio_tests(TEST_LIST, test_env, args)
+
+
+def main():
+ """
+ Run tests using fio's io_uring_cmd ioengine to exercise end-to-end data
+ protection capabilities.
+ """
+
+ args = parse_args()
+
+ if args.debug:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ artifact_root = args.artifact_root if args.artifact_root else \
+ f"nvmept_pi-test-{time.strftime('%Y%m%d-%H%M%S')}"
+ os.mkdir(artifact_root)
+ print(f"Artifact directory is {artifact_root}")
+
+ if args.fio:
+ fio_path = str(Path(args.fio).absolute())
+ else:
+ fio_path = 'fio'
+ print(f"fio path is {fio_path}")
+
+ lbaf_list = get_lbafs(args)
+ get_guard_pi(lbaf_list, args)
+ caps = get_capabilities(args)
+ print("Device capabilities:", caps)
+
+ for test in TEST_LIST:
+ test['fio_opts']['filename'] = args.dut
+ test['fio_opts']['ioengine'] = args.ioengine
+
+ test_env = {
+ 'fio_path': fio_path,
+ 'fio_root': str(Path(__file__).absolute().parent.parent),
+ 'artifact_root': artifact_root,
+ 'basename': 'nvmept_pi',
+ }
+
+ total = { 'passed': 0, 'failed': 0, 'skipped': 0 }
+
+ try:
+ for lbaf, pil, pitype, elba in itertools.product(lbaf_list, caps['pil'], caps['pitype'],
+ caps['elba']):
+ print(f"\nlbaf: {lbaf}, pil: {pil}, pitype: {pitype}, elba: {elba}")
+
+ if not format_device(args, lbaf, pitype, pil, elba):
+ continue
+
+ test_env['artifact_root'] = \
+ os.path.join(artifact_root, f"lbaf{lbaf['lbaf']}pil{pil}pitype{pitype}" \
+ f"elba{elba}")
+ os.mkdir(test_env['artifact_root'])
+
+ passed, failed, skipped = difdix_test(test_env, args, lbaf, pitype, elba)
+
+ total['passed'] += passed
+ total['failed'] += failed
+ total['skipped'] += skipped
+ except KeyboardInterrupt:
+ pass
+
+ print(f"\n\n{total['passed']} test(s) passed, {total['failed']} failed, " \
+ f"{total['skipped']} skipped")
+ sys.exit(total['failed'])
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/usr/bin/env python3
+#
+# Copyright 2024 Samsung Electronics Co., Ltd All Rights Reserved
+#
+# For conditions of distribution and use, see the accompanying COPYING file.
+#
+"""
+# nvmept_streams.py
+#
+# Test fio's NVMe streams support using the io_uring_cmd ioengine with NVMe
+# pass-through commands.
+#
+# USAGE
+# see python3 nvmept_streams.py --help
+#
+# EXAMPLES
+# python3 t/nvmept_streams.py --dut /dev/ng0n1
+# python3 t/nvmept_streams.py --dut /dev/ng1n1 -f ./fio
+#
+# REQUIREMENTS
+# Python 3.7 or later (subprocess.run() is called with capture_output)
+#
+# WARNING
+# This is a destructive test
+#
+# Enable streams with
+# nvme dir-send -D 0 -O 1 -e 1 -T 1 /dev/nvme0n1
+#
+# See streams directive status with
+# nvme dir-receive -D 0 -O 1 -H /dev/nvme0n1
+"""
+import os
+import sys
+import time
+import locale
+import logging
+import argparse
+import subprocess
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+from fiotestcommon import SUCCESS_NONZERO
+
+
+class StreamsTest(FioJobCmdTest):
+ """
+ NVMe pass-through test class for streams. Check to make sure output for
+ selected data direction(s) is non-zero and that zero data appears for other
+ directions.
+ """
+
+ def setup(self, parameters):
+ """Setup a test."""
+
+ fio_args = [
+ "--name=nvmept-streams",
+ "--ioengine=io_uring_cmd",
+ "--cmd_type=nvme",
+ "--randrepeat=0",
+ f"--filename={self.fio_opts['filename']}",
+ f"--rw={self.fio_opts['rw']}",
+ f"--output={self.filenames['output']}",
+ f"--output-format={self.fio_opts['output-format']}",
+ ]
+ for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
+ 'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
+ 'time_based', 'runtime', 'verify', 'io_size', 'num_range',
+ 'iodepth', 'iodepth_batch', 'iodepth_batch_complete',
+ 'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat',
+ 'buffer_pattern', 'verify_pattern', 'offset', 'dataplacement',
+ 'plids', 'plid_select' ]:
+ if opt in self.fio_opts:
+ option = f"--{opt}={self.fio_opts[opt]}"
+ fio_args.append(option)
+
+ super().setup(fio_args)
+
+
+ def check_result(self):
+ try:
+ self._check_result()
+ finally:
+ release_all_streams(self.fio_opts['filename'])
+
+
+ def _check_result(self):
+
+ super().check_result()
+
+ if 'rw' not in self.fio_opts or \
+ not self.passed or \
+ 'json' not in self.fio_opts['output-format']:
+ return
+
+ job = self.json_data['jobs'][0]
+
+ if self.fio_opts['rw'] in ['read', 'randread']:
+ self.passed = self.check_all_ddirs(['read'], job)
+ elif self.fio_opts['rw'] in ['write', 'randwrite']:
+ if 'verify' not in self.fio_opts:
+ self.passed = self.check_all_ddirs(['write'], job)
+ else:
+ self.passed = self.check_all_ddirs(['read', 'write'], job)
+ elif self.fio_opts['rw'] in ['trim', 'randtrim']:
+ self.passed = self.check_all_ddirs(['trim'], job)
+ elif self.fio_opts['rw'] in ['readwrite', 'randrw']:
+ self.passed = self.check_all_ddirs(['read', 'write'], job)
+ elif self.fio_opts['rw'] in ['trimwrite', 'randtrimwrite']:
+ self.passed = self.check_all_ddirs(['trim', 'write'], job)
+ else:
+ logging.error("Unhandled rw value %s", self.fio_opts['rw'])
+ self.passed = False
+
+ if 'iodepth' in self.fio_opts:
+ # We will need to figure something out if any test uses an iodepth
+ # different from 8
+ if job['iodepth_level']['8'] < 95:
+ logging.error("Did not achieve requested iodepth")
+ self.passed = False
+ else:
+ logging.debug("iodepth 8 target met %s", job['iodepth_level']['8'])
+
+ stream_ids = [int(stream) for stream in self.fio_opts['plids'].split(',')]
+ if not self.check_streams(self.fio_opts['filename'], stream_ids):
+ self.passed = False
+ logging.error("Streams not as expected")
+ else:
+ logging.debug("Streams created as expected")
+
+
+ def check_streams(self, dut, stream_ids):
+ """
+ Confirm that the specified stream IDs exist on the specified device.
+ """
+
+ id_list = get_device_stream_ids(dut)
+ if not id_list:
+ return False
+
+ for stream in stream_ids:
+ if stream in id_list:
+ logging.debug("Stream ID %d found active on device", stream)
+ id_list.remove(stream)
+ else:
+ if self.__class__.__name__ != "StreamsTestRand":
+ logging.error("Stream ID %d not found on device", stream)
+ else:
+ logging.debug("Stream ID %d not found on device", stream)
+ return False
+
+ if len(id_list) != 0:
+ logging.error("Extra stream IDs %s found on device", str(id_list))
+ return False
+
+ return True
+
+
+class StreamsTestRR(StreamsTest):
+ """
+ NVMe pass-through test class for streams. Check to make sure output for
+ selected data direction(s) is non-zero and that zero data appears for other
+ directions. Check that Stream IDs are accessed in round robin order.
+ """
+
+ def check_streams(self, dut, stream_ids):
+ """
+ The number of IOs is less than the number of stream IDs provided. Let N
+ be the number of IOs. Make sure that the device only has the first N of
+ the stream IDs provided.
+
+ This will miss some cases where some other selection algorithm happens
+ to select the first N stream IDs. The solution would be to repeat this
+ test multiple times. Multiple trials passing would be evidence that
+ round robin is working correctly.
+ """
+
+ id_list = get_device_stream_ids(dut)
+ if not id_list:
+ return False
+
+ num_streams = int(self.fio_opts['io_size'] / self.fio_opts['bs'])
+ stream_ids = sorted(stream_ids)[0:num_streams]
+
+ return super().check_streams(dut, stream_ids)
+
+
+class StreamsTestRand(StreamsTest):
+ """
+ NVMe pass-through test class for streams. Check to make sure output for
+ selected data direction(s) is non-zero and that zero data appears for other
+ directions. Check that Stream IDs are accessed in random order.
+ """
+
+ def check_streams(self, dut, stream_ids):
+ """
+ The number of IOs is less than the number of stream IDs provided. Let N
+ be the number of IOs. Confirm that the stream IDs on the device are not
+ the first N stream IDs.
+
+ This will produce false positives because it is possible for the first
+ N stream IDs to be randomly selected. We can reduce the probability of
+ false positives by increasing N and increasing the number of streams
+ IDs to choose from, although fio has a max of 16 placement IDs.
+ """
+
+ id_list = get_device_stream_ids(dut)
+ if not id_list:
+ return False
+
+ num_streams = int(self.fio_opts['io_size'] / self.fio_opts['bs'])
+ stream_ids = sorted(stream_ids)[0:num_streams]
+
+ return not super().check_streams(dut, stream_ids)
+
+
+def get_device_stream_ids(dut):
+ cmd = f"sudo nvme dir-receive -D 1 -O 2 -H {dut}"
+ logging.debug("check streams command: %s", cmd)
+ cmd = cmd.split(' ')
+ cmd_result = subprocess.run(cmd, capture_output=True, check=False,
+ encoding=locale.getpreferredencoding())
+
+ logging.debug(cmd_result.stdout)
+
+ if cmd_result.returncode != 0:
+ logging.error("Error obtaining device %s stream IDs: %s", dut, cmd_result.stderr)
+ return False
+
+ id_list = []
+ for line in cmd_result.stdout.split('\n'):
+ if not 'Stream Identifier' in line:
+ continue
+ tokens = line.split(':')
+ id_list.append(int(tokens[1]))
+
+ return id_list
+
+
+def release_stream(dut, stream_id):
+ """
+ Release stream on given device with selected ID.
+ """
+ cmd = f"nvme dir-send -D 1 -O 1 -S {stream_id} {dut}"
+ logging.debug("release stream command: %s", cmd)
+ cmd = cmd.split(' ')
+ cmd_result = subprocess.run(cmd, capture_output=True, check=False,
+ encoding=locale.getpreferredencoding())
+
+ if cmd_result.returncode != 0:
+ logging.error("Error releasing %s stream %d", dut, stream_id)
+ return False
+
+ return True
+
+
+def release_all_streams(dut):
+ """
+ Release all streams on specified device.
+ """
+
+ id_list = get_device_stream_ids(dut)
+ if not id_list:
+ return False
+
+ for stream in id_list:
+ if not release_stream(dut, stream):
+ return False
+
+ return True
+
+
+# Test cases handed to run_fio_tests() by main(). Each entry has a numeric
+# "test_id", the fio options for the job ("fio_opts") and the class used to
+# run and check it ("test_class"). main() adds the device under test to every
+# entry as fio_opts['filename']. Entries with "success": SUCCESS_NONZERO are
+# error cases (presumably fio is expected to exit with a non-zero status).
+TEST_LIST = [
+ # 4k block size
+ # {seq write, rand write} x {single stream, four streams}
+ {
+ "test_id": 1,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "8",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 2,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "3",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 3,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "1,2,3,4",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 4,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 4096,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "5,6,7,8",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ # 256KiB block size
+ # {seq write, rand write} x {single stream, four streams}
+ {
+ "test_id": 10,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 256*1024,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "88",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 11,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 256*1024,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "20",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 12,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 256*1024,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "16,32,64,128",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ {
+ "test_id": 13,
+ "fio_opts": {
+ "rw": 'randwrite',
+ "bs": 256*1024,
+ "io_size": 256*1024*1024,
+ "verify": "crc32c",
+ "plids": "10,20,40,82",
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTest,
+ },
+ # Test placement ID selection patterns
+ # Each of these jobs issues only two IOs (io_size 8192 / bs 4096) against
+ # sixteen stream IDs, so only a subset of the IDs can end up on the device
+ # default is round robin
+ {
+ "test_id": 20,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "plids": '88,99,100,123,124,125,126,127,128,129,130,131,132,133,134,135',
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTestRR,
+ },
+ # Same as above with the IDs listed out of numerical order; the check in
+ # StreamsTestRR sorts the IDs before picking the expected ones
+ {
+ "test_id": 21,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "plids": '12,88,99,100,123,124,125,126,127,128,129,130,131,132,133,11',
+ "dataplacement": "streams",
+ "output-format": "json",
+ },
+ "test_class": StreamsTestRR,
+ },
+ # explicitly select round robin
+ {
+ "test_id": 22,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "plids": '22,88,99,100,123,124,125,126,127,128,129,130,131,132,133,134',
+ "dataplacement": "streams",
+ "output-format": "json",
+ "plid_select": "roundrobin",
+ },
+ "test_class": StreamsTestRR,
+ },
+ # explicitly select random
+ {
+ "test_id": 23,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "plids": '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16',
+ "dataplacement": "streams",
+ "output-format": "json",
+ "plid_select": "random",
+ },
+ "test_class": StreamsTestRand,
+ },
+ # Error case with placement ID > 0xFFFF
+ # The error cases use the "normal" output format, so the JSON-based
+ # checks in StreamsTest are not run for them
+ {
+ "test_id": 30,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "plids": "1,2,3,0x10000",
+ "dataplacement": "streams",
+ "output-format": "normal",
+ "plid_select": "random",
+ },
+ "test_class": StreamsTestRand,
+ "success": SUCCESS_NONZERO,
+ },
+ # Error case with no stream IDs provided
+ {
+ "test_id": 31,
+ "fio_opts": {
+ "rw": 'write',
+ "bs": 4096,
+ "io_size": 8192,
+ "dataplacement": "streams",
+ "output-format": "normal",
+ },
+ "test_class": StreamsTestRand,
+ "success": SUCCESS_NONZERO,
+ },
+
+]
+
+def parse_args():
+ """Parse command-line arguments."""
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
+ parser.add_argument('-f', '--fio', help='path to file executable (e.g., ./fio)')
+ parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+ parser.add_argument('-s', '--skip', nargs='+', type=int,
+ help='list of test(s) to skip')
+ parser.add_argument('-o', '--run-only', nargs='+', type=int,
+ help='list of test(s) to run, skipping all others')
+ parser.add_argument('--dut', help='target NVMe character device to test '
+ '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ """Run tests using fio's io_uring_cmd ioengine to send NVMe pass through commands."""
+
+ args = parse_args()
+
+ if args.debug:
+ logging.basicConfig(level=logging.DEBUG)
+ else:
+ logging.basicConfig(level=logging.INFO)
+
+ artifact_root = args.artifact_root if args.artifact_root else \
+ f"nvmept-streams-test-{time.strftime('%Y%m%d-%H%M%S')}"
+ os.mkdir(artifact_root)
+ print(f"Artifact directory is {artifact_root}")
+
+ if args.fio:
+ fio_path = str(Path(args.fio).absolute())
+ else:
+ fio_path = 'fio'
+ print(f"fio path is {fio_path}")
+
+ for test in TEST_LIST:
+ test['fio_opts']['filename'] = args.dut
+
+ release_all_streams(args.dut)
+ test_env = {
+ 'fio_path': fio_path,
+ 'fio_root': str(Path(__file__).absolute().parent.parent),
+ 'artifact_root': artifact_root,
+ 'basename': 'nvmept-streams',
+ }
+
+ _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
+ sys.exit(failed)
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/usr/bin/env python3
+#
+# Copyright 2024 Samsung Electronics Co., Ltd All Rights Reserved
+#
+# For conditions of distribution and use, see the accompanying COPYING file.
+#
+"""
+# nvmept_trim.py
+#
+# Test fio's io_uring_cmd ioengine with NVMe pass-through dataset management
+# commands that trim multiple ranges.
+#
+# USAGE
+# see python3 nvmept_trim.py --help
+#
+# EXAMPLES
+# python3 t/nvmept_trim.py --dut /dev/ng0n1
+# python3 t/nvmept_trim.py --dut /dev/ng1n1 -f ./fio
+#
+# REQUIREMENTS
+# Python 3.6
+#
+"""
+import os
+import sys
+import time
+import logging
+import argparse
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+from fiotestcommon import SUCCESS_NONZERO
+
+
+class TrimTest(FioJobCmdTest):
+    """
+    NVMe pass-through trim test class. For JSON output, check the data
+    direction(s) implied by the rw option with check_all_ddirs() and check
+    that the requested iodepth was reached.
+    """
+
+    def setup(self, parameters):
+        """
+        Build the fio command line from self.fio_opts and pass it to the
+        parent class. The parameters argument is not used here.
+        """
+
+        fio_args = [
+            "--name=nvmept-trim",
+            "--ioengine=io_uring_cmd",
+            "--cmd_type=nvme",
+            f"--filename={self.fio_opts['filename']}",
+            f"--rw={self.fio_opts['rw']}",
+            f"--output={self.filenames['output']}",
+            f"--output-format={self.fio_opts['output-format']}",
+        ]
+        # Optional settings are forwarded only when the test supplies them.
+        # Every name is listed once so that no option is emitted twice.
+        for opt in ['fixedbufs', 'nonvectored', 'force_async', 'registerfiles',
+                    'sqthread_poll', 'sqthread_poll_cpu', 'hipri', 'nowait',
+                    'time_based', 'runtime', 'verify', 'io_size', 'num_range',
+                    'iodepth', 'iodepth_batch', 'iodepth_batch_complete',
+                    'size', 'rate', 'bs', 'bssplit', 'bsrange', 'randrepeat',
+                    'buffer_pattern', 'verify_pattern', 'offset']:
+            if opt in self.fio_opts:
+                option = f"--{opt}={self.fio_opts[opt]}"
+                fio_args.append(option)
+
+        super().setup(fio_args)
+
+
+    def check_result(self):
+        """
+        Run the parent checks, then for JSON output check the data directions
+        that go with the rw option and the achieved iodepth. Sets self.passed
+        to False when a check fails.
+        """
+
+        super().check_result()
+
+        if 'rw' not in self.fio_opts or \
+                not self.passed or \
+                'json' not in self.fio_opts['output-format']:
+            return
+
+        job = self.json_data['jobs'][0]
+        rw = self.fio_opts['rw']
+
+        # Data directions handed to check_all_ddirs() for each rw value
+        ddirs_by_rw = {
+            'read': ['read'],
+            'randread': ['read'],
+            'write': ['write'],
+            'randwrite': ['write'],
+            'trim': ['trim'],
+            'randtrim': ['trim'],
+            'readwrite': ['read', 'write'],
+            'randrw': ['read', 'write'],
+            'trimwrite': ['trim', 'write'],
+            'randtrimwrite': ['trim', 'write'],
+        }
+
+        if rw not in ddirs_by_rw:
+            logging.error("Unhandled rw value %s", rw)
+            self.passed = False
+        else:
+            ddirs = ddirs_by_rw[rw]
+            # With verify set, a write job is checked for both read and write
+            if rw in ['write', 'randwrite'] and 'verify' in self.fio_opts:
+                ddirs = ['read', 'write']
+            self.passed = self.check_all_ddirs(ddirs, job)
+
+        if 'iodepth' in self.fio_opts:
+            # We will need to figure something out if any test uses an iodepth
+            # different from 8
+            if job['iodepth_level']['8'] < 95:
+                logging.error("Did not achieve requested iodepth")
+                self.passed = False
+            else:
+                logging.debug("iodepth 8 target met %s", job['iodepth_level']['8'])
+
+
+class RangeTrimTest(TrimTest):
+    """
+    Multi-range trim test class. On top of the TrimTest checks, make sure that
+    the reported bandwidth, IOPS and IO count are consistent with the block
+    size and num_range values of the job.
+    """
+
+    def get_bs(self):
+        """
+        Calculate block size and determine whether bs will be an average or exact.
+
+        Returns a (bs, exact_size) tuple. bs is the bs option when present,
+        otherwise the weighted average of bssplit or the midpoint of bsrange,
+        and 4096 when none of these options is set. exact_size is False for
+        the bssplit and bsrange averages. Sets self.passed to False when the
+        bssplit percentages do not add up to 100.
+        """
+
+        if 'bs' in self.fio_opts:
+            exact_size = True
+            bs = self.fio_opts['bs']
+        elif 'bssplit' in self.fio_opts:
+            exact_size = False
+            bs = 0
+            total = 0
+            for split in self.fio_opts['bssplit'].split(':'):
+                [blocksize, share] = split.split('/')
+                total += int(share)
+                bs += int(blocksize) * int(share) / 100
+            if total != 100:
+                logging.error("bssplit '%s' total percentage is not 100", self.fio_opts['bssplit'])
+                self.passed = False
+            else:
+                logging.debug("bssplit: average block size is %d", int(bs))
+            # The only check we do here for bssplit is to calculate an average
+            # blocksize and see if the IOPS and bw are consistent
+        elif 'bsrange' in self.fio_opts:
+            exact_size = False
+            [minbs, maxbs] = self.fio_opts['bsrange'].split('-')
+            minbs = int(minbs)
+            maxbs = int(maxbs)
+            bs = int((minbs + maxbs) / 2)
+            logging.debug("bsrange: average block size is %d", int(bs))
+            # The only check we do here for bsrange is to calculate an average
+            # blocksize and see if the IOPS and bw are consistent
+        else:
+            exact_size = True
+            bs = 4096
+
+        return bs, exact_size
+
+
+    def check_result(self):
+        """
+        Make sure that the number of IO requests is consistent with the
+        blocksize and num_range values. In other words, if the blocksize is
+        4KiB and num_range is 2, we should have 128 IO requests to trim 1MiB.
+        """
+        # TODO Enable debug output to check the actual offsets
+
+        super().check_result()
+
+        if not self.passed or 'json' not in self.fio_opts['output-format']:
+            return
+
+        job = self.json_data['jobs'][0]['trim']
+        bs, exact_size = self.get_bs()
+
+        # make sure bw and IOPS are consistent
+        bw = job['bw_bytes']
+        iops = job['iops']
+        runtime = job['runtime']
+        expected = job['io_bytes']
+
+        if expected == 0:
+            # The relative comparisons below divide by the reported total
+            logging.error("No trim bytes reported, cannot check bw and IOPS consistency")
+            self.passed = False
+            return
+
+        # runtime is divided by 1000 before use, presumably milliseconds
+        calculated = int(bw*runtime/1000)
+        if abs(calculated - expected) / expected > 0.05:
+            logging.error("Total bytes %d from bw does not match reported total bytes %d",
+                          calculated, expected)
+            self.passed = False
+        else:
+            logging.debug("Total bytes %d from bw matches reported total bytes %d", calculated,
+                          expected)
+
+        calculated = int(iops*runtime/1000*bs*self.fio_opts['num_range'])
+        if abs(calculated - expected) / expected > 0.05:
+            logging.error("Total bytes %d from IOPS does not match reported total bytes %d",
+                          calculated, expected)
+            self.passed = False
+        else:
+            logging.debug("Total bytes %d from IOPS matches reported total bytes %d", calculated,
+                          expected)
+
+        if 'size' in self.fio_opts:
+            io_count = self.fio_opts['size'] / self.fio_opts['num_range'] / bs
+            # An exact block size must give the exact IO count; an average
+            # block size is allowed to be off by 5% of the reported count
+            if exact_size:
+                delta = 0.1
+            else:
+                delta = 0.05*job['total_ios']
+
+            if abs(job['total_ios'] - io_count) > delta:
+                logging.error("Expected numbers of IOs %d does not match actual value %d",
+                              io_count, job['total_ios'])
+                self.passed = False
+            else:
+                logging.debug("Expected numbers of IOs %d matches actual value %d", io_count,
+                              job['total_ios'])
+
+        if 'rate' in self.fio_opts:
+            if abs(bw - self.fio_opts['rate']) / self.fio_opts['rate'] > 0.05:
+                logging.error("Actual rate %f does not match expected rate %f", bw,
+                              self.fio_opts['rate'])
+                self.passed = False
+            else:
+                logging.debug("Actual rate %f matches expected rate %f", bw, self.fio_opts['rate'])
+
+
+
+TEST_LIST = [
+ # The group of tests below checks existing use cases to make sure there are
+ # no regressions.
+ {
+ "test_id": 1,
+ "fio_opts": {
+ "rw": 'trim',
+ "time_based": 1,
+ "runtime": 3,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 2,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "time_based": 1,
+ "runtime": 3,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 3,
+ "fio_opts": {
+ "rw": 'trim',
+ "time_based": 1,
+ "runtime": 3,
+ "iodepth": 8,
+ "iodepth_batch": 4,
+ "iodepth_batch_complete": 4,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 4,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "time_based": 1,
+ "runtime": 3,
+ "iodepth": 8,
+ "iodepth_batch": 4,
+ "iodepth_batch_complete": 4,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 5,
+ "fio_opts": {
+ "rw": 'trimwrite',
+ "time_based": 1,
+ "runtime": 3,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 6,
+ "fio_opts": {
+ "rw": 'randtrimwrite',
+ "time_based": 1,
+ "runtime": 3,
+ "output-format": "json",
+ },
+ "test_class": TrimTest,
+ },
+    {
+        "test_id": 7,
+        "fio_opts": {
+            "rw": 'randtrim',
+            "time_based": 1,
+            "runtime": 3,
+            # Only names listed in TrimTest.setup() reach the fio command
+            # line, so a misspelled "fixedbuffs" key would be ignored
+            "fixedbufs": 0,
+            "nonvectored": 1,
+            "force_async": 1,
+            "registerfiles": 1,
+            "sqthread_poll": 1,
+            "output-format": "json",
+            },
+        "test_class": TrimTest,
+    },
+ # The group of tests below tries out the new functionality
+ {
+ "test_id": 100,
+ "fio_opts": {
+ "rw": 'trim',
+ "num_range": 2,
+ "size": 16*1024*1024,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 101,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 2,
+ "size": 16*1024*1024,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 102,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 256,
+ "size": 64*1024*1024,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 103,
+ "fio_opts": {
+ "rw": 'trim',
+ "num_range": 2,
+ "bs": 16*1024,
+ "size": 32*1024*1024,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 104,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 2,
+ "bs": 16*1024,
+ "size": 32*1024*1024,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 105,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 2,
+ "bssplit": "4096/50:16384/50",
+ "size": 80*1024*1024,
+ "output-format": "json",
+ "randrepeat": 0,
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 106,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 4,
+ "bssplit": "4096/25:8192/25:12288/25:16384/25",
+ "size": 80*1024*1024,
+ "output-format": "json",
+ "randrepeat": 0,
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 107,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 4,
+ "bssplit": "4096/20:8192/20:12288/20:16384/20:20480/20",
+ "size": 72*1024*1024,
+ "output-format": "json",
+ "randrepeat": 0,
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 108,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 2,
+ "bsrange": "4096-16384",
+ "size": 80*1024*1024,
+ "output-format": "json",
+ "randrepeat": 0,
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 109,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 4,
+ "bsrange": "4096-20480",
+ "size": 72*1024*1024,
+ "output-format": "json",
+ "randrepeat": 0,
+ },
+ "test_class": RangeTrimTest,
+ },
+ {
+ "test_id": 110,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "time_based": 1,
+ "runtime": 10,
+ "rate": 1024*1024,
+ "num_range": 2,
+ "output-format": "json",
+ },
+ "test_class": RangeTrimTest,
+ },
+ # All of the tests below should fail
+ # TODO check the error messages resulting from the jobs below
+ {
+ "test_id": 200,
+ "fio_opts": {
+ "rw": 'randtrimwrite',
+ "time_based": 1,
+ "runtime": 10,
+ "rate": 1024*1024,
+ "num_range": 2,
+ "output-format": "normal",
+ },
+ "test_class": RangeTrimTest,
+ "success": SUCCESS_NONZERO,
+ },
+ {
+ "test_id": 201,
+ "fio_opts": {
+ "rw": 'trimwrite',
+ "time_based": 1,
+ "runtime": 10,
+ "rate": 1024*1024,
+ "num_range": 2,
+ "output-format": "normal",
+ },
+ "test_class": RangeTrimTest,
+ "success": SUCCESS_NONZERO,
+ },
+ {
+ "test_id": 202,
+ "fio_opts": {
+ "rw": 'trim',
+ "time_based": 1,
+ "runtime": 10,
+ "num_range": 257,
+ "output-format": "normal",
+ },
+ "test_class": RangeTrimTest,
+ "success": SUCCESS_NONZERO,
+ },
+ # The sequence of jobs below constitutes a single test with multiple steps
+ # - write a data pattern
+ # - verify the data pattern
+ # - trim the first half of the LBA space
+ # - verify that the trim'd LBA space no longer returns the original data pattern
+ # - verify that the remaining LBA space has the expected pattern
+ {
+ "test_id": 300,
+ "fio_opts": {
+ "rw": 'write',
+ "output-format": 'json',
+ "buffer_pattern": 0x0f,
+ "size": 256*1024*1024,
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 301,
+ "fio_opts": {
+ "rw": 'read',
+ "output-format": 'json',
+ "verify_pattern": 0x0f,
+ "verify": "pattern",
+ "size": 256*1024*1024,
+ },
+ "test_class": TrimTest,
+ },
+ {
+ "test_id": 302,
+ "fio_opts": {
+ "rw": 'randtrim',
+ "num_range": 8,
+ "output-format": 'json',
+ "size": 128*1024*1024,
+ },
+ "test_class": TrimTest,
+ },
+ # The identify namespace data structure has a DLFEAT field which specifies
+ # what happens when reading data from deallocated blocks. There are three
+ # options:
+ # - read behavior not reported
+ # - deallocated logical block returns all bytes 0x0
+ # - deallocated logical block returns all bytes 0xff
+ # The test below merely checks that the original data pattern is not returned.
+ # Source: Figure 97 from
+ # https://nvmexpress.org/wp-content/uploads/NVM-Express-NVM-Command-Set-Specification-1.0c-2022.10.03-Ratified.pdf
+ {
+ "test_id": 303,
+ "fio_opts": {
+ "rw": 'read',
+ "output-format": 'json',
+ "verify_pattern": 0x0f,
+ "verify": "pattern",
+ "size": 128*1024*1024,
+ },
+ "test_class": TrimTest,
+ "success": SUCCESS_NONZERO,
+ },
+ {
+ "test_id": 304,
+ "fio_opts": {
+ "rw": 'read',
+ "output-format": 'json',
+ "verify_pattern": 0x0f,
+ "verify": "pattern",
+ "offset": 128*1024*1024,
+ "size": 128*1024*1024,
+ },
+ "test_class": TrimTest,
+ },
+]
+
+def parse_args():
+    """
+    Parse command-line arguments.
+
+    Returns the argparse namespace with the debug, fio, artifact_root, skip,
+    run_only and dut attributes. --dut is mandatory.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-d', '--debug', help='Enable debug messages', action='store_true')
+    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+    parser.add_argument('-s', '--skip', nargs='+', type=int,
+                        help='list of test(s) to skip')
+    parser.add_argument('-o', '--run-only', nargs='+', type=int,
+                        help='list of test(s) to run, skipping all others')
+    parser.add_argument('--dut', help='target NVMe character device to test '
+                        '(e.g., /dev/ng0n1). WARNING: THIS IS A DESTRUCTIVE TEST', required=True)
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """
+    Run the trim tests through fio's io_uring_cmd ioengine.
+
+    Sets up logging and the artifact directory, points every test in
+    TEST_LIST at the --dut device and exits with the failure count returned
+    by run_fio_tests().
+    """
+
+    args = parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(level=log_level)
+
+    artifact_root = args.artifact_root
+    if not artifact_root:
+        timestamp = time.strftime('%Y%m%d-%H%M%S')
+        artifact_root = f"nvmept-trim-test-{timestamp}"
+    os.mkdir(artifact_root)
+    print(f"Artifact directory is {artifact_root}")
+
+    fio_path = str(Path(args.fio).absolute()) if args.fio else 'fio'
+    print(f"fio path is {fio_path}")
+
+    for entry in TEST_LIST:
+        entry['fio_opts']['filename'] = args.dut
+
+    # fio_root is two directory levels above this script
+    fio_root = Path(__file__).absolute().parent.parent
+    test_env = {
+        'fio_path': fio_path,
+        'fio_root': str(fio_root),
+        'artifact_root': artifact_root,
+        'basename': 'nvmept-trim',
+    }
+
+    _passed, failed, _skipped = run_fio_tests(TEST_LIST, test_env, args)
+    sys.exit(failed)
+
+
+if __name__ == '__main__':
+ main()
--- /dev/null
+#!/bin/bash
+
+# All the script arguments joined into one string; handed to check_args in the MAIN section
+args=$*
+# Space-separated logical CPUs collected by detect_first_core
+first_cores=""
+# The same CPUs, comma-separated, used for taskset -c
+taskset_cores=""
+# Number of CPUs stored in first_cores
+first_cores_count=0
+# Value passed to t/io_uring -n; raised by compute_nb_threads
+nb_threads=1
+# Block devices to test, set by check_args
+drives=""
+
+# Default options
+# Empty unless -l is given; check_args sets it and show_system may rewrite it
+latency_cmdline=""
+
+fatal() {
+  # Print the arguments as an error message and exit with status 1.
+  # The message goes to stderr so that it stays visible when the caller runs
+  # inside a command substitution, where stdout is captured.
+  echo "$@" >&2
+  exit 1
+}
+
+hint() {
+  # Print a non-fatal warning built from all the arguments.
+  printf 'Warning: %s\n' "$*"
+}
+
+info() {
+  # Print "<first argument>: <remaining arguments>".
+  item=$1
+  shift
+  printf '%s: %s\n' "${item}" "$*"
+}
+
+check_root() {
+  # Stop the script unless it runs with an effective UID of 0.
+  if [[ ${EUID} -ne 0 ]]; then
+    fatal "You should be root to run this tool"
+  fi
+}
+
+check_binary() {
+  # Ensure the binaries are present and executable.
+  # Each argument is either a path to an executable or a command name that
+  # command -v can resolve; the first one that is neither stops the script.
+  local bin
+  for bin in "$@"; do
+    [ -x "${bin}" ] && continue
+    command -v "${bin}" >/dev/null || fatal "${bin} doesn't exist or is not executable"
+  done
+}
+
+detect_first_core() {
+  # Find the logical CPUs of the physical core the benchmark will be pinned to.
+  # With a single drive, the core is the one holding the first CPU listed in
+  # the drive's PCI local_cpulist; otherwise it is the core holding CPU 0.
+  # Fills first_cores (space-separated), first_cores_count and taskset_cores
+  # (comma-separated).
+  local -a drive_list
+  local cpu_to_search core_to_run socket_id core_id cpu_id
+
+  # drives is a space-separated string, so split it before counting the devices
+  read -r -a drive_list <<< "${drives}"
+
+  cpu_to_search="0"
+  if [ "${#drive_list[@]}" -eq 1 ]; then
+    device_name=$(block_dev_name "${drive_list[0]}")
+    device_dir="/sys/block/${device_name}/device/"
+    pci_addr=$(cat "${device_dir}/address" 2>/dev/null)
+    pci_dir="/sys/bus/pci/devices/${pci_addr}/"
+    cpu_to_search=$(cut -d"," -f 1 "${pci_dir}/local_cpulist" 2>/dev/null | cut -d"-" -f 1)
+    if [ -z "${cpu_to_search}" ]; then
+      hint "Cannot detect the local cpus of ${device_name}. Running on the first core."
+      cpu_to_search="0"
+    fi
+  else
+    hint 'Passed multiple devices. Running on the first core.'
+  fi
+  core_to_run=$(lscpu --all -pSOCKET,CORE,CPU | grep ",${cpu_to_search}\$" | cut -d"," -f1-2)
+
+  # Detect which logical cpus belongs to the first physical core
+  # If Hyperthreading is enabled, two cores are returned
+  # The pattern is anchored so that core "0,1" does not also select "0,10"
+  while IFS=',' read -r socket_id core_id cpu_id; do
+    if [ ${first_cores_count} -eq 0 ]; then
+      first_cores="${cpu_id}"
+    else
+      first_cores="${first_cores} ${cpu_id}"
+    fi
+    first_cores_count=$((first_cores_count + 1))
+  done < <(lscpu --all -pSOCKET,CORE,CPU | grep "^${core_to_run},")
+
+  if [ ${first_cores_count} -eq 0 ]; then
+    fatal "Cannot detect first core"
+  fi
+  taskset_cores=$(echo "${first_cores}" | tr ' ' ',')
+}
+
+usage() {
+  # Print the help text, one line per printf argument, then exit with status 0.
+  printf '%s\n' \
+    "usage: [options] block_device [other_block_devices]" \
+    "" \
+    " -h : print help" \
+    " -l : enable latency reporting" \
+    "" \
+    " example:" \
+    " t/one-core-peak.sh /dev/nvme0n1" \
+    " t/one-core-peak.sh -l /dev/nvme0n1 /dev/nvme1n1" \
+    " "
+  exit 0
+}
+
+check_args() {
+  # Parse the options (-h prints the help, -l turns latency reporting on) and
+  # store the remaining arguments, joined by spaces, in drives.
+  local OPTIND option
+  while getopts "hl" option; do
+    case "${option}" in
+      h) usage ;;
+      l) latency_cmdline="1" ;;
+      *) fatal "Unsupported ${option} option" ;;
+    esac
+  done
+  shift $((OPTIND-1))
+  if [ $# -eq 0 ]; then
+    fatal "Missing drive(s) as argument"
+  fi
+  drives="$*"
+}
+
+check_drive_exists() {
+  # Ensure the block device exists
+  # $1 is quoted: with an empty argument an unquoted test collapses to
+  # "[ -b ]", which is true
+  [ -b "$1" ] || fatal "$1 is not a valid block device"
+}
+
+is_nvme() {
+  # Succeed when the arguments, joined together, contain the string "nvme".
+  case "$*" in
+    *"nvme"*) return 0 ;;
+    *) return 1 ;;
+  esac
+}
+
+check_poll_queue() {
+  # Print a warning if the nvme poll queues aren't enabled
+  # Nothing is checked when no drive name contains "nvme" or when the module
+  # parameter cannot be read.
+  local poll_file="/sys/module/nvme/parameters/poll_queues"
+  is_nvme "${drives}" || return
+  [ -r "${poll_file}" ] || return
+  poll_queue=$(cat "${poll_file}")
+  [ "${poll_queue}" -eq 0 ] && hint "For better performance, you should enable nvme poll queues by setting nvme.poll_queues=32 on the kernel command line"
+}
+
+block_dev_name() {
+  # Print the argument without its leading "/dev/" (/dev/nvme0n1 -> nvme0n1).
+  local short_name=${1#"/dev/"}
+  echo ${short_name}
+}
+
+get_sys_block_dir() {
+  # Print the /sys/block/ directory of the given device name and stop the
+  # script when that directory does not exist
+  device_name=$1
+  sys_block_dir="/sys/block/${device_name}"
+  if [ ! -d "${sys_block_dir}" ]; then
+    fatal "Cannot find ${sys_block_dir} directory"
+  fi
+  echo ${sys_block_dir}
+}
+
+check_io_scheduler() {
+  # Make "none" the active IO scheduler of the drive given as $1, unless the
+  # scheduler file already shows [none]
+  device_name=$(block_dev_name $1)
+  sys_block_dir=$(get_sys_block_dir ${device_name})
+  sched_file="${sys_block_dir}/queue/scheduler"
+  if [ ! -f "${sched_file}" ]; then
+    fatal "Cannot find IO scheduler for ${device_name}"
+  fi
+  if ! grep -q '\[none\]' ${sched_file}; then
+    info "${device_name}" "set none as io scheduler"
+    echo "none" > ${sched_file}
+  fi
+
+}
+
+check_sysblock_value() {
+  # Make sure a sysfs attribute of a drive holds the wanted value.
+  #   $1: block device, $2: file relative to its /sys/block directory,
+  #   $3: wanted value
+  # A missing file is skipped. A failed write only produces a warning.
+  device_name=$(block_dev_name "$1")
+  sys_block_dir=$(get_sys_block_dir "${device_name}")
+  target_file="${sys_block_dir}/$2"
+  value=$3
+  [ -f "${target_file}" ] || return
+  content=$(cat "${target_file}" 2>/dev/null)
+  [ "${content}" != "${value}" ] || return 0
+  # if/else instead of "A && B || C", so the warning depends on the write alone
+  if echo "${value}" > "${target_file}" 2>/dev/null; then
+    info "${device_name}" "${target_file} set to ${value}."
+  else
+    hint "${device_name}: Cannot set ${value} on ${target_file}"
+  fi
+}
+
+compute_nb_threads() {
+  # Raise nb_threads to the number of arguments (the drives) or to
+  # first_cores_count, whichever is larger than the current value
+  (( $# > nb_threads )) && nb_threads=$#
+  (( first_cores_count > nb_threads )) && nb_threads=${first_cores_count}
+}
+
+check_scaling_governor() {
+  # Ask cpupower for the performance governor, provided a scaling driver is
+  # reported; otherwise only print a warning
+  driver=$(LC_ALL=C cpupower frequency-info | awk '/driver:/ {print $2}')
+  if [ -n "${driver}" ]; then
+    cpupower frequency-set -g performance >/dev/null 2>&1 || fatal "Cannot set scaling processor governor"
+  else
+    hint "Cannot detect processor scaling driver"
+  fi
+}
+
+check_idle_governor() {
+  # Write "menu" into the cpuidle current_governor file; only warn when that
+  # file does not exist
+  filename="/sys/devices/system/cpu/cpuidle/current_governor"
+  if [ -f "${filename}" ]; then
+    echo "menu" > ${filename} 2>/dev/null || fatal "Cannot set cpu idle governor to menu"
+  else
+    hint "Cannot detect cpu idle governor"
+  fi
+}
+
+show_nvme() {
+  # Print a description of the NVMe drive given as $1: model, firmware,
+  # serial and PCI placement read from sysfs, then a status line built with
+  # the nvme command when it is installed.
+  device="$1"
+  device_name=$(block_dev_name "$1")
+  device_dir="/sys/block/${device_name}/device/"
+  pci_addr=$(cat "${device_dir}/address")
+  pci_dir="/sys/bus/pci/devices/${pci_addr}/"
+  link_speed=$(cat "${pci_dir}/current_link_speed")
+  irq=$(cat "${pci_dir}/irq")
+  numa=$([ -f "${pci_dir}/numa_node" ] && cat "${pci_dir}/numa_node" || echo "off")
+  cpus=$(cat "${pci_dir}/local_cpulist")
+  model=$(cat "${device_dir}/model" | xargs) #xargs for trimming spaces
+  fw=$(cat "${device_dir}/firmware_rev" | xargs) #xargs for trimming spaces
+  serial=$(cat "${device_dir}/serial" | xargs) #xargs for trimming spaces
+  info "${device_name}" "MODEL=${model} FW=${fw} serial=${serial} PCI=${pci_addr}@${link_speed} IRQ=${irq} NUMA=${numa} CPUS=${cpus} "
+  if command -v nvme > /dev/null; then
+    local queues
+    status=""
+    # Feature 0x7 carries both NCQA and NSQA, so it is queried once
+    queues=$(nvme get-feature -H -f 0x7 "${device}" 2>&1)
+    NCQA=$(echo "${queues}" | grep NCQA | cut -d ':' -f 2 | xargs)
+    [ -n "${NCQA}" ] && status="${status}Completion Queues:${NCQA}, "
+    NSQA=$(echo "${queues}" | grep NSQA | cut -d ':' -f 2 | xargs)
+    [ -n "${NSQA}" ] && status="${status}Submission Queues:${NSQA}, "
+    power_state=$(nvme get-feature -H -f 0x2 "${device}" 2>&1 | grep PS | cut -d ":" -f 2 | xargs)
+    [ -n "${power_state}" ] && status="${status}PowerState:${power_state}, "
+    apste=$(nvme get-feature -H -f 0xc "${device}" 2>&1 | grep APSTE | cut -d ":" -f 2 | xargs)
+    [ -n "${apste}" ] && status="${status} Autonomous Power State Transition:${apste}, "
+    temp=$(nvme smart-log "${device}" 2>&1 | grep 'temperature' | cut -d ':' -f 2 | xargs)
+    [ -n "${temp}" ] && status="${status}Temp:${temp}"
+    info "${device_name}" "${status}"
+  fi
+}
+
+show_device() {
+  # Describe the drive given as $1; only NVMe drives have a description
+  device_name=$(block_dev_name $1)
+  is_nvme $1 || return
+  show_nvme $1
+}
+
+show_kernel_config_item() {
+  # Print "CONFIG_<$1>=<value>" taken from /boot/config-<release> or, when
+  # that file is missing, from /proc/config.gz. Prints "CONFIG_<$1>=N" when
+  # the item is not found and nothing when neither file exists.
+  config_item="CONFIG_$1"
+  config_file="/boot/config-$(uname -r)"
+  [ -f "${config_file}" ] || config_file='/proc/config.gz'
+  [ -f "${config_file}" ] || return 0
+  status=$(zgrep ${config_item}= ${config_file})
+  if [ -n "${status}" ]; then
+    echo "${config_item}=$(echo ${status} | cut -d '=' -f 2)"
+  else
+    echo "${config_item}=N"
+  fi
+}
+
+show_system() {
+  # Print the CPU model, memory speed, kernel release, a few kernel config
+  # items, the kernel command line, the SELinux mode and the TSC frequency.
+  # When latency reporting was requested, also turn latency_cmdline into the
+  # t/io_uring arguments built from the TSC frequency.
+  CPU_MODEL=$(grep -m1 "model name" /proc/cpuinfo | awk '{print substr($0, index($0,$4))}')
+  MEMORY_SPEED=$(dmidecode -t 17 -q | grep -m 1 "Configured Memory Speed: [0-9]" | awk '{print substr($0, index($0,$4))}')
+  KERNEL=$(uname -r)
+  info "system" "CPU: ${CPU_MODEL}"
+  info "system" "MEMORY: ${MEMORY_SPEED}"
+  info "system" "KERNEL: ${KERNEL}"
+  for config_item in BLK_CGROUP BLK_WBT_MQ HZ RETPOLINE PAGE_TABLE_ISOLATION; do
+    info "system" "KERNEL: $(show_kernel_config_item ${config_item})"
+  done
+  info "system" "KERNEL: $(cat /proc/cmdline)"
+  # getenforce is not among the binaries checked at startup
+  info "system" "SElinux: $(getenforce 2>/dev/null || echo "unknown")"
+  tsc=$(journalctl -k | grep 'tsc: Refined TSC clocksource calibration:' | awk '{print $11}')
+  if [ -n "${tsc}" ]; then
+    info "system" "TSC: ${tsc} Mhz"
+    tsc=$(echo ${tsc} | tr -d '.')
+    [ -n "${latency_cmdline}" ] && latency_cmdline="-t1 -T${tsc}000"
+  elif [ -n "${latency_cmdline}" ]; then
+    # latency_cmdline is pasted as is into the t/io_uring command line, so
+    # the bare "1" flag must not be left in it
+    hint "Cannot detect the TSC frequency, latency reporting is disabled"
+    latency_cmdline=""
+  fi
+}
+
+### MAIN
+# Parse the options and the drive list, then check the privileges and the tools
+check_args ${args}
+check_root
+check_binary t/io_uring lscpu grep taskset cpupower awk tr xargs dmidecode
+# Fills first_cores, first_cores_count and taskset_cores
+detect_first_core
+
+info "##################################################"
+show_system
+# Validate, tune and describe every drive given on the command line
+for drive in ${drives}; do
+ check_drive_exists ${drive}
+ check_io_scheduler ${drive}
+ check_sysblock_value ${drive} "queue/iostats" 0 # Ensure iostats are disabled
+ check_sysblock_value ${drive} "queue/nomerges" 2 # Ensure merge are disabled
+ check_sysblock_value ${drive} "queue/io_poll" 1 # Ensure io_poll is enabled
+ check_sysblock_value ${drive} "queue/wbt_lat_usec" 0 # Disabling wbt lat
+ show_device ${drive}
+done
+
+check_poll_queue
+compute_nb_threads ${drives}
+check_scaling_governor
+check_idle_governor
+
+info "##################################################"
+echo
+
+# Run t/io_uring on the detected cores against all the drives
+cmdline="taskset -c ${taskset_cores} t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B1 -n${nb_threads} ${latency_cmdline} ${drives}"
+info "io_uring" "Running ${cmdline}"
+# Unquoted: word splitting turns the string into the command and its arguments
+${cmdline}
--- /dev/null
+#!/usr/bin/env python3
+"""
+# random_seed.py
+#
+# Test fio's random seed options.
+#
+# - make sure that randseed overrides randrepeat and allrandrepeat
+# - make sure that seeds differ across invocations when [all]randrepeat=0 and randseed is not set
+# - make sure that seeds are always the same when [all]randrepeat=1 and randseed is not set
+#
+# USAGE
+# see python3 random_seed.py --help
+#
+# EXAMPLES
+# python3 t/random_seed.py
+# python3 t/random_seed.py -f ./fio
+#
+# REQUIREMENTS
+# Python 3.6
+#
+"""
+import os
+import sys
+import time
+import locale
+import logging
+import argparse
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+
+class FioRandTest(FioJobCmdTest):
+    """fio random seed test."""
+
+    def setup(self, parameters):
+        """
+        Build the fio command line and pass it to the parent class.
+
+        The randseed, randrepeat and allrandrepeat options are added only when
+        they are present in self.fio_opts. The parameters argument is not used
+        here.
+        """
+
+        fio_args = [
+            "--debug=random",
+            "--name=random_seed",
+            "--ioengine=null",
+            "--filesize=32k",
+            "--rw=randread",
+            f"--output={self.filenames['output']}",
+        ]
+        for opt in ['randseed', 'randrepeat', 'allrandrepeat']:
+            if opt in self.fio_opts:
+                option = f"--{opt}={self.fio_opts[opt]}"
+                fio_args.append(option)
+
+        super().setup(fio_args)
+
+    def get_rand_seeds(self):
+        """
+        Collect random seeds from --debug=random output.
+
+        Returns the integer found after the last '=' of every output line
+        that contains both 'random' and 'rand_seeds[', in file order.
+        """
+        with open(self.filenames['output'], "r",
+                  encoding=locale.getpreferredencoding()) as out_file:
+            lines = out_file.read().split('\n')
+
+        offsets = 0
+        for line in lines:
+            if 'random' in line and 'FIO_RAND_NR_OFFS=' in line:
+                offsets = int(line.split('=')[-1])
+                break
+
+        if offsets == 0:
+            # Not fatal: the seeds are still collected below
+            logging.warning("FIO_RAND_NR_OFFS not found in %s", self.filenames['output'])
+
+        # assume that seeds are in order
+        seed_list = [int(line.split('=')[-1]) for line in lines
+                     if 'random' in line and 'rand_seeds[' in line]
+
+        return seed_list
+
+
+class TestRR(FioRandTest):
+    """
+    Test object for [all]randrepeat. If run for the first time just collect the
+    seeds. For later runs make sure the seeds match or do not match those
+    previously collected.
+    """
+    # one set of seeds is for randrepeat=0 and the other is for randrepeat=1
+    seeds = { 0: None, 1: None }
+
+    def check_result(self):
+        """
+        Check the seeds of a [all]randrepeat=0 or [all]randrepeat=1 run.
+
+        The first run for a given value only saves its seeds. Later runs with
+        value 1 must reproduce the saved seeds, later runs with value 0 must
+        not, and in both cases the seeds must differ from those saved for the
+        other value.
+        """
+
+        super().check_result()
+        if not self.passed:
+            return
+
+        opt = 'randrepeat' if 'randrepeat' in self.fio_opts else 'allrandrepeat'
+        rr = self.fio_opts[opt]
+        rand_seeds = self.get_rand_seeds()
+
+        # Test against None: an empty seed list must still count as saved, or
+        # every later run would save again and never be compared
+        if TestRR.seeds[rr] is None:
+            TestRR.seeds[rr] = rand_seeds
+            logging.debug("TestRR: saving rand_seeds for [a]rr=%d", rr)
+            return
+
+        if rr:
+            if TestRR.seeds[1] != rand_seeds:
+                self.passed = False
+                print(f"TestRR: unexpected seed mismatch for [a]rr={rr}")
+            else:
+                logging.debug("TestRR: seeds correctly match for [a]rr=%d", rr)
+            if TestRR.seeds[0] == rand_seeds:
+                self.passed = False
+                print("TestRR: seeds unexpectedly match those from system RNG")
+        else:
+            if TestRR.seeds[0] == rand_seeds:
+                self.passed = False
+                print(f"TestRR: unexpected seed match for [a]rr={rr}")
+            else:
+                logging.debug("TestRR: seeds correctly don't match for [a]rr=%d", rr)
+            if TestRR.seeds[1] == rand_seeds:
+                self.passed = False
+                print("TestRR: random seeds unexpectedly match those from [a]rr=1")
+
+
+class TestRS(FioRandTest):
+    """
+    Test object for runs where randseed=something decides the generated
+    seeds. The first run with a given randseed only records its seeds. Later
+    runs with that randseed must reproduce them, and must differ from the
+    seeds recorded for every other randseed.
+    """
+    seeds = {}
+
+    def check_result(self):
+        """Check output for randseed=something."""
+
+        super().check_result()
+        if not self.passed:
+            return
+
+        rand_seeds = self.get_rand_seeds()
+        randseed = self.fio_opts['randseed']
+
+        logging.debug("randseed = %s", randseed)
+
+        if randseed not in TestRS.seeds:
+            TestRS.seeds[randseed] = rand_seeds
+            logging.debug("TestRS: saving rand_seeds")
+        elif TestRS.seeds[randseed] != rand_seeds:
+            self.passed = False
+            print("TestRS: seeds don't match when they should")
+        else:
+            logging.debug("TestRS: seeds correctly match")
+
+        # Seeds recorded under any other randseed must *not* be the same
+        for other_randseed, other_seeds in TestRS.seeds.items():
+            if other_randseed == randseed:
+                continue
+            if other_seeds == rand_seeds:
+                self.passed = False
+                print("TestRS: randseeds differ but generated seeds match.")
+            else:
+                logging.debug("TestRS: randseeds differ and generated seeds also differ.")
+
+
+def parse_args():
+    """
+    Parse command-line arguments.
+
+    Returns the argparse namespace with the fio, artifact_root, debug, skip
+    and run_only attributes.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+    parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+    parser.add_argument('-d', '--debug', help='enable debug output', action='store_true')
+    parser.add_argument('-s', '--skip', nargs='+', type=int,
+                        help='list of test(s) to skip')
+    parser.add_argument('-o', '--run-only', nargs='+', type=int,
+                        help='list of test(s) to run, skipping all others')
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """
+    Run tests of fio random seed options.
+
+    Tests 1-8 use TestRR with a single [all]randrepeat option; tests 9-17 use
+    TestRS with an [all]randrepeat option plus a randseed. Exits with the
+    failure count returned by run_fio_tests().
+    """
+
+    args = parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.INFO
+    logging.basicConfig(level=log_level)
+
+    artifact_root = args.artifact_root
+    if not artifact_root:
+        timestamp = time.strftime('%Y%m%d-%H%M%S')
+        artifact_root = f"random-seed-test-{timestamp}"
+    os.mkdir(artifact_root)
+    print(f"Artifact directory is {artifact_root}")
+
+    fio_path = str(Path(args.fio).absolute()) if args.fio else 'fio'
+    print(f"fio path is {fio_path}")
+
+    # (option, value) for tests 1-8, in test_id order
+    rr_cases = [
+        ("randrepeat", 0),
+        ("randrepeat", 0),
+        ("randrepeat", 1),
+        ("randrepeat", 1),
+        ("allrandrepeat", 0),
+        ("allrandrepeat", 0),
+        ("allrandrepeat", 1),
+        ("allrandrepeat", 1),
+    ]
+    # (option, value, randseed) for tests 9-17, in test_id order
+    rs_cases = [
+        ("randrepeat", 0, "12345"),
+        ("randrepeat", 0, "12345"),
+        ("randrepeat", 1, "12345"),
+        ("allrandrepeat", 0, "12345"),
+        ("allrandrepeat", 1, "12345"),
+        ("randrepeat", 0, "67890"),
+        ("randrepeat", 1, "67890"),
+        ("allrandrepeat", 0, "67890"),
+        ("allrandrepeat", 1, "67890"),
+    ]
+
+    test_list = [
+        {
+            "test_id": test_id,
+            "fio_opts": {opt: value},
+            "test_class": TestRR,
+        }
+        for test_id, (opt, value) in enumerate(rr_cases, start=1)
+    ]
+    test_list += [
+        {
+            "test_id": test_id,
+            "fio_opts": {opt: value, "randseed": seed},
+            "test_class": TestRS,
+        }
+        for test_id, (opt, value, seed) in enumerate(rs_cases, start=len(rr_cases) + 1)
+    ]
+
+    # fio_root is two directory levels above this script
+    fio_root = Path(__file__).absolute().parent.parent
+    test_env = {
+        'fio_path': fio_path,
+        'fio_root': str(fio_root),
+        'artifact_root': artifact_root,
+        'basename': 'random',
+    }
+
+    _passed, failed, _skipped = run_fio_tests(test_list, test_env, args)
+    sys.exit(failed)
+
+
+if __name__ == '__main__':
+ main()
#include "../flist.h"
+#include "compiler/compiler.h"
+
static int bs = 4096;
static int max_us = 10000;
static char *file;
#define PLAT_NR (PLAT_GROUP_NR * PLAT_VAL)
#define PLAT_LIST_MAX 20
+/*
+ * CHECK_ZERO_OR_ABORT(code): stop the program unless code evaluates to 0.
+ * Without NDEBUG this is an assertion. With NDEBUG, where assert() expands to
+ * nothing, the check is done by hand and the value is logged before aborting.
+ */
+#ifndef NDEBUG
+#define CHECK_ZERO_OR_ABORT(code) assert((code) == 0)
+#else
+/* code is copied into a local so that it is evaluated only once */
+#define CHECK_ZERO_OR_ABORT(code) \
+	do { \
+		int check_code_ = (code); \
+		if (fio_unlikely(check_code_ != 0)) { \
+			log_err("failed checking code %i != 0", check_code_); \
+			abort(); \
+		} \
+	} while (0)
+#endif
+
struct stats {
unsigned int plat[PLAT_NR];
unsigned int nr_samples;
return ret;
}
-static struct work_item *find_seq(struct writer_thread *w, unsigned int seq)
+static struct work_item *find_seq(struct writer_thread *w, int seq)
{
struct work_item *work;
struct flist_head *entry;
clock_gettime(CLOCK_MONOTONIC, &s);
ret = write(STDOUT_FILENO, work->buf, work->buf_size);
+ if (ret < 0)
+ return (int)ret;
clock_gettime(CLOCK_MONOTONIC, &e);
assert(ret == work->buf_size);
{
struct writer_thread *wt = data;
struct work_item *work;
- unsigned int seq = 1;
+ int seq = 1;
work = NULL;
- while (!wt->thread.exit || !flist_empty(&wt->list)) {
+ while (!(seq < 0) && (!wt->thread.exit || !flist_empty(&wt->list))) {
pthread_mutex_lock(&wt->thread.lock);
- if (work) {
+ if (work)
flist_add_tail(&work->list, &wt->done_list);
- work = NULL;
- }
work = find_seq(wt, seq);
if (work)
int ret;
ret = pthread_condattr_init(&cattr);
- assert(ret == 0);
+ CHECK_ZERO_OR_ABORT(ret);
#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC);
- assert(ret == 0);
+ CHECK_ZERO_OR_ABORT(ret);
#endif
pthread_cond_init(&thread->cond, &cattr);
pthread_cond_init(&thread->done_cond, &cattr);
bytes = 0;
ret = pthread_condattr_init(&cattr);
- assert(ret == 0);
+ CHECK_ZERO_OR_ABORT(ret);
#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
ret = pthread_condattr_setclock(&cattr, CLOCK_MONOTONIC);
- assert(ret == 0);
+ CHECK_ZERO_OR_ABORT(ret);
#endif
clock_gettime(CLOCK_MONOTONIC, &s);
# SPDX-License-Identifier: GPL-2.0-only
#
# Copyright (c) 2019 Western Digital Corporation or its affiliates.
-#
-#
+
+"""
# readonly.py
#
-# Do some basic tests of the --readonly paramter
+# Do some basic tests of the --readonly parameter
#
# USAGE
# python readonly.py [-f fio-executable]
# REQUIREMENTS
# Python 3.5+
#
-#
+"""
+import os
import sys
+import time
import argparse
-import subprocess
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
+from fiotestcommon import SUCCESS_DEFAULT, SUCCESS_NONZERO
+
+
+class FioReadOnlyTest(FioJobCmdTest):
+    """Short null-ioengine fio job used to exercise the --readonly option."""
+
+    def setup(self, parameters):
+        """Assemble the fio command line and hand it to the parent setup.
+
+        The data direction comes from self.fio_opts['rw']. "--readonly" is
+        placed ahead of every other option when 'readonly-pre' is found in
+        parameters and after every other option when 'readonly-post' is
+        found in parameters.
+        """
+
+        job_opts = [
+            "--name=readonly",
+            "--ioengine=null",
+            "--time_based",
+            "--runtime=1s",
+            "--size=1M",
+            f"--rw={self.fio_opts['rw']}",
+        ]
+        leading = ["--readonly"] if 'readonly-pre' in parameters else []
+        trailing = ["--readonly"] if 'readonly-post' in parameters else []
+
+        super().setup(leading + job_opts + trailing)
+
+
+# Table of readonly test cases. Each entry gives the test number, the data
+# direction passed to fio through fio_opts["rw"], an optional "readonly-pre"
+# or "readonly-post" marker, the success criterion and the test class.
+# Entries that combine a marker with randwrite or randtrim use
+# SUCCESS_NONZERO; every other entry uses SUCCESS_DEFAULT.
+# NOTE(review): SUCCESS_NONZERO presumably means fio must exit with a
+# non-zero status -- confirm against fiotestcommon.
+TEST_LIST = [
+ # Tests 1-3: "readonly-pre" marker set.
+ {
+ "test_id": 1,
+ "fio_opts": { "rw": "randread", },
+ "readonly-pre": 1,
+ "success": SUCCESS_DEFAULT,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 2,
+ "fio_opts": { "rw": "randwrite", },
+ "readonly-pre": 1,
+ "success": SUCCESS_NONZERO,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 3,
+ "fio_opts": { "rw": "randtrim", },
+ "readonly-pre": 1,
+ "success": SUCCESS_NONZERO,
+ "test_class": FioReadOnlyTest,
+ },
+ # Tests 4-6: "readonly-post" marker set.
+ {
+ "test_id": 4,
+ "fio_opts": { "rw": "randread", },
+ "readonly-post": 1,
+ "success": SUCCESS_DEFAULT,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 5,
+ "fio_opts": { "rw": "randwrite", },
+ "readonly-post": 1,
+ "success": SUCCESS_NONZERO,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 6,
+ "fio_opts": { "rw": "randtrim", },
+ "readonly-post": 1,
+ "success": SUCCESS_NONZERO,
+ "test_class": FioReadOnlyTest,
+ },
+ # Tests 7-9: no readonly marker.
+ {
+ "test_id": 7,
+ "fio_opts": { "rw": "randread", },
+ "success": SUCCESS_DEFAULT,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 8,
+ "fio_opts": { "rw": "randwrite", },
+ "success": SUCCESS_DEFAULT,
+ "test_class": FioReadOnlyTest,
+ },
+ {
+ "test_id": 9,
+ "fio_opts": { "rw": "randtrim", },
+ "success": SUCCESS_DEFAULT,
+ "test_class": FioReadOnlyTest,
+ },
+ ]
def parse_args():
+ """Parse command-line arguments."""
+
parser = argparse.ArgumentParser()
- parser.add_argument('-f', '--fio',
- help='path to fio executable (e.g., ./fio)')
+ parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+ parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+ parser.add_argument('-s', '--skip', nargs='+', type=int,
+ help='list of test(s) to skip')
+ parser.add_argument('-o', '--run-only', nargs='+', type=int,
+ help='list of test(s) to run, skipping all others')
args = parser.parse_args()
return args
-def run_fio(fio, test, index):
- fio_args = [
- "--max-jobs=16",
- "--name=readonly",
- "--ioengine=null",
- "--time_based",
- "--runtime=1s",
- "--size=1M",
- "--rw={rw}".format(**test),
- ]
- if 'readonly-pre' in test:
- fio_args.insert(0, "--readonly")
- if 'readonly-post' in test:
- fio_args.append("--readonly")
-
- output = subprocess.run([fio] + fio_args, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
-
- return output
-
-
-def check_output(output, test):
- expect_error = False
- if 'readonly-pre' in test or 'readonly-post' in test:
- if 'write' in test['rw'] or 'trim' in test['rw']:
- expect_error = True
-
-# print(output.stdout)
-# print(output.stderr)
-
- if output.returncode == 0:
- if expect_error:
- return False
- else:
- return True
- else:
- if expect_error:
- return True
- else:
- return False
-
+def main():
+ """Run readonly tests."""
-if __name__ == '__main__':
args = parse_args()
- tests = [
- {
- "rw": "randread",
- "readonly-pre": 1,
- },
- {
- "rw": "randwrite",
- "readonly-pre": 1,
- },
- {
- "rw": "randtrim",
- "readonly-pre": 1,
- },
- {
- "rw": "randread",
- "readonly-post": 1,
- },
- {
- "rw": "randwrite",
- "readonly-post": 1,
- },
- {
- "rw": "randtrim",
- "readonly-post": 1,
- },
- {
- "rw": "randread",
- },
- {
- "rw": "randwrite",
- },
- {
- "rw": "randtrim",
- },
- ]
-
- index = 1
- passed = 0
- failed = 0
-
if args.fio:
- fio_path = args.fio
+ fio_path = str(Path(args.fio).absolute())
else:
fio_path = 'fio'
+ print(f"fio path is {fio_path}")
- for test in tests:
- output = run_fio(fio_path, test, index)
- status = check_output(output, test)
- print("Test {0} {1}".format(index, ("PASSED" if status else "FAILED")))
- if status:
- passed = passed + 1
- else:
- failed = failed + 1
- index = index + 1
+ artifact_root = args.artifact_root if args.artifact_root else \
+ f"readonly-test-{time.strftime('%Y%m%d-%H%M%S')}"
+ os.mkdir(artifact_root)
+ print(f"Artifact directory is {artifact_root}")
- print("{0} tests passed, {1} failed".format(passed, failed))
+ test_env = {
+ 'fio_path': fio_path,
+ 'fio_root': str(Path(__file__).absolute().parent.parent),
+ 'artifact_root': artifact_root,
+ 'basename': 'readonly',
+ }
+ _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
sys.exit(failed)
+
+
+if __name__ == '__main__':
+ main()
import os
import sys
-import json
import time
import shutil
import logging
import argparse
-import platform
-import subprocess
-import multiprocessing
from pathlib import Path
+from statsmodels.sandbox.stats.runs import runstest_1samp
+from fiotestlib import FioExeTest, FioJobFileTest, run_fio_tests
+from fiotestcommon import *
-class FioTest(object):
- """Base for all fio tests."""
-
- def __init__(self, exe_path, parameters, success):
- self.exe_path = exe_path
- self.parameters = parameters
- self.success = success
- self.output = {}
- self.artifact_root = None
- self.testnum = None
- self.test_dir = None
- self.passed = True
- self.failure_reason = ''
- self.command_file = None
- self.stdout_file = None
- self.stderr_file = None
- self.exitcode_file = None
-
- def setup(self, artifact_root, testnum):
- """Setup instance variables for test."""
-
- self.artifact_root = artifact_root
- self.testnum = testnum
- self.test_dir = os.path.join(artifact_root, "{:04d}".format(testnum))
- if not os.path.exists(self.test_dir):
- os.mkdir(self.test_dir)
-
- self.command_file = os.path.join(
- self.test_dir,
- "{0}.command".format(os.path.basename(self.exe_path)))
- self.stdout_file = os.path.join(
- self.test_dir,
- "{0}.stdout".format(os.path.basename(self.exe_path)))
- self.stderr_file = os.path.join(
- self.test_dir,
- "{0}.stderr".format(os.path.basename(self.exe_path)))
- self.exitcode_file = os.path.join(
- self.test_dir,
- "{0}.exitcode".format(os.path.basename(self.exe_path)))
-
- def run(self):
- """Run the test."""
-
- raise NotImplementedError()
-
- def check_result(self):
- """Check test results."""
-
- raise NotImplementedError()
-
-
-class FioExeTest(FioTest):
- """Test consists of an executable binary or script"""
-
- def __init__(self, exe_path, parameters, success):
- """Construct a FioExeTest which is a FioTest consisting of an
- executable binary or script.
-
- exe_path: location of executable binary or script
- parameters: list of parameters for executable
- success: Definition of test success
- """
-
- FioTest.__init__(self, exe_path, parameters, success)
-
- def run(self):
- """Execute the binary or script described by this instance."""
-
- command = [self.exe_path] + self.parameters
- command_file = open(self.command_file, "w+")
- command_file.write("%s\n" % command)
- command_file.close()
-
- stdout_file = open(self.stdout_file, "w+")
- stderr_file = open(self.stderr_file, "w+")
- exitcode_file = open(self.exitcode_file, "w+")
- try:
- proc = None
- # Avoid using subprocess.run() here because when a timeout occurs,
- # fio will be stopped with SIGKILL. This does not give fio a
- # chance to clean up and means that child processes may continue
- # running and submitting IO.
- proc = subprocess.Popen(command,
- stdout=stdout_file,
- stderr=stderr_file,
- cwd=self.test_dir,
- universal_newlines=True)
- proc.communicate(timeout=self.success['timeout'])
- exitcode_file.write('{0}\n'.format(proc.returncode))
- logging.debug("Test %d: return code: %d", self.testnum, proc.returncode)
- self.output['proc'] = proc
- except subprocess.TimeoutExpired:
- proc.terminate()
- proc.communicate()
- assert proc.poll()
- self.output['failure'] = 'timeout'
- except Exception:
- if proc:
- if not proc.poll():
- proc.terminate()
- proc.communicate()
- self.output['failure'] = 'exception'
- self.output['exc_info'] = sys.exc_info()
- finally:
- stdout_file.close()
- stderr_file.close()
- exitcode_file.close()
-
- def check_result(self):
- """Check results of test run."""
-
- if 'proc' not in self.output:
- if self.output['failure'] == 'timeout':
- self.failure_reason = "{0} timeout,".format(self.failure_reason)
- else:
- assert self.output['failure'] == 'exception'
- self.failure_reason = '{0} exception: {1}, {2}'.format(
- self.failure_reason, self.output['exc_info'][0],
- self.output['exc_info'][1])
-
- self.passed = False
- return
-
- if 'zero_return' in self.success:
- if self.success['zero_return']:
- if self.output['proc'].returncode != 0:
- self.passed = False
- self.failure_reason = "{0} non-zero return code,".format(self.failure_reason)
- else:
- if self.output['proc'].returncode == 0:
- self.failure_reason = "{0} zero return code,".format(self.failure_reason)
- self.passed = False
-
- stderr_size = os.path.getsize(self.stderr_file)
- if 'stderr_empty' in self.success:
- if self.success['stderr_empty']:
- if stderr_size != 0:
- self.failure_reason = "{0} stderr not empty,".format(self.failure_reason)
- self.passed = False
- else:
- if stderr_size == 0:
- self.failure_reason = "{0} stderr empty,".format(self.failure_reason)
- self.passed = False
-
-
-class FioJobTest(FioExeTest):
- """Test consists of a fio job"""
-
- def __init__(self, fio_path, fio_job, success, fio_pre_job=None,
- fio_pre_success=None, output_format="normal"):
- """Construct a FioJobTest which is a FioExeTest consisting of a
- single fio job file with an optional setup step.
-
- fio_path: location of fio executable
- fio_job: location of fio job file
- success: Definition of test success
- fio_pre_job: fio job for preconditioning
- fio_pre_success: Definition of test success for fio precon job
- output_format: normal (default), json, jsonplus, or terse
- """
-
- self.fio_job = fio_job
- self.fio_pre_job = fio_pre_job
- self.fio_pre_success = fio_pre_success if fio_pre_success else success
- self.output_format = output_format
- self.precon_failed = False
- self.json_data = None
- self.fio_output = "{0}.output".format(os.path.basename(self.fio_job))
- self.fio_args = [
- "--max-jobs=16",
- "--output-format={0}".format(self.output_format),
- "--output={0}".format(self.fio_output),
- self.fio_job,
- ]
- FioExeTest.__init__(self, fio_path, self.fio_args, success)
-
- def setup(self, artifact_root, testnum):
- """Setup instance variables for fio job test."""
-
- super(FioJobTest, self).setup(artifact_root, testnum)
-
- self.command_file = os.path.join(
- self.test_dir,
- "{0}.command".format(os.path.basename(self.fio_job)))
- self.stdout_file = os.path.join(
- self.test_dir,
- "{0}.stdout".format(os.path.basename(self.fio_job)))
- self.stderr_file = os.path.join(
- self.test_dir,
- "{0}.stderr".format(os.path.basename(self.fio_job)))
- self.exitcode_file = os.path.join(
- self.test_dir,
- "{0}.exitcode".format(os.path.basename(self.fio_job)))
-
- def run_pre_job(self):
- """Run fio job precondition step."""
-
- precon = FioJobTest(self.exe_path, self.fio_pre_job,
- self.fio_pre_success,
- output_format=self.output_format)
- precon.setup(self.artifact_root, self.testnum)
- precon.run()
- precon.check_result()
- self.precon_failed = not precon.passed
- self.failure_reason = precon.failure_reason
-
- def run(self):
- """Run fio job test."""
-
- if self.fio_pre_job:
- self.run_pre_job()
-
- if not self.precon_failed:
- super(FioJobTest, self).run()
- else:
- logging.debug("Test %d: precondition step failed", self.testnum)
-
- @classmethod
- def get_file(cls, filename):
- """Safely read a file."""
- file_data = ''
- success = True
-
- try:
- with open(filename, "r") as output_file:
- file_data = output_file.read()
- except OSError:
- success = False
-
- return file_data, success
-
- def check_result(self):
- """Check fio job results."""
-
- if self.precon_failed:
- self.passed = False
- self.failure_reason = "{0} precondition step failed,".format(self.failure_reason)
- return
-
- super(FioJobTest, self).check_result()
-
- if not self.passed:
- return
-
- if 'json' not in self.output_format:
- return
-
- file_data, success = self.get_file(os.path.join(self.test_dir, self.fio_output))
- if not success:
- self.failure_reason = "{0} unable to open output file,".format(self.failure_reason)
- self.passed = False
- return
-
- #
- # Sometimes fio informational messages are included at the top of the
- # JSON output, especially under Windows. Try to decode output as JSON
- # data, lopping off up to the first four lines
- #
- lines = file_data.splitlines()
- for i in range(5):
- file_data = '\n'.join(lines[i:])
- try:
- self.json_data = json.loads(file_data)
- except json.JSONDecodeError:
- continue
- else:
- logging.debug("Test %d: skipped %d lines decoding JSON data", self.testnum, i)
- return
-
- self.failure_reason = "{0} unable to decode JSON data,".format(self.failure_reason)
- self.passed = False
-
-
-class FioJobTest_t0005(FioJobTest):
+class FioJobFileTest_t0005(FioJobFileTest):
"""Test consists of fio test job t0005
Confirm that read['io_kbytes'] == write['io_kbytes'] == 102400"""
def check_result(self):
- super(FioJobTest_t0005, self).check_result()
+ super().check_result()
if not self.passed:
return
if self.json_data['jobs'][0]['read']['io_kbytes'] != 102400:
- self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} bytes read mismatch,"
self.passed = False
if self.json_data['jobs'][0]['write']['io_kbytes'] != 102400:
- self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} bytes written mismatch,"
self.passed = False
-class FioJobTest_t0006(FioJobTest):
+class FioJobFileTest_t0006(FioJobFileTest):
"""Test consists of fio test job t0006
Confirm that read['io_kbytes'] ~ 2*write['io_kbytes']"""
def check_result(self):
- super(FioJobTest_t0006, self).check_result()
+ super().check_result()
if not self.passed:
return
/ self.json_data['jobs'][0]['write']['io_kbytes']
logging.debug("Test %d: ratio: %f", self.testnum, ratio)
if ratio < 1.99 or ratio > 2.01:
- self.failure_reason = "{0} read/write ratio mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} read/write ratio mismatch,"
self.passed = False
-class FioJobTest_t0007(FioJobTest):
+class FioJobFileTest_t0007(FioJobFileTest):
"""Test consists of fio test job t0007
Confirm that read['io_kbytes'] = 87040"""
def check_result(self):
- super(FioJobTest_t0007, self).check_result()
+ super().check_result()
if not self.passed:
return
if self.json_data['jobs'][0]['read']['io_kbytes'] != 87040:
- self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} bytes read mismatch,"
self.passed = False
-class FioJobTest_t0008(FioJobTest):
+class FioJobFileTest_t0008(FioJobFileTest):
"""Test consists of fio test job t0008
Confirm that read['io_kbytes'] = 32768 and that
- write['io_kbytes'] ~ 16568
+ write['io_kbytes'] ~ 16384
- I did runs with fio-ae2fafc8 and saw write['io_kbytes'] values of
- 16585, 16588. With two runs of fio-3.16 I obtained 16568"""
+ This is a 50/50 seq read/write workload. Since fio flips a coin to
+ determine whether to issue a read or a write, total bytes written will not
+ be exactly 16384K. But total bytes read will be exactly 32768K because
+ reads will include the initial phase as well as the verify phase where all
+ the blocks originally written will be read."""
def check_result(self):
- super(FioJobTest_t0008, self).check_result()
+ super().check_result()
if not self.passed:
return
- ratio = self.json_data['jobs'][0]['write']['io_kbytes'] / 16568
+ ratio = self.json_data['jobs'][0]['write']['io_kbytes'] / 16384
logging.debug("Test %d: ratio: %f", self.testnum, ratio)
- if ratio < 0.99 or ratio > 1.01:
- self.failure_reason = "{0} bytes written mismatch,".format(self.failure_reason)
+ if ratio < 0.97 or ratio > 1.03:
+ self.failure_reason = f"{self.failure_reason} bytes written mismatch,"
self.passed = False
if self.json_data['jobs'][0]['read']['io_kbytes'] != 32768:
- self.failure_reason = "{0} bytes read mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} bytes read mismatch,"
self.passed = False
-class FioJobTest_t0009(FioJobTest):
+class FioJobFileTest_t0009(FioJobFileTest):
"""Test consists of fio test job t0009
Confirm that runtime >= 60s"""
def check_result(self):
- super(FioJobTest_t0009, self).check_result()
+ super().check_result()
if not self.passed:
return
logging.debug('Test %d: elapsed: %d', self.testnum, self.json_data['jobs'][0]['elapsed'])
if self.json_data['jobs'][0]['elapsed'] < 60:
- self.failure_reason = "{0} elapsed time mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} elapsed time mismatch,"
self.passed = False
-class FioJobTest_t0012(FioJobTest):
+class FioJobFileTest_t0012(FioJobFileTest):
"""Test consists of fio test job t0012
Confirm ratios of job iops are 1:5:10
job1,job2,job3 respectively"""
def check_result(self):
- super(FioJobTest_t0012, self).check_result()
+ super().check_result()
if not self.passed:
return
iops_files = []
- for i in range(1,4):
- file_data, success = self.get_file(os.path.join(self.test_dir, "{0}_iops.{1}.log".format(os.path.basename(self.fio_job), i)))
-
- if not success:
- self.failure_reason = "{0} unable to open output file,".format(self.failure_reason)
- self.passed = False
+ for i in range(1, 4):
+ filename = os.path.join(self.paths['test_dir'], "{0}_iops.{1}.log".format(os.path.basename(
+ self.fio_job), i))
+ file_data = self.get_file_fail(filename)
+ if not file_data:
return
iops_files.append(file_data.splitlines())
ratio1 = iops3/iops2
ratio2 = iops3/iops1
- logging.debug(
- "sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} job3/job2={4:.3f} job3/job1={5:.3f}".format(
- i, iops1, iops2, iops3, ratio1, ratio2
- )
- )
+ logging.debug("sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} " \
+ "job3/job2={4:.3f} job3/job1={5:.3f}".format(i, iops1, iops2, iops3, ratio1,
+ ratio2))
# test job1 and job2 succeeded to recalibrate
if ratio1 < 1 or ratio1 > 3 or ratio2 < 7 or ratio2 > 13:
- self.failure_reason = "{0} iops ratio mismatch iops1={1} iops2={2} iops3={3} expected r1~2 r2~10 got r1={4:.3f} r2={5:.3f},".format(
- self.failure_reason, iops1, iops2, iops3, ratio1, ratio2
- )
+ self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} iops3={2} " \
+ "expected r1~2 r2~10 got r1={3:.3f} r2={4:.3f},".format(iops1, iops2, iops3,
+ ratio1, ratio2)
self.passed = False
return
-class FioJobTest_t0014(FioJobTest):
+class FioJobFileTest_t0014(FioJobFileTest):
"""Test consists of fio test job t0014
Confirm that job1_iops / job2_iops ~ 1:2 for entire duration
and that job1_iops / job3_iops ~ 1:3 for first half of duration.
re-calibrate the activity dynamically"""
def check_result(self):
- super(FioJobTest_t0014, self).check_result()
+ super().check_result()
if not self.passed:
return
iops_files = []
- for i in range(1,4):
- file_data, success = self.get_file(os.path.join(self.test_dir, "{0}_iops.{1}.log".format(os.path.basename(self.fio_job), i)))
-
- if not success:
- self.failure_reason = "{0} unable to open output file,".format(self.failure_reason)
- self.passed = False
+ for i in range(1, 4):
+ filename = os.path.join(self.paths['test_dir'], "{0}_iops.{1}.log".format(os.path.basename(
+ self.fio_job), i))
+ file_data = self.get_file_fail(filename)
+ if not file_data:
return
iops_files.append(file_data.splitlines())
if ratio1 < 0.43 or ratio1 > 0.57 or ratio2 < 0.21 or ratio2 > 0.45:
- self.failure_reason = "{0} iops ratio mismatch iops1={1} iops2={2} iops3={3}\
- expected r1~0.5 r2~0.33 got r1={4:.3f} r2={5:.3f},".format(
- self.failure_reason, iops1, iops2, iops3, ratio1, ratio2
- )
+ self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} iops3={2} " \
+ "expected r1~0.5 r2~0.33 got r1={3:.3f} r2={4:.3f},".format(
+ iops1, iops2, iops3, ratio1, ratio2)
self.passed = False
iops1 = iops1 + float(iops_files[0][i].split(',')[1])
ratio1 = iops1/iops2
ratio2 = iops1/iops3
- logging.debug(
- "sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} job1/job2={4:.3f} job1/job3={5:.3f}".format(
- i, iops1, iops2, iops3, ratio1, ratio2
- )
- )
+ logging.debug("sample {0}: job1 iops={1} job2 iops={2} job3 iops={3} " \
+ "job1/job2={4:.3f} job1/job3={5:.3f}".format(i, iops1, iops2, iops3,
+ ratio1, ratio2))
# test job1 and job2 succeeded to recalibrate
if ratio1 < 0.43 or ratio1 > 0.57:
- self.failure_reason = "{0} iops ratio mismatch iops1={1} iops2={2} expected ratio~0.5 got ratio={3:.3f},".format(
- self.failure_reason, iops1, iops2, ratio1
- )
+ self.failure_reason += " iops ratio mismatch iops1={0} iops2={1} expected ratio~0.5 " \
+ "got ratio={2:.3f},".format(iops1, iops2, ratio1)
self.passed = False
return
-class FioJobTest_iops_rate(FioJobTest):
- """Test consists of fio test job t0009
+class FioJobFileTest_t0015(FioJobFileTest):
+    """Latency consistency check for fio test jobs t0015 and t0016
+    The mean slat plus the mean clat of the first job's reads must not
+    differ from the mean total latency by more than 1 (the values are
+    taken from the *_ns fields of the JSON output)"""
+
+    def check_result(self):
+        super().check_result()
+        if not self.passed:
+            return
+
+        read_stats = self.json_data['jobs'][0]['read']
+        slat = read_stats['slat_ns']['mean']
+        clat = read_stats['clat_ns']['mean']
+        tlat = read_stats['lat_ns']['mean']
+        logging.debug('Test %d: slat %f, clat %f, tlat %f', self.testnum, slat, clat, tlat)
+
+        # Nothing to report when the two sides agree closely enough.
+        if abs(slat + clat - tlat) <= 1:
+            return
+
+        self.failure_reason = (f"{self.failure_reason} slat {slat} + clat {clat} = "
+                               f"{slat+clat} != tlat {tlat},")
+        self.passed = False
+
+
+class FioJobFileTest_t0019(FioJobFileTest):
+    """Test consists of fio test job t0019
+    Confirm that all offsets were touched sequentially
+
+    The fifth comma-separated field of every non-blank line of test_bw.log
+    is read as an offset. Offsets must start at 0, advance by exactly 4096
+    from one line to the next and end at 255 * 4096."""
+
+    def check_result(self):
+        super().check_result()
+
+        bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log")
+        file_data = self.get_file_fail(bw_log_filename)
+        if not file_data:
+            return
+
+        block_size = 4096
+        last_expected = 255 * block_size
+
+        # Starting one block before zero makes the first line obey the same
+        # "previous + block_size" rule as every later line.
+        prev = -block_size
+        for line in file_data.split('\n'):
+            if not line.strip():
+                continue
+            cur = int(line.split(',')[4])
+            if cur - prev != block_size:
+                self.passed = False
+                # Append so that reasons recorded earlier are not lost.
+                self.failure_reason += f" offsets {prev}, {cur} not sequential"
+                return
+            prev = cur
+
+        # prev holds the last offset seen; it is still -block_size when the
+        # log had no data lines, which is reported as a failure here instead
+        # of raising NameError.
+        if prev != last_expected:
+            self.passed = False
+            self.failure_reason += f" unexpected last offset {prev}"
+
+
+class FioJobFileTest_t0020(FioJobFileTest):
+    """Test consists of fio test jobs t0020 and t0021
+    Confirm that almost all offsets were touched non-sequentially
+
+    The fifth comma-separated field of every non-blank line of test_bw.log
+    is read as an offset and divided by 4096. The check expects exactly 256
+    entries, every block index 0..255 to be present, and a runs test
+    p-value of at least 0.05."""
+
+    def check_result(self):
+        super().check_result()
+
+        bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log")
+        file_data = self.get_file_fail(bw_log_filename)
+        if not file_data:
+            return
+
+        # Parse every non-blank line directly, so the result does not depend
+        # on whether the log ends with a newline or contains stray blank
+        # lines.
+        offsets = [int(line.split(',')[4]) / 4096
+                   for line in file_data.split('\n') if line.strip()]
+
+        if len(offsets) != 256:
+            self.passed = False
+            self.failure_reason += f" number of offsets is {len(offsets)} instead of 256"
+
+        # Without any data there is nothing further to analyse.
+        if not offsets:
+            return
+
+        touched = set(offsets)
+        for i in range(256):
+            if i not in touched:
+                self.passed = False
+                self.failure_reason += f" missing offset {i * 4096}"
+
+        (_, p) = runstest_1samp(offsets)
+        if p < 0.05:
+            self.passed = False
+            self.failure_reason += f" runs test failed with p = {p}"
+
+
+class FioJobFileTest_t0022(FioJobFileTest):
+    """Test consists of fio test job t0022
+
+    The fifth comma-separated field of every non-blank line of test_bw.log
+    is read as an offset. The test fails when more than 10 lines follow
+    their predecessor by exactly one 4096-byte block, or when the number of
+    distinct blocks touched equals filesize/bs (no offset was repeated)."""
+
+    def check_result(self):
+        super().check_result()
+
+        bw_log_filename = os.path.join(self.paths['test_dir'], "test_bw.log")
+        file_data = self.get_file_fail(bw_log_filename)
+        if not file_data:
+            return
+
+        filesize = 1024*1024
+        bs = 4096
+        seq_count = 0
+        offsets = set()
+
+        # Walk every non-blank line, so the result does not depend on a
+        # trailing newline or on stray blank lines in the log.
+        prev = None
+        for line in file_data.split('\n'):
+            if not line.strip():
+                continue
+            cur = int(line.split(',')[4])
+            if prev is not None and cur - prev == bs:
+                seq_count += 1
+            offsets.add(cur/bs)
+            prev = cur
+
+        # 10 is an arbitrary threshold
+        if seq_count > 10:
+            self.passed = False
+            # Append so that reasons recorded earlier are not lost.
+            self.failure_reason += f" too many ({seq_count}) consecutive offsets"
+
+        if len(offsets) == filesize/bs:
+            self.passed = False
+            self.failure_reason += " no duplicate offsets found with norandommap=1"
+
+
+class FioJobFileTest_t0023(FioJobFileTest):
+    """Test consists of fio test job t0023 randtrimwrite test."""
+
+    def check_trimwrite(self, filename):
+        """Verify that log entries alternate between ddir 2 and ddir 1, and
+        that each ddir 1 entry has the same block size and offset as the
+        ddir 2 entry before it. The failure messages call ddir 2 a trim and
+        ddir 1 a write. Checking stops at the first offending line."""
+
+        bw_log_filename = os.path.join(self.paths['test_dir'], filename)
+        file_data = self.get_file_fail(bw_log_filename)
+        if not file_data:
+            return
+
+        # Pretend the entry before the first line was a write, so that the
+        # first line is required to be a trim.
+        prev_ddir, prev_bs, prev_offset = 1, None, None
+        for line in file_data.split('\n'):
+            if not line.strip():
+                continue
+
+            fields = line.split(',')
+            ddir, bs, offset = int(fields[2]), int(fields[3]), int(fields[4])
+
+            problem = None
+            if prev_ddir == 1:
+                if ddir != 2:
+                    problem = "write not preceeded by trim"
+            elif ddir != 1:
+                problem = "trim not preceeded by write"
+            elif prev_bs != bs:
+                problem = "block size does not match"
+            elif prev_offset != offset:
+                problem = "offset does not match"
+
+            if problem:
+                self.passed = False
+                self.failure_reason += " {0}: {1}: {2}".format(bw_log_filename, problem, line)
+                break
+
+            prev_ddir, prev_bs, prev_offset = ddir, bs, offset
+
+
+    def check_all_offsets(self, filename, sectorsize, filesize):
+        """Verify that the log entries cover filesize/sectorsize distinct
+        sectors and that every offset and block size is a multiple of
+        sectorsize."""
+
+        file_data = self.get_file_fail(os.path.join(self.paths['test_dir'], filename))
+        if not file_data:
+            return
+
+        touched = set()
+        for line in file_data.split('\n'):
+            if not line.strip():
+                continue
+
+            fields = line.split(',')
+            bs = int(fields[3])
+            offset = int(fields[4])
+
+            if offset % sectorsize != 0:
+                self.passed = False
+                self.failure_reason += \
+                    f" {filename}: offset {offset} not a multiple of sector size {sectorsize}"
+                break
+            if bs % sectorsize != 0:
+                self.passed = False
+                self.failure_reason += \
+                    f" {filename}: block size {bs} not a multiple of sector size {sectorsize}"
+                break
+
+            # Record every sector covered by this entry.
+            first_sector = offset/sectorsize
+            touched.update(first_sector + i for i in range(int(bs/sectorsize)))
+
+        expected = filesize/sectorsize
+        if len(touched) == expected:
+            logging.debug("%s: %d sectors touched", filename, len(touched))
+        else:
+            self.passed = False
+            self.failure_reason += \
+                f" {filename}: only {len(touched)} offsets touched; expected {expected}"
+
+
+    def check_result(self):
+        super().check_result()
+
+        filesize = 1024*1024
+
+        trimwrite_logs = (
+            "basic_bw.log",
+            "bs_bw.log",
+            "bsrange_bw.log",
+            "bssplit_bw.log",
+            "basic_no_rm_bw.log",
+            "bs_no_rm_bw.log",
+            "bsrange_no_rm_bw.log",
+            "bssplit_no_rm_bw.log",
+        )
+        for log_name in trimwrite_logs:
+            self.check_trimwrite(log_name)
+
+        # Log file paired with the sector size its offsets must align to.
+        coverage_logs = (
+            ("basic_bw.log", 4096),
+            ("bs_bw.log", 8192),
+            ("bsrange_bw.log", 512),
+            ("bssplit_bw.log", 512),
+        )
+        for log_name, sectorsize in coverage_logs:
+            self.check_all_offsets(log_name, sectorsize, filesize)
+
+
+class FioJobFileTest_t0024(FioJobFileTest_t0023):
+    """Test consists of fio test job t0024 trimwrite test."""
+
+    def check_result(self):
+        # Start the lookup after FioJobFileTest_t0023 in the MRO so that the
+        # checks t0023 runs on its own set of logs are not repeated here.
+        super(FioJobFileTest_t0023, self).check_result()
+
+        filesize = 1024*1024
+
+        # Log file paired with the sector size its offsets must align to.
+        logs = (
+            ("basic_bw.log", 4096),
+            ("bs_bw.log", 8192),
+            ("bsrange_bw.log", 512),
+            ("bssplit_bw.log", 512),
+        )
+
+        for log_name, _ in logs:
+            self.check_trimwrite(log_name)
+
+        for log_name, sectorsize in logs:
+            self.check_all_offsets(log_name, sectorsize, filesize)
+
+
+class FioJobFileTest_t0025(FioJobFileTest):
+    """Test experimental verify read backs written data pattern.
+    Confirm that read['io_kbytes'] of the first job == 128"""
+
+    def check_result(self):
+        super().check_result()
+
+        if not self.passed:
+            return
+
+        io_kbytes = self.json_data['jobs'][0]['read']['io_kbytes']
+        if io_kbytes != 128:
+            # Record why the test failed; a bare failure is hard to debug.
+            self.failure_reason = f"{self.failure_reason} bytes read mismatch ({io_kbytes} != 128),"
+            self.passed = False
+
+class FioJobFileTest_t0027(FioJobFileTest):
+    """Test consists of fio test job t0027
+    Confirm that the contents of t0027file in the test directory are
+    identical to the random pattern written to t0027.pattern during setup.
+    NOTE(review): t0027file is presumably produced by the fio job from the
+    pattern file -- confirm against the job file."""
+
+    def setup(self, *args, **kws):
+        """Run the parent setup, then write 16 KiB of random bytes to the
+        pattern file in the test directory."""
+
+        super().setup(*args, **kws)
+        self.pattern_file = os.path.join(self.paths['test_dir'], "t0027.pattern")
+        self.output_file = os.path.join(self.paths['test_dir'], "t0027file")
+        self.pattern = os.urandom(16 << 10)
+        with open(self.pattern_file, "wb") as f:
+            f.write(self.pattern)
+
+    def check_result(self):
+        super().check_result()
+
+        if not self.passed:
+            return
+
+        # A missing or unreadable output file is a test failure, not a crash
+        # of the test harness.
+        try:
+            with open(self.output_file, "rb") as f:
+                data = f.read()
+        except OSError as err:
+            self.failure_reason = f"{self.failure_reason} unable to read {self.output_file}: {err},"
+            self.passed = False
+            return
+
+        if data != self.pattern:
+            self.failure_reason = f"{self.failure_reason} output file does not match pattern,"
+            self.passed = False
+
+class FioJobFileTest_t0029(FioJobFileTest):
+    """Test loops option works with read-verify workload.
+    Confirm that read['io_kbytes'] of the second job == 8"""
+
+    def check_result(self):
+        super().check_result()
+
+        if not self.passed:
+            return
+
+        io_kbytes = self.json_data['jobs'][1]['read']['io_kbytes']
+        if io_kbytes != 8:
+            # Record why the test failed; a bare failure is hard to debug.
+            self.failure_reason = f"{self.failure_reason} bytes read mismatch ({io_kbytes} != 8),"
+            self.passed = False
+
+class FioJobFileTest_iops_rate(FioJobFileTest):
+ """Test consists of fio test job t0011
Confirm that job0 iops == 1000
and that job1_iops / job0_iops ~ 8
With two runs of fio-3.16 I observed a ratio of 8.3"""
def check_result(self):
- super(FioJobTest_iops_rate, self).check_result()
+ super().check_result()
if not self.passed:
return
iops1 = self.json_data['jobs'][0]['read']['iops']
+ logging.debug("Test %d: iops1: %f", self.testnum, iops1)
iops2 = self.json_data['jobs'][1]['read']['iops']
+ logging.debug("Test %d: iops2: %f", self.testnum, iops2)
ratio = iops2 / iops1
- logging.debug("Test %d: iops1: %f", self.testnum, iops1)
logging.debug("Test %d: ratio: %f", self.testnum, ratio)
if iops1 < 950 or iops1 > 1050:
- self.failure_reason = "{0} iops value mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} iops value mismatch,"
self.passed = False
if ratio < 6 or ratio > 10:
- self.failure_reason = "{0} iops ratio mismatch,".format(self.failure_reason)
+ self.failure_reason = f"{self.failure_reason} iops ratio mismatch,"
self.passed = False
-class Requirements(object):
- """Requirements consists of multiple run environment characteristics.
- These are to determine if a particular test can be run"""
-
- _linux = False
- _libaio = False
- _zbd = False
- _root = False
- _zoned_nullb = False
- _not_macos = False
- _not_windows = False
- _unittests = False
- _cpucount4 = False
-
- def __init__(self, fio_root):
- Requirements._not_macos = platform.system() != "Darwin"
- Requirements._not_windows = platform.system() != "Windows"
- Requirements._linux = platform.system() == "Linux"
-
- if Requirements._linux:
- config_file = os.path.join(fio_root, "config-host.h")
- contents, success = FioJobTest.get_file(config_file)
- if not success:
- print("Unable to open {0} to check requirements".format(config_file))
- Requirements._zbd = True
- else:
- Requirements._zbd = "CONFIG_HAS_BLKZONED" in contents
- Requirements._libaio = "CONFIG_LIBAIO" in contents
-
- Requirements._root = (os.geteuid() == 0)
- if Requirements._zbd and Requirements._root:
- try:
- subprocess.run(["modprobe", "null_blk"],
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- if os.path.exists("/sys/module/null_blk/parameters/zoned"):
- Requirements._zoned_nullb = True
- except Exception:
- pass
-
- if platform.system() == "Windows":
- utest_exe = "unittest.exe"
- else:
- utest_exe = "unittest"
- unittest_path = os.path.join(fio_root, "unittests", utest_exe)
- Requirements._unittests = os.path.exists(unittest_path)
-
- Requirements._cpucount4 = multiprocessing.cpu_count() >= 4
-
- req_list = [Requirements.linux,
- Requirements.libaio,
- Requirements.zbd,
- Requirements.root,
- Requirements.zoned_nullb,
- Requirements.not_macos,
- Requirements.not_windows,
- Requirements.unittests,
- Requirements.cpucount4]
- for req in req_list:
- value, desc = req()
- logging.debug("Requirements: Requirement '%s' met? %s", desc, value)
-
- @classmethod
- def linux(cls):
- """Are we running on Linux?"""
- return Requirements._linux, "Linux required"
-
- @classmethod
- def libaio(cls):
- """Is libaio available?"""
- return Requirements._libaio, "libaio required"
-
- @classmethod
- def zbd(cls):
- """Is ZBD support available?"""
- return Requirements._zbd, "Zoned block device support required"
-
- @classmethod
- def root(cls):
- """Are we running as root?"""
- return Requirements._root, "root required"
-
- @classmethod
- def zoned_nullb(cls):
- """Are zoned null block devices available?"""
- return Requirements._zoned_nullb, "Zoned null block device support required"
-
- @classmethod
- def not_macos(cls):
- """Are we running on a platform other than macOS?"""
- return Requirements._not_macos, "platform other than macOS required"
-
- @classmethod
- def not_windows(cls):
- """Are we running on a platform other than Windws?"""
- return Requirements._not_windows, "platform other than Windows required"
-
- @classmethod
- def unittests(cls):
- """Were unittests built?"""
- return Requirements._unittests, "Unittests support required"
-
- @classmethod
- def cpucount4(cls):
- """Do we have at least 4 CPUs?"""
- return Requirements._cpucount4, "4+ CPUs required"
-
-
-SUCCESS_DEFAULT = {
- 'zero_return': True,
- 'stderr_empty': True,
- 'timeout': 600,
- }
-SUCCESS_NONZERO = {
- 'zero_return': False,
- 'stderr_empty': False,
- 'timeout': 600,
- }
-SUCCESS_STDERR = {
- 'zero_return': True,
- 'stderr_empty': False,
- 'timeout': 600,
- }
TEST_LIST = [
{
'test_id': 1,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0001-52c58027.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 2,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0002-13af05ae-post.fio',
'success': SUCCESS_DEFAULT,
'pre_job': 't0002-13af05ae-pre.fio',
},
{
'test_id': 3,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0003-0ae2c6e1-post.fio',
'success': SUCCESS_NONZERO,
'pre_job': 't0003-0ae2c6e1-pre.fio',
},
{
'test_id': 4,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0004-8a99fdf6.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 5,
- 'test_class': FioJobTest_t0005,
+ 'test_class': FioJobFileTest_t0005,
'job': 't0005-f7078f7b.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 6,
- 'test_class': FioJobTest_t0006,
+ 'test_class': FioJobFileTest_t0006,
'job': 't0006-82af2a7c.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 7,
- 'test_class': FioJobTest_t0007,
+ 'test_class': FioJobFileTest_t0007,
'job': 't0007-37cf9e3c.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 8,
- 'test_class': FioJobTest_t0008,
+ 'test_class': FioJobFileTest_t0008,
'job': 't0008-ae2fafc8.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 9,
- 'test_class': FioJobTest_t0009,
+ 'test_class': FioJobFileTest_t0009,
'job': 't0009-f8b0bd10.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 10,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0010-b7aae4ba.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 11,
- 'test_class': FioJobTest_iops_rate,
+ 'test_class': FioJobFileTest_iops_rate,
'job': 't0011-5d2788d5.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 12,
- 'test_class': FioJobTest_t0012,
+ 'test_class': FioJobFileTest_t0012,
'job': 't0012.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 13,
- 'test_class': FioJobTest,
+ 'test_class': FioJobFileTest,
'job': 't0013.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
},
{
'test_id': 14,
- 'test_class': FioJobTest_t0014,
+ 'test_class': FioJobFileTest_t0014,
'job': 't0014.fio',
'success': SUCCESS_DEFAULT,
'pre_job': None,
'output_format': 'json',
'requirements': [],
},
+ {
+ 'test_id': 15,
+ 'test_class': FioJobFileTest_t0015,
+ 'job': 't0015-4e7e7898.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'output_format': 'json',
+ 'requirements': [Requirements.linux, Requirements.libaio],
+ },
+ {
+ 'test_id': 16,
+ 'test_class': FioJobFileTest_t0015,
+ 'job': 't0016-d54ae22.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'output_format': 'json',
+ 'requirements': [],
+ },
+ {
+ 'test_id': 17,
+ 'test_class': FioJobFileTest_t0015,
+ 'job': 't0017.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'output_format': 'json',
+ 'requirements': [Requirements.not_windows],
+ },
+ {
+ 'test_id': 18,
+ 'test_class': FioJobFileTest,
+ 'job': 't0018.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [Requirements.linux, Requirements.io_uring],
+ },
+ {
+ 'test_id': 19,
+ 'test_class': FioJobFileTest_t0019,
+ 'job': 't0019.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 20,
+ 'test_class': FioJobFileTest_t0020,
+ 'job': 't0020.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 21,
+ 'test_class': FioJobFileTest_t0020,
+ 'job': 't0021.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 22,
+ 'test_class': FioJobFileTest_t0022,
+ 'job': 't0022.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 23,
+ 'test_class': FioJobFileTest_t0023,
+ 'job': 't0023.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 24,
+ 'test_class': FioJobFileTest_t0024,
+ 'job': 't0024.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 25,
+ 'test_class': FioJobFileTest_t0025,
+ 'job': 't0025.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'output_format': 'json',
+ 'requirements': [],
+ },
+ {
+ 'test_id': 26,
+ 'test_class': FioJobFileTest,
+ 'job': 't0026.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [Requirements.not_windows],
+ },
+ {
+ 'test_id': 27,
+ 'test_class': FioJobFileTest_t0027,
+ 'job': 't0027.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 28,
+ 'test_class': FioJobFileTest,
+ 'job': 't0028-c6cade16.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 29,
+ 'test_class': FioJobFileTest_t0029,
+ 'job': 't0029.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'output_format': 'json',
+ 'requirements': [],
+ },
+ {
+ 'test_id': 30,
+ 'test_class': FioJobFileTest,
+ 'job': 't0030.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': None,
+ 'pre_success': None,
+ 'parameters': ['--bandwidth-log'],
+ 'requirements': [],
+ },
+ {
+ 'test_id': 31,
+ 'test_class': FioJobFileTest,
+ 'job': 't0031.fio',
+ 'success': SUCCESS_DEFAULT,
+ 'pre_job': 't0031-pre.fio',
+ 'pre_success': SUCCESS_DEFAULT,
+ 'requirements': [Requirements.linux, Requirements.libaio],
+ },
{
'test_id': 1000,
'test_class': FioExeTest,
'test_id': 1006,
'test_class': FioExeTest,
'exe': 't/strided.py',
- 'parameters': ['{fio_path}'],
+ 'parameters': ['--fio', '{fio_path}'],
'success': SUCCESS_DEFAULT,
'requirements': [],
},
{
'test_id': 1007,
'test_class': FioExeTest,
- 'exe': 't/zbd/run-tests-against-regular-nullb',
- 'parameters': None,
+ 'exe': 't/zbd/run-tests-against-nullb',
+ 'parameters': ['-s', '1'],
'success': SUCCESS_DEFAULT,
'requirements': [Requirements.linux, Requirements.zbd,
Requirements.root],
{
'test_id': 1008,
'test_class': FioExeTest,
- 'exe': 't/zbd/run-tests-against-zoned-nullb',
- 'parameters': None,
+ 'exe': 't/zbd/run-tests-against-nullb',
+ 'parameters': ['-s', '2'],
'success': SUCCESS_DEFAULT,
'requirements': [Requirements.linux, Requirements.zbd,
Requirements.root, Requirements.zoned_nullb],
'success': SUCCESS_DEFAULT,
'requirements': [],
},
+ {
+ 'test_id': 1012,
+ 'test_class': FioExeTest,
+ 'exe': 't/log_compression.py',
+ 'parameters': ['-f', '{fio_path}'],
+ 'success': SUCCESS_DEFAULT,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 1013,
+ 'test_class': FioExeTest,
+ 'exe': 't/random_seed.py',
+ 'parameters': ['-f', '{fio_path}'],
+ 'success': SUCCESS_DEFAULT,
+ 'requirements': [],
+ },
+ {
+ 'test_id': 1014,
+ 'test_class': FioExeTest,
+ 'exe': 't/nvmept.py',
+ 'parameters': ['-f', '{fio_path}', '--dut', '{nvmecdev}'],
+ 'success': SUCCESS_DEFAULT,
+ 'requirements': [Requirements.linux, Requirements.nvmecdev],
+ },
+ {
+ 'test_id': 1015,
+ 'test_class': FioExeTest,
+ 'exe': 't/nvmept_trim.py',
+ 'parameters': ['-f', '{fio_path}', '--dut', '{nvmecdev}'],
+ 'success': SUCCESS_DEFAULT,
+ 'requirements': [Requirements.linux, Requirements.nvmecdev],
+ },
]
help='skip requirements checking')
parser.add_argument('-p', '--pass-through', action='append',
help='pass-through an argument to an executable test')
+ parser.add_argument('--nvmecdev', action='store', default=None,
+ help='NVMe character device for **DESTRUCTIVE** testing (e.g., /dev/ng0n1)')
args = parser.parse_args()
return args
if args.pass_through:
for arg in args.pass_through:
if not ':' in arg:
- print("Invalid --pass-through argument '%s'" % arg)
+ print(f"Invalid --pass-through argument '{arg}'")
print("Syntax for --pass-through is TESTNUMBER:ARGUMENT")
return
split = arg.split(":", 1)
fio_root = args.fio_root
else:
fio_root = str(Path(__file__).absolute().parent.parent)
- print("fio root is %s" % fio_root)
+ print(f"fio root is {fio_root}")
if args.fio:
fio_path = args.fio
else:
fio_exe = "fio"
fio_path = os.path.join(fio_root, fio_exe)
- print("fio path is %s" % fio_path)
+ print(f"fio path is {fio_path}")
if not shutil.which(fio_path):
print("Warning: fio executable not found")
artifact_root = args.artifact_root if args.artifact_root else \
- "fio-test-{0}".format(time.strftime("%Y%m%d-%H%M%S"))
+ f"fio-test-{time.strftime('%Y%m%d-%H%M%S')}"
os.mkdir(artifact_root)
- print("Artifact directory is %s" % artifact_root)
+ print(f"Artifact directory is {artifact_root}")
if not args.skip_req:
- req = Requirements(fio_root)
-
- passed = 0
- failed = 0
- skipped = 0
-
- for config in TEST_LIST:
- if (args.skip and config['test_id'] in args.skip) or \
- (args.run_only and config['test_id'] not in args.run_only):
- skipped = skipped + 1
- print("Test {0} SKIPPED (User request)".format(config['test_id']))
- continue
-
- if issubclass(config['test_class'], FioJobTest):
- if config['pre_job']:
- fio_pre_job = os.path.join(fio_root, 't', 'jobs',
- config['pre_job'])
- else:
- fio_pre_job = None
- if config['pre_success']:
- fio_pre_success = config['pre_success']
- else:
- fio_pre_success = None
- if 'output_format' in config:
- output_format = config['output_format']
- else:
- output_format = 'normal'
- test = config['test_class'](
- fio_path,
- os.path.join(fio_root, 't', 'jobs', config['job']),
- config['success'],
- fio_pre_job=fio_pre_job,
- fio_pre_success=fio_pre_success,
- output_format=output_format)
- desc = config['job']
- elif issubclass(config['test_class'], FioExeTest):
- exe_path = os.path.join(fio_root, config['exe'])
- if config['parameters']:
- parameters = [p.format(fio_path=fio_path) for p in config['parameters']]
- else:
- parameters = []
- if Path(exe_path).suffix == '.py' and platform.system() == "Windows":
- parameters.insert(0, exe_path)
- exe_path = "python.exe"
- if config['test_id'] in pass_through:
- parameters += pass_through[config['test_id']].split()
- test = config['test_class'](exe_path, parameters,
- config['success'])
- desc = config['exe']
- else:
- print("Test {0} FAILED: unable to process test config".format(config['test_id']))
- failed = failed + 1
- continue
-
- if not args.skip_req:
- reqs_met = True
- for req in config['requirements']:
- reqs_met, reason = req()
- logging.debug("Test %d: Requirement '%s' met? %s", config['test_id'], reason,
- reqs_met)
- if not reqs_met:
- break
- if not reqs_met:
- print("Test {0} SKIPPED ({1}) {2}".format(config['test_id'], reason, desc))
- skipped = skipped + 1
- continue
-
- test.setup(artifact_root, config['test_id'])
- test.run()
- test.check_result()
- if test.passed:
- result = "PASSED"
- passed = passed + 1
- else:
- result = "FAILED: {0}".format(test.failure_reason)
- failed = failed + 1
- contents, _ = FioJobTest.get_file(test.stderr_file)
- logging.debug("Test %d: stderr:\n%s", config['test_id'], contents)
- contents, _ = FioJobTest.get_file(test.stdout_file)
- logging.debug("Test %d: stdout:\n%s", config['test_id'], contents)
- print("Test {0} {1} {2}".format(config['test_id'], result, desc))
-
- print("{0} test(s) passed, {1} failed, {2} skipped".format(passed, failed, skipped))
-
+ Requirements(fio_root, args)
+
+ test_env = {
+ 'fio_path': fio_path,
+ 'fio_root': fio_root,
+ 'artifact_root': artifact_root,
+ 'pass_through': pass_through,
+ }
+ _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
sys.exit(failed)
#
# sgunmap-test.py
#
-# Limited functonality test for trim workloads using fio's sg ioengine
+# Limited functionality test for trim workloads using fio's sg ioengine
# This checks only the three sets of reported iodepths
#
# !!!WARNING!!!
#
# steadystate_tests.py
#
-# Test option parsing and functonality for fio's steady state detection feature.
+# Test option parsing and functionality for fio's steady state detection feature.
#
# steadystate_tests.py --read file-for-read-testing --write file-for-write-testing ./fio
#
{'s': False, 'timeout': 20, 'numjobs': 2},
{'s': True, 'timeout': 100, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 5, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True},
{'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True},
+ {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True, 'ss_interval': 5},
]
jobnum = 0
#!/usr/bin/env python3
-#
+
+"""
# strided.py
#
# Test zonemode=strided. This uses the null ioengine when no file is
# specified. If a file is specified, use it for randdom read testing.
# Some of the zoneranges in the tests are 16MiB. So when using a file
-# a minimum size of 32MiB is recommended.
+# a minimum size of 64MiB is recommended.
#
# USAGE
# python strided.py fio-executable [-f file/device]
# EXAMPLES
# python t/strided.py ./fio
# python t/strided.py ./fio -f /dev/sda
-# dd if=/dev/zero of=temp bs=1M count=32
+# dd if=/dev/zero of=temp bs=1M count=64
# python t/strided.py ./fio -f temp
#
-# REQUIREMENTS
-# Python 2.6+
-#
# ===TEST MATRIX===
#
# --zonemode=strided, zoneskip unset
# zonesize<zonerange all blocks inside zone
#
# w/o randommap all blocks inside zone
-#
+"""
-from __future__ import absolute_import
-from __future__ import print_function
import os
import sys
+import time
import argparse
-import subprocess
+from pathlib import Path
+from fiotestlib import FioJobCmdTest, run_fio_tests
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('fio',
- help='path to fio executable (e.g., ./fio)')
- parser.add_argument('-f', '--filename', help="file/device to test")
- args = parser.parse_args()
+class StridedTest(FioJobCmdTest):
+ """Test zonemode=strided."""
- return args
+ def setup(self, parameters):
+ fio_args = [
+ "--name=strided",
+ "--zonemode=strided",
+ "--log_offset=1",
+ "--randrepeat=0",
+ "--rw=randread",
+ f"--write_iops_log={self.filenames['iopslog']}",
+ f"--output={self.filenames['output']}",
+ f"--zonerange={self.fio_opts['zonerange']}",
+ f"--zonesize={self.fio_opts['zonesize']}",
+ f"--bs={self.fio_opts['bs']}",
+ ]
+ for opt in ['norandommap', 'random_generator', 'offset']:
+ if opt in self.fio_opts:
+ option = f"--{opt}={self.fio_opts[opt]}"
+ fio_args.append(option)
-def run_fio(fio, test, index):
- filename = "strided"
- fio_args = [
- "--max-jobs=16",
- "--name=strided",
- "--zonemode=strided",
- "--log_offset=1",
- "--randrepeat=0",
- "--rw=randread",
- "--write_iops_log={0}{1:03d}".format(filename, index),
- "--output={0}{1:03d}.out".format(filename, index),
- "--zonerange={zonerange}".format(**test),
- "--zonesize={zonesize}".format(**test),
- "--bs={bs}".format(**test),
- ]
- if 'norandommap' in test:
- fio_args.append('--norandommap')
- if 'random_generator' in test:
- fio_args.append('--random_generator={random_generator}'.format(**test))
- if 'offset' in test:
- fio_args.append('--offset={offset}'.format(**test))
- if 'filename' in test:
- fio_args.append('--filename={filename}'.format(**test))
- fio_args.append('--filesize={filesize})'.format(**test))
- else:
- fio_args.append('--ioengine=null')
- fio_args.append('--size={size}'.format(**test))
- fio_args.append('--io_size={io_size}'.format(**test))
- fio_args.append('--filesize={size})'.format(**test))
-
- output = subprocess.check_output([fio] + fio_args, universal_newlines=True)
-
- f = open("{0}{1:03d}_iops.1.log".format(filename, index), "r")
- log = f.read()
- f.close()
-
- return log
-
-
-def check_output(iops_log, test):
- zonestart = 0 if 'offset' not in test else test['offset']
- iospersize = test['zonesize'] / test['bs']
- iosperrange = test['zonerange'] / test['bs']
- iosperzone = 0
- lines = iops_log.split('\n')
- zoneset = set()
-
- for line in lines:
- if len(line) == 0:
- continue
-
- if iosperzone == iospersize:
- # time to move to a new zone
- iosperzone = 0
- zoneset = set()
- zonestart += test['zonerange']
- if zonestart >= test['filesize']:
- zonestart = 0 if 'offset' not in test else test['offset']
-
- iosperzone = iosperzone + 1
- tokens = line.split(',')
- offset = int(tokens[4])
- if offset < zonestart or offset >= zonestart + test['zonerange']:
- print("Offset {0} outside of zone starting at {1}".format(
- offset, zonestart))
- return False
-
- # skip next section if norandommap is enabled with no
- # random_generator or with a random_generator != lfsr
- if 'norandommap' in test:
- if 'random_generator' in test:
- if test['random_generator'] != 'lfsr':
- continue
- else:
+ if 'filename' in self.fio_opts:
+ for opt in ['filename', 'filesize']:
+ option = f"--{opt}={self.fio_opts[opt]}"
+ fio_args.append(option)
+ else:
+ fio_args.append('--ioengine=null')
+ for opt in ['size', 'io_size', 'filesize']:
+ option = f"--{opt}={self.fio_opts[opt]}"
+ fio_args.append(option)
+
+ super().setup(fio_args)
+
+ def check_result(self):
+ """Validate the logged offsets against the strided zone layout.
+
+ Runs the parent check first and stops if it already failed. Then walks
+ the iops log line by line (index 4 of each comma-separated line is the
+ offset) and checks that every offset lies inside the current zone.
+ Unless norandommap is set without the lfsr generator, it also checks
+ that no block repeats and that a full pass covers the whole zone range.
+
+ Any violation prints a diagnostic and sets self.passed to False.
+ """
+ super().check_result()
+ if not self.passed:
+ return
+
+ zonestart = 0 if 'offset' not in self.fio_opts else self.fio_opts['offset']
+ iospersize = self.fio_opts['zonesize'] / self.fio_opts['bs']
+ iosperrange = self.fio_opts['zonerange'] / self.fio_opts['bs']
+ iosperzone = 0
+ lines = self.iops_log_lines.split('\n')
+ zoneset = set()
+
+ for line in lines:
+ if len(line) == 0:
continue
- # we either have a random map enabled or we
- # are using an LFSR
- # so all blocks should be unique and we should have
- # covered the entire zone when iosperzone % iosperrange == 0
- block = (offset - zonestart) / test['bs']
- if block in zoneset:
- print("Offset {0} in zone already touched".format(offset))
- return False
+ if iosperzone == iospersize:
+ # time to move to a new zone
+ iosperzone = 0
+ zoneset = set()
+ zonestart += self.fio_opts['zonerange']
+ if zonestart >= self.fio_opts['filesize']:
+ zonestart = 0 if 'offset' not in self.fio_opts else self.fio_opts['offset']
- zoneset.add(block)
- if iosperzone % iosperrange == 0:
- if len(zoneset) != iosperrange:
- print("Expected {0} blocks in zone but only saw {1}".format(
- iosperrange, len(zoneset)))
- return False
- zoneset = set()
+ iosperzone = iosperzone + 1
+ tokens = line.split(',')
+ offset = int(tokens[4])
+ if offset < zonestart or offset >= zonestart + self.fio_opts['zonerange']:
+ print(f"Offset {offset} outside of zone starting at {zonestart}")
+ # a bare return would leave self.passed True and report PASSED
+ self.passed = False
+ return
- return True
+ # skip next section if norandommap is enabled with no
+ # random_generator or with a random_generator != lfsr
+ if 'norandommap' in self.fio_opts:
+ if 'random_generator' in self.fio_opts:
+ if self.fio_opts['random_generator'] != 'lfsr':
+ continue
+ else:
+ continue
+ # we either have a random map enabled or we
+ # are using an LFSR
+ # so all blocks should be unique and we should have
+ # covered the entire zone when iosperzone % iosperrange == 0
+ block = (offset - zonestart) / self.fio_opts['bs']
+ if block in zoneset:
+ print(f"Offset {offset} in zone already touched")
+ self.passed = False
+ return
+
+ zoneset.add(block)
+ if iosperzone % iosperrange == 0:
+ if len(zoneset) != iosperrange:
+ print(f"Expected {iosperrange} blocks in zone but only saw {len(zoneset)}")
+ self.passed = False
+ return
+ zoneset = set()
+
+
+TEST_LIST = [ # randommap enabled
+ {
+ "test_id": 1,
+ "fio_opts": {
+ "zonerange": 4096,
+ "zonesize": 4096,
+ "bs": 4096,
+ "offset": 8*4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 2,
+ "fio_opts": {
+ "zonerange": 4096,
+ "zonesize": 4096,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 3,
+ "fio_opts": {
+ "zonerange": 16*1024*1024,
+ "zonesize": 16*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 4,
+ "fio_opts": {
+ "zonerange": 4096,
+ "zonesize": 4*4096,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 5,
+ "fio_opts": {
+ "zonerange": 16*1024*1024,
+ "zonesize": 32*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 6,
+ "fio_opts": {
+ "zonerange": 8192,
+ "zonesize": 4096,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 7,
+ "fio_opts": {
+ "zonerange": 16*1024*1024,
+ "zonesize": 8*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ # lfsr
+ {
+ "test_id": 8,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 4096*1024,
+ "zonesize": 4096*1024,
+ "bs": 4096,
+ "offset": 8*4096*1024,
+ "size": 16*4096*1024,
+ "io_size": 16*4096*1024,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 9,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 4096*1024,
+ "zonesize": 4096*1024,
+ "bs": 4096,
+ "size": 16*4096*1024,
+ "io_size": 16*4096*1024,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 10,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 16*1024*1024,
+ "zonesize": 16*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 11,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 4096*1024,
+ "zonesize": 4*4096*1024,
+ "bs": 4096,
+ "size": 16*4096*1024,
+ "io_size": 16*4096*1024,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 12,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 16*1024*1024,
+ "zonesize": 32*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 13,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 8192*1024,
+ "zonesize": 4096*1024,
+ "bs": 4096,
+ "size": 16*4096*1024,
+ "io_size": 16*4096*1024,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 14,
+ "fio_opts": {
+ "random_generator": "lfsr",
+ "zonerange": 16*1024*1024,
+ "zonesize": 8*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ # norandommap
+ {
+ "test_id": 15,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 4096,
+ "zonesize": 4096,
+ "bs": 4096,
+ "offset": 8*4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 16,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 4096,
+ "zonesize": 4096,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 17,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 16*1024*1024,
+ "zonesize": 16*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 18,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 4096,
+ "zonesize": 8192,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 19,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 16*1024*1024,
+ "zonesize": 32*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*204,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 20,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 8192,
+ "zonesize": 4096,
+ "bs": 4096,
+ "size": 16*4096,
+ "io_size": 16*4096,
+ },
+ "test_class": StridedTest,
+ },
+ {
+ "test_id": 21,
+ "fio_opts": {
+ "norandommap": 1,
+ "zonerange": 16*1024*1024,
+ "zonesize": 8*1024*1024,
+ "bs": 4096,
+ "size": 256*1024*1024,
+ "io_size": 256*1024*1024,
+ },
+ "test_class": StridedTest,
+ },
+]
+
+
+def parse_args():
+ """Parse command-line arguments."""
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-f', '--fio', help='path to fio executable (e.g., ./fio)')
+ parser.add_argument('-a', '--artifact-root', help='artifact root directory')
+ parser.add_argument('-s', '--skip', nargs='+', type=int,
+ help='list of test(s) to skip')
+ parser.add_argument('-o', '--run-only', nargs='+', type=int,
+ help='list of test(s) to run, skipping all others')
+ parser.add_argument('--dut',
+ help='target file/device to test.')
+ args = parser.parse_args()
+
+ return args
+
+
+def main():
+ """Run zonemode=strided tests."""
-if __name__ == '__main__':
args = parse_args()
- tests = [ # randommap enabled
- {
- "zonerange": 4096,
- "zonesize": 4096,
- "bs": 4096,
- "offset": 8*4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "zonerange": 4096,
- "zonesize": 4096,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "zonerange": 16*1024*1024,
- "zonesize": 16*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "zonerange": 4096,
- "zonesize": 4*4096,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "zonerange": 16*1024*1024,
- "zonesize": 32*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "zonerange": 8192,
- "zonesize": 4096,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "zonerange": 16*1024*1024,
- "zonesize": 8*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- # lfsr
- {
- "random_generator": "lfsr",
- "zonerange": 4096*1024,
- "zonesize": 4096*1024,
- "bs": 4096,
- "offset": 8*4096*1024,
- "size": 16*4096*1024,
- "io_size": 16*4096*1024,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 4096*1024,
- "zonesize": 4096*1024,
- "bs": 4096,
- "size": 16*4096*1024,
- "io_size": 16*4096*1024,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 16*1024*1024,
- "zonesize": 16*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 4096*1024,
- "zonesize": 4*4096*1024,
- "bs": 4096,
- "size": 16*4096*1024,
- "io_size": 16*4096*1024,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 16*1024*1024,
- "zonesize": 32*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 8192*1024,
- "zonesize": 4096*1024,
- "bs": 4096,
- "size": 16*4096*1024,
- "io_size": 16*4096*1024,
- },
- {
- "random_generator": "lfsr",
- "zonerange": 16*1024*1024,
- "zonesize": 8*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- # norandommap
- {
- "norandommap": 1,
- "zonerange": 4096,
- "zonesize": 4096,
- "bs": 4096,
- "offset": 8*4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "norandommap": 1,
- "zonerange": 4096,
- "zonesize": 4096,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "norandommap": 1,
- "zonerange": 16*1024*1024,
- "zonesize": 16*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "norandommap": 1,
- "zonerange": 4096,
- "zonesize": 8192,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "norandommap": 1,
- "zonerange": 16*1024*1024,
- "zonesize": 32*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*204,
- },
- {
- "norandommap": 1,
- "zonerange": 8192,
- "zonesize": 4096,
- "bs": 4096,
- "size": 16*4096,
- "io_size": 16*4096,
- },
- {
- "norandommap": 1,
- "zonerange": 16*1024*1024,
- "zonesize": 8*1024*1024,
- "bs": 4096,
- "size": 256*1024*1024,
- "io_size": 256*1024*1024,
- },
-
- ]
-
- index = 1
- passed = 0
- failed = 0
-
- if args.filename:
- statinfo = os.stat(args.filename)
+ artifact_root = args.artifact_root if args.artifact_root else \
+ f"strided-test-{time.strftime('%Y%m%d-%H%M%S')}"
+ os.mkdir(artifact_root)
+ print(f"Artifact directory is {artifact_root}")
+
+ if args.fio:
+ fio_path = str(Path(args.fio).absolute())
+ else:
+ fio_path = 'fio'
+ print(f"fio path is {fio_path}")
+
+ if args.dut:
+ statinfo = os.stat(args.dut)
filesize = statinfo.st_size
if filesize == 0:
- f = os.open(args.filename, os.O_RDONLY)
+ f = os.open(args.dut, os.O_RDONLY)
filesize = os.lseek(f, 0, os.SEEK_END)
os.close(f)
- for test in tests:
- if args.filename:
- test['filename'] = args.filename
- test['filesize'] = filesize
+ for test in TEST_LIST:
+ if args.dut:
+ test['fio_opts']['filename'] = os.path.abspath(args.dut)
+ test['fio_opts']['filesize'] = filesize
else:
- test['filesize'] = test['size']
- iops_log = run_fio(args.fio, test, index)
- status = check_output(iops_log, test)
- print("Test {0} {1}".format(index, ("PASSED" if status else "FAILED")))
- if status:
- passed = passed + 1
- else:
- failed = failed + 1
- index = index + 1
+ test['fio_opts']['filesize'] = test['fio_opts']['size']
- print("{0} tests passed, {1} failed".format(passed, failed))
+ test_env = {
+ 'fio_path': fio_path,
+ 'fio_root': str(Path(__file__).absolute().parent.parent),
+ 'artifact_root': artifact_root,
+ 'basename': 'strided',
+ }
+ _, failed, _ = run_fio_tests(TEST_LIST, test_env, args)
sys.exit(failed)
+
+
+if __name__ == '__main__':
+ main()
* accuracy because the (ticks * clock_mult) product used for final
* fractional chunk
*
- * iv) 64-bit arithmetic with the clock ticks to nsec conversion occuring in
+ * iv) 64-bit arithmetic with the clock ticks to nsec conversion occurring in
* two stages. This is carried out using locks to update the number of
* large time chunks (MAX_CLOCK_SEC_2STAGE) that have elapsed.
*
sg_inq=$(type -p sg_inq 2>/dev/null)
zbc_report_zones=$(type -p zbc_report_zones 2>/dev/null)
zbc_reset_zone=$(type -p zbc_reset_zone 2>/dev/null)
+zbc_close_zone=$(type -p zbc_close_zone 2>/dev/null)
zbc_info=$(type -p zbc_info 2>/dev/null)
if [ -z "${blkzone}" ] &&
{ [ -z "${zbc_report_zones}" ] || [ -z "${zbc_reset_zone}" ]; }; then
"${blkzone}" report -c 1 -o 0 "${dev}" | grep -q 'cap '
}
+has_command() {
+ local cmd="${1}"
+
+ cmd_path=$(type -p "${cmd}" 2>/dev/null)
+ if [ -z "${cmd_path}" ]; then
+ echo "${cmd} is not available"
+ return 1
+ fi
+ return 0
+}
+
# Whether or not $1 (/dev/...) is a NVME ZNS device.
is_nvme_zns() {
local s
fi
}
+# Check zone capacity of each zone and report block size aligned to the zone
+# capacities. If zone capacity is same as zone size for zones, report zone size.
+#
+# $1: device path
+# $2: zone size, used as the starting block size candidate
+# Prints the resulting block size on stdout.
+zone_cap_bs() {
+ local dev="${1}"
+ local zone_size="${2}"
+ # Reduces each blkzone report line to its "len" and "cap" values.
+ local sed_str='s/.*len \([0-9A-Za-z]*\), cap \([0-9A-Za-z]*\).*/\1 \2/p'
+ local cap bs="$zone_size"
+
+ # When blkzone command is neither available nor relevant to the
+ # test device, or when blkzone command does not report capacity,
+ # assume that zone capacity is same as zone size for all zones.
+ if [ -z "${blkzone}" ] || [ -z "$is_zbd" ] || [ -c "$dev" ] ||
+ ! blkzone_reports_capacity "${dev}"; then
+ echo "$zone_size"
+ return
+ fi
+
+ # line[0] is the zone length and line[1] the zone capacity.
+ while read -r -a line; do
+ # Zones whose capacity equals their length add no constraint.
+ ((line[0] == line[1])) && continue
+ # NOTE(review): the multiplication suggests cap is reported in
+ # 512-byte sectors -- confirm against blkzone output.
+ cap=$((line[1] * 512))
+ # Halve bs until it divides the capacity, never going below 512.
+ while ((bs > 512 && cap % bs)); do
+ bs=$((bs / 2))
+ done
+ # NOTE(review): blkzone is invoked by name here, not via ${blkzone}.
+ done < <(blkzone report "${dev}" | sed -n "${sed_str}")
+
+ echo "$bs"
+}
+
# Reports the starting sector and length of the first sequential zone of device
# $1.
first_sequential_zone() {
if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
${blkzone} report "$dev" |
- sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*type:[[:blank:]]2(.*/\1 \2/p' |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]]\([2]\)(.*/\1 \2/p' |
{
read -r starting_sector length &&
# Convert from hex to decimal
}
else
${zbc_report_zones} "$dev" |
- sed -n 's/^Zone [0-9]*: type 0x2 .*, sector \([0-9]*\), \([0-9]*\) sectors,.*$/\1 \2/p' |
+ sed -n 's/^Zone [0-9]*: type 0x2 .*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*, sector \([0-9]*\), \([0-9]*\) sectors.*$/\1 \2/p' |
head -n1
fi
}
echo $((capacity * 512))
}
+# Reports the starting sector of the first zone of device $1 that is not in
+# offline (or similar) condition. Only the starting sector is printed, as a
+# decimal number. Prints 0 when $is_zbd is empty.
+first_online_zone() {
+ local dev=$1
+
+ if [ -z "$is_zbd" ]; then
+ echo 0
+ return
+ fi
+
+ if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
+ # Keep report lines whose zcond is 14 or 0-4 and whose type is 1 or 2,
+ # and reduce them to the "start" value.
+ ${blkzone} report "$dev" |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]][12](.*/\1/p' |
+ head -n1 |
+ {
+ read -r starting_sector &&
+ # Convert from hex to decimal
+ echo $((starting_sector))
+ }
+ else
+ # Same selection on zbc_report_zones output: type 0x1 or 0x2 and a
+ # cond of 0x0-0x4 or 0xe; the sector is printed as reported.
+ ${zbc_report_zones} "$dev" |
+ sed -n 's/^Zone[[:blank:]][0-9]*:[[:blank:]]type[[:blank:]]0x[12][[:blank:]].*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*,[[:blank:]]sector[[:blank:]]\([0-9]*\),.*$/\1/p' |
+ head -n1
+ fi
+}
+
+# Reports the starting sector of the last zone of device $1 that is not in
+# offline (or similar) condition. Only the starting sector is printed, as a
+# decimal number. Prints 0 when $is_zbd is empty.
+last_online_zone() {
+ local dev=$1
+
+ if [ -z "$is_zbd" ]; then
+ echo 0
+ return
+ fi
+
+ if [ -n "${blkzone}" ] && [ ! -n "${use_libzbc}" ]; then
+ # Keep report lines whose zcond is 14 or 0-4 and whose type is 1 or 2,
+ # reduce them to the "start" value and take the last one.
+ ${blkzone} report "$dev" |
+ sed -n 's/^[[:blank:]]*start:[[:blank:]]\([0-9a-zA-Z]*\),[[:blank:]]len[[:blank:]]\([0-9a-zA-Z]*\),.*zcond:\(14\|[[:blank:]][0-4]\)(.*type:[[:blank:]][12](.*/\1/p' |
+ tail -1 |
+ {
+ read -r starting_sector &&
+ # Convert from hex to decimal
+ echo $((starting_sector))
+ }
+ else
+ # Same selection on zbc_report_zones output: type 0x1 or 0x2 and a
+ # cond of 0x0-0x4 or 0xe; the sector is printed as reported.
+ ${zbc_report_zones} "$dev" |
+ sed -n 's/^Zone[[:blank:]][0-9]*:[[:blank:]]type[[:blank:]]0x[12][[:blank:]].*,[[:blank:]]cond[[:blank:]]0x[0-4e][[:blank:]].*,[[:blank:]]sector[[:blank:]]\([0-9]*\),.*$/\1/p' |
+ tail -1
+ fi
+}
+
+# Get max_open_zones of SMR drives using sg_inq or libzbc tools. Two test cases
+# 31 and 32 use this max_open_zones value. The test case 31 uses max_open_zones
+# to decide number of write target zones. The test case 32 passes max_open_zones
+# value to fio with --max_open_zones option. Of note is that fio itself has the
+# feature to get max_open_zones from the device through sysfs or ioengine
+# specific implementation. This max_open_zones fetch by test script is required
+# in case fio is running on an old Linux kernel version which lacks
+# max_open_zones in sysfs, or which lacks zoned block device support completely.
max_open_zones() {
local dev=$1
+ local realdev syspath
- if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then
+ realdev=$(readlink -f "$dev")
+ syspath=/sys/block/${realdev##*/}/queue/max_open_zones
+
+ if [ -b "${realdev}" ] && [ -r "${syspath}" ]; then
+ cat ${syspath}
+ elif [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then
if ! ${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" \
> /dev/null 2>&1; then
- # Non scsi device such as null_blk can not return max open zones.
- # Use default value.
- echo 128
+ # When sg_inq can not get max open zones, specify 0 which indicates
+ # fio to get max open zones limit from the device.
+ echo 0
else
${sg_inq} -e --page=0xB6 --len=20 --hex "$dev" | tail -1 |
{
echo ${max_nr_open_zones}
}
fi
- else
+ elif [ -n "${use_libzbc}" ]; then
${zbc_report_zones} "$dev" |
sed -n 's/^[[:blank:]]*Maximum number of open sequential write required zones:[[:blank:]]*//p'
+ else
+ echo 0
+ fi
+}
+
+# If sysfs provides it, get the max_active_zones limit of the zoned block
+# device $1. Prints the limit on stdout, or 0 when the sysfs attribute does
+# not exist.
+max_active_zones() {
+    local dev=$1
+    local realdev sys_queue
+
+    # Resolve symbolic links (e.g. /dev/disk/by-id/...) before deriving the
+    # sysfs directory name, in the same way max_open_zones does. Without this
+    # a symlinked device path never matches /sys/block/<name> and the limit
+    # is silently reported as 0.
+    realdev=$(readlink -f "$dev")
+    sys_queue="/sys/block/${realdev##*/}/queue"
+
+    if [[ -e "$sys_queue/max_active_zones" ]]; then
+        cat "$sys_queue/max_active_zones"
+        return
+    fi
+    echo 0
+}
+
+# Get minimum block size to write to seq zones. Refer the sysfs attribute
+# zone_write_granularity which shows the valid minimum size regardless of zoned
+# block device type. If the sysfs attribute is not available, refer physical
+# block size for rotational SMR drives. For non-rotational devices such as ZNS
+# devices, refer logical block size.
+min_seq_write_size() {
+ local sys_path="/sys/block/$1/queue"
+ local -i size=0
+
+ if [[ -r "$sys_path/zone_write_granularity" ]]; then
+ size=$(<"$sys_path/zone_write_granularity")
+ fi
+
+ if ((size)); then
+ echo "$size"
+ elif (($(<"$sys_path/rotational"))); then
+ cat "$sys_path/physical_block_size"
+ else
+ cat "$sys_path/logical_block_size"
+ fi
+}
+
+# Print 1 when reads on device $1 are considered unrestricted (URSWRZ bit),
+# otherwise 0. With sg_inq available and libzbc not in use, the bit is taken
+# from VPD page 0xB6; otherwise the zbc_info output is searched for the
+# "Read commands are unrestricted" line.
+urswrz() {
+ local dev=$1
+
+ if [ -n "${sg_inq}" ] && [ ! -n "${use_libzbc}" ]; then
+ if ! ${sg_inq} -e --page=0xB6 --len=10 --hex "$dev" \
+ > /dev/null 2>&1; then
+ # Couldn't get URSWRZ bit. Assume the reads are unrestricted
+ # because this configuration is more common.
+ echo 1
+ else
+ # Take the last line of the hex dump and print the lowest bit of
+ # its fifth data byte; print 0 if the line cannot be read.
+ ${sg_inq} -e --page=0xB6 --len=10 --hex "$dev" | tail -1 |
+ {
+ read -r offset b0 b1 b2 b3 b4 trailer && \
+ echo $(( $b4 & 0x01 )) || echo 0
+ }
+ fi
+ else
+ # sed only emits a line for the "unrestricted" wording, so grep
+ # succeeds exactly when such a line exists.
+ ${zbc_info} "$dev" |
+ sed -n 's/^[[:blank:]].*Read commands are \(un\)restricted*/\1/p' | grep -q ^ && echo 1 || echo 0
fi
}
[[ -z "$(${zbc_info} "$dev" | grep "is not a zoned block device")" ]]
}
-zbc_logical_block_size() {
+zbc_physical_block_size() {
local dev=$1
${zbc_info} "$dev" |
- grep "logical blocks" |
- sed -n 's/^[[:blank:]]*[0-9]* logical blocks of[[:blank:]]*//p' |
+ grep "physical blocks" |
+ sed -n 's/^[[:blank:]]*[0-9]* physical blocks of[[:blank:]]*//p' |
sed 's/ B//'
}
fi
}
+# Close the zone of device $1 that starts at offset $2, where the offset is
+# given in 512 byte sectors. blkzone is used when it is available and libzbc
+# is not requested; zbc_close_zone is used otherwise.
+close_zone() {
+    local dev=$1 offset=$2
+
+    if [ -z "${blkzone}" ] || [ -n "${use_libzbc}" ]; then
+        ${zbc_close_zone} -sector "$dev" "${offset}" >/dev/null
+        return
+    fi
+
+    ${blkzone} close -o "${offset}" -c 1 "$dev"
+}
+
# Extract the number of bytes that have been transferred from a line like
# READ: bw=6847KiB/s (7011kB/s), 6847KiB/s-6847KiB/s (7011kB/s-7011kB/s), io=257MiB (269MB), run=38406-38406msec
fio_io() {
--- /dev/null
+#!/bin/bash
+#
+# Copyright (C) 2020 Western Digital Corporation or its affiliates.
+#
+# This file is released under the GPL.
+#
+# Run t/zbd/test-zbd-support script against a variety of conventional,
+# zoned and mixed zone configurations.
+#
+
+# Print the command line help and terminate the script with exit status 1.
+# The default shown for -o is the current value of set_max_open.
+usage()
+{
+ echo "This script runs the tests from t/zbd/test-zbd-support script"
+ echo "against a nullb device in a variety of conventional and zoned"
+ echo "configurations."
+ echo "Usage: ${0} [OPTIONS]"
+ echo "Options:"
+ echo -e "\t-h Show this message."
+ echo -e "\t-L List the device layouts for every section without running"
+ echo -e "\t tests."
+ echo -e "\t-s <#section> Only run the section with the given number."
+ echo -e "\t-t <#test> Only run the test with the given number in every section."
+ echo -e "\t-o <max_open_zones> Specify MaxOpen value, (${set_max_open} by default)."
+ echo -e "\t-n <#number of runs> Set the number of times to run the entire suite "
+ echo -e "\t or an individual section/test."
+ echo -e "\t-q Quit t/zbd/test-zbd-support run after any failed test."
+ echo -e "\t-r Remove the /dev/nullb0 device that may still exist after"
+ echo -e "\t running this script."
+ exit 1
+}
+
+# Remove all null_blk devices created through configfs and unload the
+# null_blk module. The configfs directories are removed, the module is
+# unloaded, reloaded with nr_devices=0, and the removal and unload are
+# repeated. Exits the script with a non-zero status when the module cannot be
+# loaded or is still present at the end; returns 0 otherwise.
+cleanup_nullb()
+{
+    for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
+    modprobe -r null_blk
+    modprobe null_blk nr_devices=0 || exit $?
+    for d in /sys/kernel/config/nullb/*; do
+        [ -d "$d" ] && rmdir "$d"
+    done
+    modprobe -r null_blk
+    # "exit $?" after a successful [ -e ] test would exit with status 0 and
+    # hide the failure, so report it and use an explicit error status.
+    if [ -e /sys/module/null_blk ]; then
+        echo "failed to unload the null_blk module" >&2
+        exit 1
+    fi
+    return 0
+}
+
+# Load null_blk without any preconfigured device, create the configfs entry
+# nullb0 and change into its directory. Returns the status of the first step
+# that fails; the caller is left inside the nullb0 directory on success.
+create_nullb()
+{
+    modprobe null_blk nr_devices=0 || return $?
+    cd /sys/kernel/config/nullb || return $?
+    mkdir nullb0 || return $?
+    cd nullb0 || return $?
+}
+
+# Write the layout selected by the current section into the configfs
+# attributes of the null_blk device and power it on. The attribute files are
+# addressed relative to the current directory.
+#
+# Reads: dev_blocksize, dev_size, conv_pcnt, zone_size, zone_capacity,
+# max_open, max_active and the *_supported flags. Sets nr_conv.
+# Returns 0 on success, 2 when the layout needs a null_blk feature flagged as
+# unsupported, or the status of a failed attribute write.
+configure_nullb()
+{
+ echo 0 > completion_nsec &&
+ echo ${dev_blocksize} > blocksize &&
+ echo ${dev_size} > size &&
+ echo 1 > memory_backed || return $?
+
+ # conv_pcnt == 100 means a non-zoned device: skip all zone attributes.
+ if ((conv_pcnt < 100)); then
+ echo 1 > zoned &&
+ echo "${zone_size}" > zone_size || return $?
+
+ if ((zone_capacity < zone_size)); then
+ if ((!zcap_supported)); then
+ echo "null_blk does not support zone capacity"
+ return 2
+ fi
+ echo "${zone_capacity}" > zone_capacity
+ fi
+
+ if ((conv_pcnt)); then
+ if ((!conv_supported)); then
+ echo "null_blk does not support conventional zones"
+ return 2
+ fi
+ # conv_pcnt percent of the dev_size/zone_size zones are conventional.
+ nr_conv=$((dev_size/zone_size*conv_pcnt/100))
+ else
+ nr_conv=0
+ fi
+ echo "${nr_conv}" > zone_nr_conv
+
+ # max_active is only applied when max_open is non-zero.
+ if ((max_open)); then
+ echo "${max_open}" > zone_max_open
+ if ((max_active)); then
+ if ((!max_act_supported)); then
+ echo "null_blk does not support active zone counts"
+ return 2
+ fi
+ echo "${max_active}" > zone_max_active
+ fi
+ fi
+ fi
+
+ echo 1 > power || return $?
+ return 0
+}
+
+# Print a human readable summary of the layout selected by the current
+# section, based on conv_pcnt, nr_conv, zone_size, zone_capacity, max_open and
+# max_active. A max_open of 0 is shown as unlimited, and max_active is only
+# reported when max_open is non-zero.
+show_nullb_config()
+{
+ if ((conv_pcnt < 100)); then
+ echo " $(printf "Zoned Device, %d%% Conventional Zones (%d)" \
+ ${conv_pcnt} ${nr_conv})"
+ echo " $(printf "Zone Size: %d MB" ${zone_size})"
+ echo " $(printf "Zone Capacity: %d MB" ${zone_capacity})"
+ if ((max_open)); then
+ echo " $(printf "Max Open: %d Zones" ${max_open})"
+ if ((max_active)); then
+ echo " $(printf "Max Active: %d Zones" ${max_active})"
+ else
+ echo " Max Active: Unlimited Zones"
+ fi
+ else
+ echo " Max Open: Unlimited Zones"
+ fi
+ else
+ echo " Non-zoned Device"
+ fi
+}
+
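+#
+# Test sections.
+#
+# Each sectionN function selects one null_blk layout by assigning the globals
+# read by configure_nullb and show_nullb_config: conv_pcnt (percentage of
+# conventional zones, 100 meaning a non-zoned device), zone_size and
+# zone_capacity (reported in MB), max_open and max_active (0 meaning
+# unlimited) and, in some sections, dev_size. Sections with a MaxOpen limit
+# also append "-o <max_open>" to zbd_test_opts.
+#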
+# Fully conventional device.
+section1()
+{
+ conv_pcnt=100
+ max_open=0
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section2()
+{
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section3()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+ max_active=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section4()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+ max_active=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section5()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+ max_active=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, unlimited MaxOpen.
+section6()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=0
+ max_active=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, unlimited MaxOpen.
+section7()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=0
+ max_active=0
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section8()
+{
+ dev_size=1024
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section9()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section10()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section11()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section12()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# unlimited MaxActive.
+section13()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=0
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section14()
+{
+ dev_size=1024
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section15()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section16()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section17()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section18()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive == MaxOpen.
+section19()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=${set_max_open}
+}
+
+# Zoned device with no conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section20()
+{
+ dev_size=1024
+ conv_pcnt=0
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+# Zoned device with no conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section21()
+{
+ conv_pcnt=0
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+# Zoned device with mostly sequential zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section22()
+{
+ conv_pcnt=10
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+# Zoned device with mostly sequential zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section23()
+{
+ conv_pcnt=10
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+# Zoned device with mostly conventional zones, ZCAP == ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section24()
+{
+ conv_pcnt=66
+ zone_size=1
+ zone_capacity=1
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+# Zoned device with mostly conventional zones, ZCAP < ZSIZE, limited MaxOpen,
+# MaxActive > MaxOpen.
+section25()
+{
+ dev_size=2048
+ conv_pcnt=66
+ zone_size=4
+ zone_capacity=3
+ max_open=${set_max_open}
+ zbd_test_opts+=("-o ${max_open}")
+ max_active=$((set_max_open+set_extra_max_active))
+}
+
+#
+# Entry point.
+#
+SECONDS=0
+scriptdir="$(cd "$(dirname "$0")" && pwd)"
+sections=()
+zcap_supported=1
+conv_supported=1
+max_act_supported=1
+list_only=0
+dev_size=1024
+dev_blocksize=4096
+set_max_open=8
+set_extra_max_active=2
+zbd_test_opts=()
+num_of_runs=1
+test_case=0
+quit_on_err=0
+
+while (($#)); do
+ case "$1" in
+ -s) sections+=("$2"); shift; shift;;
+ -o) set_max_open="${2}"; shift; shift;;
+ -L) list_only=1; shift;;
+ -r) cleanup_nullb; exit 0;;
+ -n) num_of_runs="${2}"; shift; shift;;
+ -t) test_case="${2}"; shift; shift;;
+ -q) quit_on_err=1; shift;;
+ -h) usage; break;;
+ --) shift; break;;
+ *) usage; exit 1;;
+ esac
+done
+
+if [ "${#sections[@]}" = 0 ]; then
+ readarray -t sections < <(declare -F | grep "section[0-9]*" | tr -c -d "[:digit:]\n" | sort -n)
+fi
+
+cleanup_nullb
+
+#
+# Test creating null_blk device and check if newer features are supported
+#
+if ! eval "create_nullb"; then
+ echo "can't create nullb"
+ exit 1
+fi
+if ! cat /sys/kernel/config/nullb/features | grep -q zone_capacity; then
+ zcap_supported=0
+fi
+if ! cat /sys/kernel/config/nullb/features | grep -q zone_nr_conv; then
+ conv_supported=0
+fi
+if ! cat /sys/kernel/config/nullb/features | grep -q zone_max_active; then
+ max_act_supported=0
+fi
+
+rc=0
+test_rc=0
+intr=0
+run_nr=1
+trap 'kill ${zbd_test_pid}; intr=1' SIGINT
+
+while ((run_nr <= $num_of_runs)); do
+ echo -e "\nRun #$run_nr:"
+ for section_number in "${sections[@]}"; do
+ cleanup_nullb
+ echo "---------- Section $(printf "%02d" $section_number) ----------"
+ if ! eval "create_nullb"; then
+ echo "error creating nullb"
+ exit 1
+ fi
+ zbd_test_opts=()
+ if ((test_case)); then
+ zbd_test_opts+=("-t" "${test_case}")
+ fi
+ if ((quit_on_err)); then
+ zbd_test_opts+=("-q")
+ fi
+ section$section_number
+ configure_nullb
+ rc=$?
+ ((rc == 2)) && continue
+ if ((rc)); then
+ echo "can't set up nullb for section $(printf "%02d" $section_number)"
+ exit 1
+ fi
+ show_nullb_config
+ cd "${scriptdir}"
+ ((intr)) && exit 1
+ ((list_only)) && continue
+
+ ./test-zbd-support ${zbd_test_opts[@]} /dev/nullb0 &
+ zbd_test_pid=$!
+ if kill -0 "${zbd_test_pid}"; then
+ wait "${zbd_test_pid}"
+ test_rc=$?
+ else
+ echo "can't run ZBD tests"
+ exit 1
+ fi
+ ((intr)) && exit 1
+ if (($test_rc)); then
+ rc=1
+ ((quit_on_err)) && break
+ fi
+ done
+
+ ((rc && quit_on_err)) && break
+ run_nr=$((run_nr + 1))
+done
+
+if ((!list_only)); then
+ echo "--------------------------------"
+ echo "Total run time: $(TZ=UTC0 printf "%(%H:%M:%S)T\n" $(( SECONDS )) )"
+fi
+
+exit $rc
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2018 Western Digital Corporation or its affiliates.
-#
-# This file is released under the GPL.
-
-scriptdir="$(cd "$(dirname "$0")" && pwd)"
-
-for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
-modprobe -r null_blk
-modprobe null_blk nr_devices=0 || exit $?
-for d in /sys/kernel/config/nullb/*; do
- [ -d "$d" ] && rmdir "$d"
-done
-modprobe -r null_blk
-[ -e /sys/module/null_blk ] && exit $?
-modprobe null_blk nr_devices=0 &&
- cd /sys/kernel/config/nullb &&
- mkdir nullb0 &&
- cd nullb0 &&
- echo 0 > completion_nsec &&
- echo 4096 > blocksize &&
- echo 1024 > size &&
- echo 1 > memory_backed &&
- echo 1 > power || exit $?
-
-"${scriptdir}"/test-zbd-support "$@" /dev/nullb0
+++ /dev/null
-#!/bin/bash
-#
-# Copyright (C) 2018 Western Digital Corporation or its affiliates.
-#
-# This file is released under the GPL.
-
-scriptdir="$(cd "$(dirname "$0")" && pwd)"
-
-zone_size=1
-zone_capacity=1
-if [[ ${1} == "-h" ]]; then
- echo "Usage: ${0} [OPTIONS]"
- echo "Options:"
- echo -e "\t-h Show this message."
- echo -e "\t-zone-cap Use null blk with zone capacity less than zone size."
- echo -e "\tany option supported by test-zbd-support script."
- exit 1
-elif [[ ${1} == "-zone-cap" ]]; then
- zone_size=4
- zone_capacity=3
- shift
-fi
-
-for d in /sys/kernel/config/nullb/*; do [ -d "$d" ] && rmdir "$d"; done
-modprobe -r null_blk
-modprobe null_blk nr_devices=0 || exit $?
-for d in /sys/kernel/config/nullb/*; do
- [ -d "$d" ] && rmdir "$d"
-done
-modprobe -r null_blk
-[ -e /sys/module/null_blk ] && exit $?
-modprobe null_blk nr_devices=0 &&
- cd /sys/kernel/config/nullb &&
- mkdir nullb0 &&
- cd nullb0 || exit $?
-
-if ((zone_capacity < zone_size)); then
- if [[ ! -w zone_capacity ]]; then
- echo "null blk does not support zone capacity"
- exit 1
- fi
- echo "${zone_capacity}" > zone_capacity
-fi
-
-echo 1 > zoned &&
- echo "${zone_size}" > zone_size &&
- echo 0 > completion_nsec &&
- echo 4096 > blocksize &&
- echo 1024 > size &&
- echo 1 > memory_backed &&
- echo 1 > power || exit $?
-
-"${scriptdir}"/test-zbd-support "$@" /dev/nullb0
echo -e "\t-v Run fio with valgrind --read-var-info option"
echo -e "\t-l Test with libzbc ioengine"
echo -e "\t-r Reset all zones before test start"
+ echo -e "\t-w Reset all zones before executing each write test case"
echo -e "\t-o <max_open_zones> Run fio with max_open_zones limit"
echo -e "\t-t <test #> Run only a single test case with specified number"
+ echo -e "\t-s <test #> Start testing from the case with the specified number"
+ echo -e "\t-q Quit the test run after any failed test"
echo -e "\t-z Run fio with debug=zbd option"
+ echo -e "\t-u Use io_uring ioengine in place of libaio"
}
max() {
ioengine() {
if [ -n "$use_libzbc" ]; then
echo -n "--ioengine=libzbc"
+ elif [ "$1" = "libaio" -a -n "$force_io_uring" ]; then
+ echo -n "--ioengine=io_uring"
else
echo -n "--ioengine=$1"
fi
}
+get_dev_path_by_id() {
+ for d in /sys/block/* /sys/block/*/*; do
+ if [[ ! -r "${d}/dev" ]]; then
+ continue
+ fi
+ if [[ "${1}" == "$(<"${d}/dev")" ]]; then
+ echo "/dev/${d##*/}"
+ return 0
+ fi
+ done
+ return 1
+}
+
+dm_destination_dev_set_io_scheduler() {
+ local dev=$1 sched=$2
+ local dest_dev_id dest_dev path
+
+ has_command dmsetup || return 1
+
+ while read -r dest_dev_id; do
+ if ! dest_dev=$(get_dev_path_by_id "${dest_dev_id}"); then
+ continue
+ fi
+ path=${dest_dev/dev/sys\/block}/queue/scheduler
+ if [[ ! -w ${path} ]]; then
+ echo "Can not set scheduler of device mapper destination: ${dest_dev}"
+ continue
+ fi
+ echo "${2}" > "${path}"
+ done < <(dmsetup table "$(<"/sys/block/$dev/dm/name")" |
+ sed -n 's/.* \([0-9]*:[0-9]*\).*/\1/p')
+}
+
+dev_has_dm_map() {
+ local dev=${1} target_type=${2}
+ local dm_name
+
+ has_command dmsetup || return 1
+
+ dm_name=$(<"/sys/block/$dev/dm/name")
+ if ! dmsetup status "${dm_name}" | grep -qe "${target_type}"; then
+ return 1
+ fi
+ if dmsetup status "${dm_name}" | grep -v "${target_type}"; then
+ return 1
+ fi
+ return 0
+}
+
set_io_scheduler() {
local dev=$1 sched=$2
esac
fi
- echo "$sched" >"/sys/block/$dev/queue/scheduler"
+ if [ -w "/sys/block/$dev/queue/scheduler" ]; then
+ echo "$sched" >"/sys/block/$dev/queue/scheduler"
+ elif [ -r "/sys/block/$dev/dm/name" ] &&
+ ( dev_has_dm_map "$dev" linear ||
+ dev_has_dm_map "$dev" flakey ||
+ dev_has_dm_map "$dev" crypt ); then
+ dm_destination_dev_set_io_scheduler "$dev" "$sched"
+ else
+ echo "can not set io scheduler"
+ exit 1
+ fi
}
check_read() {
shift 2
r=$(((RANDOM << 16) | RANDOM))
write_opts=(--name="write_job" --rw=write "$(ioengine "psync")" \
- --bs="${logical_block_size}" --zonemode=zbd \
+ --bs="${min_seq_write_size}" --zonemode=zbd \
--zonesize="${zone_size}" --thread=1 --direct=1 \
--offset="${write_offset}" --size="${write_size}")
write_opts+=("${job_var_opts[@]}")
run_one_fio_job "${opts[@]}" "$@"
}
-# Prepare for write test by resetting zones. When max_open_zones option is
-# specified, reset all zones of the test target to ensure that zones out of the
-# test target range do not have open zones. This allows the write test to the
-# target range to be able to open zones up to max_open_zones.
+# Prepare for write test by resetting zones. When reset_before_write or
+# max_open_zones option is specified, reset all zones of the test target to
+# ensure that zones out of the test target range do not have open zones. This
+# allows the write test to the target range to be able to open zones up to
+# max_open_zones limit specified as the option or obtained from sysfs.
prep_write() {
- [[ -n "${max_open_zones_opt}" && -n "${is_zbd}" ]] &&
- reset_zone "${dev}" -1
+ [[ -n "${reset_before_write}" || -n "${max_open_zones_opt}" ]] &&
+ [[ -n "${is_zbd}" ]] && reset_zone "${dev}" -1
+}
+
+SKIP_TESTCASE=255
+
+require_scsi_dev() {
+ if ! is_scsi_device "$dev"; then
+ SKIP_REASON="$dev is not a SCSI device"
+ return 1
+ fi
+ return 0
+}
+
+# Test prerequisite: at least $1 bytes must precede the first sequential zone.
+# Returns 0 when they do, otherwise sets SKIP_REASON and returns 1.
+require_conv_zone_bytes() {
+    local needed=${1}
+
+    if ((needed <= first_sequential_zone_sector * 512)); then
+        return 0
+    fi
+    SKIP_REASON="$dev does not have enough conventional zones"
+    return 1
+}
+
+require_zbd() {
+ if [[ -z ${is_zbd} ]]; then
+ SKIP_REASON="$dev is not a zoned block device"
+ return 1
+ fi
+ return 0
+}
+
+require_regular_block_dev() {
+ if [[ -n ${is_zbd} ]]; then
+ SKIP_REASON="$dev is not a regular block device"
+ return 1
+ fi
+ return 0
+}
+
+# Test prerequisite: $realdev must be a block special file. Otherwise sets
+# SKIP_REASON and returns 1.
+require_block_dev() {
+    if [[ ! -b "$realdev" ]]; then
+        SKIP_REASON="$dev is not a block device"
+        return 1
+    fi
+    return 0
+}
+
+# Test prerequisite: the area from the first sequential zone to the end of the
+# disk must hold at least $1 zones of zone_size bytes. Otherwise sets
+# SKIP_REASON and returns 1.
+require_seq_zones() {
+    local wanted=${1}
+    local seq_bytes=$((disk_size - first_sequential_zone_sector * 512))
+
+    if ((wanted <= seq_bytes / zone_size)); then
+        return 0
+    fi
+    SKIP_REASON="$dev does not have $wanted sequential zones"
+    return 1
+}
+
+# Test prerequisite: the area in front of the first sequential zone must hold
+# at least $1 zones of zone_size bytes. Otherwise sets SKIP_REASON and
+# returns 1.
+require_conv_zones() {
+    local wanted=${1}
+    local conv_bytes=$((first_sequential_zone_sector * 512))
+
+    if ((wanted <= conv_bytes / zone_size)); then
+        return 0
+    fi
+    SKIP_REASON="$dev does not have $wanted conventional zones"
+    return 1
+}
+
+# Test prerequisite: max_open_zones must be 0 or at least $1. Otherwise sets
+# SKIP_REASON and returns 1.
+require_max_open_zones() {
+    local min=${1}
+
+    if ((max_open_zones == 0 || max_open_zones >= min)); then
+        return 0
+    fi
+    SKIP_REASON="max_open_zones of $dev is smaller than $min"
+    return 1
+}
+
+# Test prerequisite: max_active_zones must be non-zero and at least $1.
+# Otherwise sets SKIP_REASON and returns 1.
+require_max_active_zones() {
+    local min=${1}
+
+    if ((max_active_zones == 0)); then
+        SKIP_REASON="$dev does not have max_active_zones limit"
+    elif ((max_active_zones < min)); then
+        SKIP_REASON="max_active_zones of $dev is smaller than $min"
+    else
+        return 0
+    fi
+    return 1
}
-# Check whether buffered writes are refused.
+# Check whether buffered writes are refused for block devices.
test1() {
+ require_block_dev || return $SKIP_TESTCASE
run_fio --name=job1 --filename="$dev" --rw=write --direct=0 --bs=4K \
"$(ioengine "psync")" --size="${zone_size}" --thread=1 \
--zonemode=zbd --zonesize="${zone_size}" 2>&1 |
if [ -z "$is_zbd" ]; then
opts+=("--zonesize=${zone_size}")
fi
- run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $?
- ! grep -q 'WRITE:' "${logfile}.${test_number}"
+ run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 && return 1
+ grep -q 'buflen exceeds zone size' "${logfile}.${test_number}"
}
# Run fio against an empty zone. This causes fio to report "No I/O performed".
test3() {
local off opts=() rc
+ require_seq_zones 129 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 128 * zone_size))
size=$((zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
test4() {
local off opts=()
+ require_seq_zones 130 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 129 * zone_size))
size=$((zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
- opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--bs=$size")
+ opts+=("--name=$dev" "--filename=$dev" "--offset=$off")
+ opts+=(--bs="$(min $((min_seq_write_size * 256)) $size)")
opts+=("--size=$size" "--thread=1" "--read_beyond_wp=1")
opts+=("$(ioengine "psync")" "--rw=read" "--direct=1" "--disable_lat=1")
opts+=("--zonemode=zbd" "--zonesize=${zone_size}")
- run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $?
- check_read $size || return $?
+ run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1
+ fio_rc=$?
+ if [[ $unrestricted_reads != 0 ]]; then
+ if [[ $fio_rc != 0 ]]; then
+ return "$fio_rc"
+ fi
+ check_read $size || return $?
+ else
+ [ $fio_rc == 0 ] && return 1 || return 0
+ fi
}
# Sequential write to sequential zones.
test5() {
- local size off capacity
+ local size off capacity bs
prep_write
off=$((first_sequential_zone_sector * 512))
capacity=$(total_zone_capacity 4 $off $dev)
size=$((4 * zone_size))
+ bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs")
run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write \
- --bs="$(max $((zone_size / 64)) "$logical_block_size")"\
- --do_verify=1 --verify=md5 \
+ --bs="$bs" --do_verify=1 --verify=md5 \
>>"${logfile}.${test_number}" 2>&1 || return $?
check_written $capacity || return $?
check_read $capacity || return $?
# Sequential read from sequential zones.
test6() {
- local size off capacity
+ local size off capacity bs
prep_write
off=$((first_sequential_zone_sector * 512))
capacity=$(total_zone_capacity 4 $off $dev)
size=$((4 * zone_size))
+ bs=$(min "$(max $((zone_size / 64)) "$min_seq_write_size")" "$zone_cap_bs")
write_and_run_one_fio_job \
$((first_sequential_zone_sector * 512)) "${size}" \
--offset="${off}" \
--size="${size}" --zonemode=zbd --zonesize="${zone_size}" \
- "$(ioengine "psync")" --iodepth=1 --rw=read \
- --bs="$(max $((zone_size / 64)) "$logical_block_size")" \
+ "$(ioengine "psync")" --iodepth=1 --rw=read --bs="$bs" \
>>"${logfile}.${test_number}" 2>&1 || return $?
check_read $capacity || return $?
}
test9() {
local size
- if ! is_scsi_device "$dev"; then
- echo "$dev is not a SCSI device" >>"${logfile}.${test_number}"
- return 0
- fi
+ require_scsi_dev || return $SKIP_TESTCASE
prep_write
size=$((4 * zone_size))
test10() {
local size
- if ! is_scsi_device "$dev"; then
- echo "$dev is not a SCSI device" >>"${logfile}.${test_number}"
- return 0
- fi
+ require_scsi_dev || return $SKIP_TESTCASE
prep_write
size=$((4 * zone_size))
test12() {
local size off capacity
- prep_write
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
size=$((8 * zone_size))
off=$((first_sequential_zone_sector * 512))
capacity=$(total_zone_capacity 8 $off $dev)
test13() {
local size off capacity
- prep_write
+ require_max_open_zones 4 || return $SKIP_TESTCASE
+
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
size=$((8 * zone_size))
off=$((first_sequential_zone_sector * 512))
capacity=$(total_zone_capacity 8 $off $dev)
# Random write to conventional zones.
test14() {
- local size
+ local off size
+ if ! result=($(first_online_zone "$dev")); then
+ echo "Failed to determine first online zone"
+ exit 1
+ fi
+ off=${result[0]}
prep_write
size=$((16 * 2**20)) # 20 MB
- if [ $size -gt $((first_sequential_zone_sector * 512)) ]; then
- echo "$dev does not have enough sequential zones" \
- >>"${logfile}.${test_number}"
- return 0
- fi
+ require_conv_zone_bytes "${size}" || return $SKIP_TESTCASE
+
run_one_fio_job "$(ioengine "libaio")" --iodepth=64 --rw=randwrite --bs=16K \
--zonemode=zbd --zonesize="${zone_size}" --do_verify=1 \
- --verify=md5 --size=$size \
+ --verify=md5 --offset=$off --size=$size\
>>"${logfile}.${test_number}" 2>&1 || return $?
check_written $((size)) || return $?
check_read $((size)) || return $?
# Sequential read on a mix of empty and full zones.
test15() {
- local i off size
+ local i off size bs
local w_off w_size w_capacity
for ((i=0;i<4;i++)); do
w_capacity=$(total_zone_capacity 2 $w_off $dev)
off=$((first_sequential_zone_sector * 512))
size=$((4 * zone_size))
+ bs=$(min $((zone_size / 16)) "$zone_cap_bs")
write_and_run_one_fio_job "${w_off}" "${w_size}" \
- "$(ioengine "psync")" --rw=read --bs=$((zone_size / 16)) \
+ "$(ioengine "psync")" --rw=read --bs="$bs" \
--zonemode=zbd --zonesize="${zone_size}" --offset=$off \
--size=$((size)) >>"${logfile}.${test_number}" 2>&1 ||
return $?
# Random reads and writes in the last zone.
test17() {
- local io off read size written
+ local io off last read size written
off=$(((disk_size / zone_size - 1) * zone_size))
size=$((disk_size - off))
+ if ! last=($(last_online_zone "$dev")); then
+ echo "Failed to determine last online zone"
+ exit 1
+ fi
+ if [[ "$((last * 512))" -lt "$off" ]]; then
+ off=$((last * 512))
+ size=$zone_size
+ fi
if [ -n "$is_zbd" ]; then
reset_zone "$dev" $((off / 512)) || return $?
fi
prep_write
run_one_fio_job "$(ioengine "libaio")" --iodepth=8 --rw=randrw --bs=4K \
--zonemode=zbd --zonesize="${zone_size}" \
- --offset=$off --loops=2 --norandommap=1\
+ --offset=$off --loops=2 --norandommap=1 \
+ --size="$size"\
>>"${logfile}.${test_number}" 2>&1 || return $?
written=$(fio_written <"${logfile}.${test_number}")
read=$(fio_read <"${logfile}.${test_number}")
test28() {
local i jobs=16 off opts
+ require_seq_zones 65 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
[ -n "$is_zbd" ] && reset_zone "$dev" $((off / 512))
prep_write
test29() {
local i jobs=16 off opts=()
+ require_seq_zones 80 || return $SKIP_TESTCASE
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
size=$((16*zone_size))
- prep_write
+
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
opts=("--debug=zbd")
for ((i=0;i<jobs;i++)); do
opts+=("--name=job$i" "--filename=$dev" "--offset=$off" "--bs=16K")
prep_write
off=$((first_sequential_zone_sector * 512))
run_one_fio_job "$(ioengine "libaio")" --iodepth=8 --rw=randrw \
- --bs="$(max $((zone_size / 128)) "$logical_block_size")"\
+ --bs="$(max $((zone_size / 128)) "$min_seq_write_size")"\
--zonemode=zbd --zonesize="${zone_size}" --offset=$off\
--loops=2 --time_based --runtime=30s --norandommap=1\
>>"${logfile}.${test_number}" 2>&1
test31() {
local bs inc nz off opts size
- prep_write
- # Start with writing 128 KB to 128 sequential zones.
- bs=128K
- nz=128
- # shellcheck disable=SC2017
- inc=$(((disk_size - (first_sequential_zone_sector * 512)) / (nz * zone_size)
- * zone_size))
- opts=()
- for ((off = first_sequential_zone_sector * 512; off < disk_size;
- off += inc)); do
- opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--io_size=$bs")
- opts+=("--bs=$bs" "--size=$zone_size" "$(ioengine "libaio")")
- opts+=("--rw=write" "--direct=1" "--thread=1" "--stats=0")
- opts+=("--zonemode=zbd" "--zonesize=${zone_size}")
- opts+=(${job_var_opts[@]})
- done
- "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" 2>&1
- # Next, run the test.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ # As preparation, write 128 KB to sequential write required zones. Limit
+ # write target zones up to max_open_zones to keep test time reasonable.
+ # To distribute the write target zones evenly, skip certain zones for every
+ # write. Utilize zonemode strided for such write patterns.
+ bs=$((128 * 1024))
+ nz=$((max_open_zones))
+ if [[ $nz -eq 0 ]]; then
+ nz=128
+ fi
off=$((first_sequential_zone_sector * 512))
size=$((disk_size - off))
+ inc=$(((size / nz / zone_size) * zone_size))
+ opts=("--name=$dev" "--filename=$dev" "--rw=write" "--bs=${bs}")
+ opts+=("--offset=$off" "--size=$((inc * nz))" "--io_size=$((bs * nz))")
+ opts+=("--zonemode=strided" "--zonesize=${bs}" "--zonerange=${inc}")
+ opts+=("--direct=1" "$(ioengine "psync")")
+ echo "fio ${opts[@]}" >> "${logfile}.${test_number}"
+ "$(dirname "$0")/../../fio" "${opts[@]}" >> "${logfile}.${test_number}" \
+ 2>&1 || return $?
+
+ # Next, run the test.
opts=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size")
opts+=("--bs=$bs" "$(ioengine "psync")" "--rw=randread" "--direct=1")
opts+=("--thread=1" "--time_based" "--runtime=30" "--zonemode=zbd")
test32() {
local off opts=() size
- prep_write
+ require_zbd || return $SKIP_TESTCASE
+
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
off=$((first_sequential_zone_sector * 512))
size=$((disk_size - off))
opts+=("--name=$dev" "--filename=$dev" "--offset=$off" "--size=$size")
local bs io_size size
local off capacity=0;
- prep_write
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
off=$((first_sequential_zone_sector * 512))
capacity=$(total_zone_capacity 1 $off $dev)
size=$((2 * zone_size))
run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write \
--size=$size --io_size=$io_size --bs=$bs \
>> "${logfile}.${test_number}" 2>&1 || return $?
- check_written $(((io_size + bs - 1) / bs * bs)) || return $?
+ check_written $((io_size / bs * bs)) || return $?
}
-# Write to sequential zones with a block size that is not a divisor of the
-# zone size and with data verification enabled.
+# Test repeated async write job with verify using two unaligned block sizes.
test34() {
- local size
-
- prep_write
- size=$((2 * zone_size))
- run_fio_on_seq "$(ioengine "psync")" --iodepth=1 --rw=write --size=$size \
- --do_verify=1 --verify=md5 --bs=$((3 * zone_size / 4)) \
- >> "${logfile}.${test_number}" 2>&1 && return 1
- grep -q 'not a divisor of' "${logfile}.${test_number}"
+ local bs off zone_capacity
+ local -a block_sizes
+
+ require_zbd || return $SKIP_TESTCASE
+ prep_write
+
+ off=$((first_sequential_zone_sector * 512))
+ zone_capacity=$(total_zone_capacity 1 $off $dev)
+ block_sizes=($((4096 * 7)) $(($(min ${zone_capacity} 4194304) - 4096)))
+
+ for bs in ${block_sizes[@]}; do
+ run_fio --name=job --filename="${dev}" --rw=randwrite \
+ --bs="${bs}" --offset="${off}" \
+ --size=$((4 * zone_size)) --iodepth=256 \
+ "$(ioengine "libaio")" --time_based=1 --runtime=15s \
+ --zonemode=zbd --direct=1 --zonesize="${zone_size}" \
+ --verify=crc32c --do_verify=1 ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ done
}
# Test 1/4 for the I/O boundary rounding code: $size < $zone_size.
local bs off size capacity
prep_write
- capacity=$(total_zone_capacity 1 $first_sequential_zone_sector $dev)
+ capacity=$(total_zone_capacity 1 $((first_sequential_zone_sector*512)) $dev)
if [ "$first_sequential_zone_sector" = 0 ]; then
off=0
else
off=$(((first_sequential_zone_sector - 1) * 512))
fi
size=$((zone_size + 2 * 512))
- bs=$((zone_size / 4))
+ bs=$(min $((zone_size / 4)) "$zone_cap_bs")
run_one_fio_job --offset=$off --size=$size "$(ioengine "psync")" \
--iodepth=1 --rw=write --do_verify=1 --verify=md5 \
--bs=$bs --zonemode=zbd --zonesize="${zone_size}" \
local bs off size
prep_write
- size=$((logical_block_size))
- off=$((disk_size - logical_block_size))
- bs=$((logical_block_size))
+ size=$((min_seq_write_size))
+ off=$((disk_size - min_seq_write_size))
+ bs=$((min_seq_write_size))
run_one_fio_job --offset=$off --size=$size "$(ioengine "psync")" \
--iodepth=1 --rw=write --do_verify=1 --verify=md5 \
--bs=$bs --zonemode=zbd --zonesize="${zone_size}" \
# Read one block from a block device.
read_one_block() {
+ local off
local bs
- bs=$((logical_block_size))
- run_one_fio_job --rw=read "$(ioengine "psync")" --bs=$bs --size=$bs "$@" 2>&1 |
+ if ! result=($(first_online_zone "$dev")); then
+ echo "Failed to determine first online zone"
+ exit 1
+ fi
+ off=${result[0]}
+ bs=$((min_seq_write_size))
+ run_one_fio_job --rw=read "$(ioengine "psync")" --offset=$off --bs=$bs \
+ --size=$bs "$@" 2>&1 |
tee -a "${logfile}.${test_number}"
}
# Check whether fio accepts --zonemode=none for zoned block devices.
test39() {
- [ -n "$is_zbd" ] || return 0
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=none >/dev/null || return $?
- check_read $((logical_block_size)) || return $?
+ check_read $((min_seq_write_size)) || return $?
}
# Check whether fio accepts --zonemode=strided for zoned block devices.
test40() {
local bs
- bs=$((logical_block_size))
- [ -n "$is_zbd" ] || return 0
+ bs=$((min_seq_write_size))
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=strided |
grep -q 'fio: --zonesize must be specified when using --zonemode=strided' ||
return $?
# Check whether fio checks the zone size for zoned block devices.
test41() {
- [ -n "$is_zbd" ] || return 0
+ require_zbd || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=$((2 * zone_size)) |
grep -q 'job parameter zonesize.*does not match disk zone size'
}
# Check whether fio handles --zonesize=0 correctly for regular block devices.
test42() {
- [ -n "$is_zbd" ] && return 0
+ require_regular_block_dev || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=0 |
- grep -q 'Specifying the zone size is mandatory for regular block devices with --zonemode=zbd'
+ grep -q 'Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd'
}
# Check whether fio handles --zonesize=1 correctly for regular block devices.
test43() {
- [ -n "$is_zbd" ] && return 0
+ require_regular_block_dev || return $SKIP_TESTCASE
read_one_block --zonemode=zbd --zonesize=1 |
grep -q 'zone size must be at least 512 bytes for --zonemode=zbd'
}
test45() {
local bs i
+ local grep_str="fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd"
- [ -z "$is_zbd" ] && return 0
+ require_zbd || return $SKIP_TESTCASE
prep_write
- bs=$((logical_block_size))
- run_one_fio_job "$(ioengine "psync")" --iodepth=1 --rw=randwrite --bs=$bs\
- --offset=$((first_sequential_zone_sector * 512)) \
- --size="$zone_size" --do_verify=1 --verify=md5 2>&1 |
- tee -a "${logfile}.${test_number}" |
- grep -q "fio: first I/O failed. If .* is a zoned block device, consider --zonemode=zbd"
+ bs=$((min_seq_write_size))
+ for ((i = 0; i < 10; i++)); do
+ run_one_fio_job "$(ioengine "psync")" --iodepth=1 --rw=randwrite \
+ --offset=$((first_sequential_zone_sector * 512)) \
+ --bs="$bs" --time_based --runtime=1s \
+ --do_verify=1 --verify=md5 \
+ >> "${logfile}.${test_number}" 2>&1
+ grep -qe "$grep_str" "${logfile}.${test_number}" && return 0
+ done
+ return 1
}
# Random write to sequential zones, libaio, 8 jobs, queue depth 64 per job
local bs
prep_write
- bs=$((logical_block_size))
+ bs=$((min_seq_write_size))
run_fio_on_seq "$(ioengine "psync")" --rw=write --bs=$bs --zoneskip=1 \
>> "${logfile}.${test_number}" 2>&1 && return 1
grep -q 'zoneskip 1 is not a multiple of the device zone size' "${logfile}.${test_number}"
test48() {
local i jobs=16 off opts=()
+ require_zbd || return $SKIP_TESTCASE
+ require_seq_zones 80 || return $SKIP_TESTCASE
+
off=$((first_sequential_zone_sector * 512 + 64 * zone_size))
size=$((16*zone_size))
- prep_write
+
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
opts=("--aux-path=/tmp" "--allow_file_create=0" "--significant_figures=10")
opts+=("--debug=zbd")
opts+=("$(ioengine "libaio")" "--rw=randwrite" "--direct=1")
for ((i=0;i<jobs;i++)); do
opts+=("--name=job$i" "--filename=$dev" "--offset=$off" "--bs=16K")
opts+=("--io_size=$zone_size" "--iodepth=256" "--thread=1")
- opts+=("--group_reporting=1")
+ opts+=("--size=$size" "--group_reporting=1")
# max_open_zones is already specified
opts+=($(job_var_opts_exclude "--max_open_zones"))
done
{ echo; echo "fio ${opts[*]}"; echo; } >>"${logfile}.${test_number}"
- timeout -v -s KILL 45s \
+ timeout -v -s KILL 180s \
"${dynamic_analyzer[@]}" "$fio" "${opts[@]}" \
>> "${logfile}.${test_number}" 2>&1 || return $?
}
# Check if fio handles --zonecapacity on a normal block device correctly
test49() {
- if [ -n "$is_zbd" ]; then
- echo "$dev is not a regular block device" \
- >>"${logfile}.${test_number}"
- return 0
- fi
+ require_regular_block_dev || return $SKIP_TESTCASE
size=$((2 * zone_size))
capacity=$((zone_size * 3 / 4))
check_read $((capacity * 2)) || return $?
}
+# Verify that conv zones are not locked and only seq zones are locked during
+# random read on conv-seq mixed zones.
+#
+# Skipped unless the device is zoned and has at least 8 conventional and
+# 8 sequential zones. Returns fio's exit status.
+test50() {
+ local off
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ reset_zone "${dev}" -1
+
+ # The 16-zone I/O range covers the 8 zones preceding the first sequential
+ # zone plus the 8 zones starting at the first sequential zone.
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ run_fio --name=job --filename=${dev} --offset=${off} --bs=64K \
+ --size=$((16 * zone_size)) "$(ioengine "libaio")" --rw=randread\
+ --time_based --runtime=3 --zonemode=zbd --zonesize=${zone_size}\
+ --direct=1 --group_reporting=1 ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Verify that conv zones are neither locked nor opened during random write on
+# conv-seq mixed zones. Zone lock and zone open shall happen only on seq zones.
+#
+# Skipped unless the device is zoned and has at least 8 conventional and
+# 8 sequential zones. Returns fio's exit status.
+test51() {
+ # "i" is declared local so the loop index does not leak into the
+ # global scope of the test script.
+ local i off jobs=16
+ local -a opts
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ reset_zone "$dev" -1
+
+ # The 16-zone I/O range covers the 8 zones preceding the first sequential
+ # zone plus the 8 zones starting at the first sequential zone.
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ opts+=("--size=$((16 * zone_size))" "$(ioengine "libaio")")
+ opts+=("--zonemode=zbd" "--direct=1" "--zonesize=${zone_size}")
+ opts+=("--max_open_zones=2" "--offset=$off")
+ opts+=("--thread=1" "--group_reporting=1")
+ opts+=("--time_based" "--runtime=30" "--rw=randwrite")
+ # One job per iteration, each with its own block size: 16K, 32K, ... 256K.
+ for ((i=0;i<jobs;i++)); do
+ opts+=("--name=job${i}" "--filename=$dev")
+ opts+=("--bs=$(((i+1)*16))K")
+ # --max_open_zones=2 is already specified above, so drop any copy
+ # coming from the per-job variable options.
+ opts+=($(job_var_opts_exclude "--max_open_zones"))
+ done
+ run_fio "${opts[@]}" >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Verify that zone_reset_threshold only accounts written bytes in seq
+# zones, and written data bytes of conv zones are not counted.
+#
+# Skipped unless the device is zoned and has at least 8 conventional and
+# 8 sequential zones. Passes when the written byte count matches io_size
+# and no zone reset was counted.
+test52() {
+ local off io_size
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 8 || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ reset_zone "${dev}" -1
+
+ # Total I/O size is 1/8 = 0.125 of the I/O range of conv + seq zones.
+ # Set zone_reset_threshold as 0.1. The threshold size is less than
+ # 0.125, so a reset count of zero is expected.
+ # On the other hand, half of the I/O range is covered by conv zones.
+ # If fio counted the conv zones for zone_reset_threshold, the ratio
+ # would be more than 0.5 and would trigger zone resets.
+
+ off=$((first_sequential_zone_sector * 512 - 8 * zone_size))
+ io_size=$((zone_size * 16 / 8))
+ run_fio --name=job --filename=$dev --rw=randwrite --bs=$((zone_size/16))\
+ --size=$((zone_size * 16)) --softrandommap=1 \
+ --io_size=$((io_size)) "$(ioengine "psync")" --offset=$off \
+ --zonemode=zbd --direct=1 --zonesize=${zone_size} \
+ --zone_reset_threshold=.1 --zone_reset_frequency=1.0 \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ check_written ${io_size} || return $?
+ check_reset_count -eq 0 || return $?
+}
+
+# Check both reads and writes are executed by random I/O to conventional zones.
+#
+# Skipped unless the device is zoned and has at least 4 conventional zones.
+# Returns 0 only when the bytes read plus the bytes written equal the
+# capacity reported for the 4 zones and both counters are non-zero.
+test53() {
+ local off capacity io read_b=0 written_b=0
+
+ require_zbd || return $SKIP_TESTCASE
+ require_conv_zones 4 || return $SKIP_TESTCASE
+
+ # The I/O range is the 4 zones preceding the first sequential zone.
+ off=$((first_sequential_zone_sector * 512 - 4 * zone_size))
+ capacity=$(total_zone_capacity 4 $off $dev)
+ run_fio --name=job --filename=${dev} --rw=randrw --bs=64K \
+ --size=$((4 * zone_size)) "$(ioengine "psync")" --offset=${off}\
+ --zonemode=zbd --direct=1 --zonesize=${zone_size} \
+ ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ # Extract the byte counters from the fio output captured in the log.
+ written_b=$(fio_written <"${logfile}.${test_number}")
+ read_b=$(fio_read <"${logfile}.${test_number}")
+ io=$((written_b + read_b))
+ echo "Number of bytes read: $read_b" >>"${logfile}.${test_number}"
+ echo "Number of bytes written: $written_b" >>"${logfile}.${test_number}"
+ echo "Total number of bytes read and written: $io <> $capacity" \
+ >>"${logfile}.${test_number}"
+ if ((io==capacity && written_b != 0 && read_b != 0)); then
+ return 0
+ fi
+ return 1
+}
+
+# Test read/write mix with verify.
+#
+# Skipped unless the device is zoned and has at least 8 sequential zones.
+# I/O is confined to 8 zones starting at the first sequential zone, with
+# block sizes ranging from 4k up to the zone size. Returns fio's exit status.
+test54() {
+ require_zbd || return $SKIP_TESTCASE
+ require_seq_zones 8 || return $SKIP_TESTCASE
+
+ prep_write
+ run_fio --name=job --filename=${dev} "$(ioengine "libaio")" \
+ --time_based=1 --runtime=30s --continue_on_error=0 \
+ --offset=$((first_sequential_zone_sector * 512)) \
+ --size=$((8*zone_size)) --direct=1 --iodepth=1 \
+ --rw=randrw:2 --rwmixwrite=25 --bsrange=4k-${zone_size} \
+ --zonemode=zbd --zonesize=${zone_size} \
+ --verify=crc32c --do_verify=1 --verify_backlog=2 \
+ --alloc-size=65536 --random_generator=tausworthe64 \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test 'z' suffix parsing only, using --zonemode=zbd on a zoned device.
+#
+# The offset, offset_increment, size and io_size options are all given with
+# the 'z' suffix. Skipped unless the device is zoned and has at least 13
+# sequential zones. Returns fio's exit status.
+test55() {
+ local bs
+ bs=$((min_seq_write_size))
+
+ require_zbd || return $SKIP_TESTCASE
+ # offset=1z + offset_increment=10z + size=2z
+ require_seq_zones 13 || return $SKIP_TESTCASE
+
+ prep_write
+ run_fio --name=j \
+ --filename=${dev} \
+ --direct=1 \
+ "$(ioengine "psync")" \
+ --zonemode=zbd \
+ --zonesize=${zone_size} \
+ --rw=write \
+ --bs=${bs} \
+ --numjobs=2 \
+ --offset_increment=10z \
+ --offset=1z \
+ --size=2z \
+ --io_size=3z \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test 'z' suffix parsing only, using --zonemode=strided on a regular block
+# device.
+#
+# The size and zoneskip options are given with the 'z' suffix. Skipped unless
+# the device is a regular block device and require_seq_zones 10 succeeds.
+# Returns fio's exit status.
+test56() {
+ local bs
+ bs=$((min_seq_write_size))
+
+ require_regular_block_dev || return $SKIP_TESTCASE
+ require_seq_zones 10 || return $SKIP_TESTCASE
+
+ prep_write
+ run_fio --name=j \
+ --filename=${dev} \
+ --direct=1 \
+ "$(ioengine "psync")" \
+ --zonemode=strided \
+ --zonesize=${zone_size} \
+ --rw=write \
+ --bs=${bs} \
+ --size=10z \
+ --zoneskip=2z \
+ ${job_var_opts[@]} --debug=zbd \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test that repeated async write job does not cause zone reset during writes
+# in-flight, when the block size is not a divisor of the zone size.
+#
+# Skipped unless the device is zoned. Returns fio's exit status.
+test57() {
+ local bs off
+
+ require_zbd || return $SKIP_TESTCASE
+
+ prep_write
+ # 7 * 4096 = 28672 bytes is used as the block size.
+ bs=$((4096 * 7))
+ off=$((first_sequential_zone_sector * 512))
+
+ run_fio --name=job --filename="${dev}" --rw=randwrite --bs="${bs}" \
+ --offset="${off}" --size=$((4 * zone_size)) --iodepth=256 \
+ "$(ioengine "libaio")" --time_based=1 --runtime=30s \
+ --zonemode=zbd --direct=1 --zonesize="${zone_size}" \
+ ${job_var_opts[@]} \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Random writes and random trims to sequential write required zones for 30s.
+#
+# Skipped unless at least 128 sequential zones are available. The function
+# returns the exit status of the fio invocation, which is its last command.
+test58() {
+ local off size bs
+
+ require_seq_zones 128 || return $SKIP_TESTCASE
+
+ size=$((zone_size * 128))
+ bs="$(max $((zone_size / 128)) "$min_seq_write_size")"
+ prep_write
+ off=$((first_sequential_zone_sector * 512))
+ # A single fio invocation with three jobs:
+ # - "precondition" sequentially writes the first 16 zones of the range;
+ # - "wjob" and "trimjob" both wait for "precondition", then run random
+ # writes and zone-sized random trims over the 128-zone range for 30s.
+ run_fio --zonemode=zbd --direct=1 --zonesize="${zone_size}" --thread=1 \
+ --filename="${dev}" --norandommap=1 \
+ --name="precondition" --rw=write "$(ioengine "psync")" \
+ --offset="${off}" --size=$((zone_size * 16)) --bs="${bs}" \
+ "${job_var_opts[@]}" \
+ --name=wjob --wait_for="precondition" --rw=randwrite \
+ "$(ioengine "libaio")" --iodepth=8 \
+ --offset="${off}" --size="${size}" --bs="${bs}" \
+ --time_based --runtime=30s --flow=128 "${job_var_opts[@]}" \
+ --name=trimjob --wait_for="precondition" --rw=randtrim \
+ "$(ioengine "psync")" \
+ --offset="${off}" --size="${size}" --bs="${zone_size}" \
+ --time_based --runtime=30s --flow=1 "${job_var_opts[@]}" \
+ >>"${logfile}.${test_number}" 2>&1
+}
+
+# Test zone_reset_threshold with verify.
+#
+# Runs one fio job per workload (write, randwrite, rw, randrw) and stops at
+# the first one that fails, returning its exit status.
+test59() {
+ local off bs loops=2 size=$((zone_size)) w
+ local -a workloads=(write randwrite rw randrw)
+
+ prep_write
+ # NOTE(review): "off" is assigned here but not referenced by any command
+ # in this function.
+ off=$((first_sequential_zone_sector * 512))
+
+ # Block size is 256 KiB, capped at the zone size.
+ bs=$(min $((256*1024)) "$zone_size")
+ for w in "${workloads[@]}"; do
+ run_fio_on_seq "$(ioengine "psync")" --rw=${w} --bs="$bs" \
+ --size=$size --loops=$loops --do_verify=1 \
+ --verify=md5 --zone_reset_frequency=.9 \
+ --zone_reset_threshold=.1 \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ done
+}
+
+# Test fio errors out experimental_verify option with zonemode=zbd.
+#
+# fio is expected to fail: if it exits successfully the test returns 1.
+# Otherwise the result is whether the expected message appears in the log.
+test60() {
+ run_fio_on_seq "$(ioengine "psync")" --rw=write --size=$zone_size \
+ --do_verify=1 --verify=md5 --experimental_verify=1 \
+ >> "${logfile}.${test_number}" 2>&1 && return 1
+ grep -q 'not support experimental verify' "${logfile}.${test_number}"
+}
+
+# Test fio errors out zone_reset_threshold option for multiple jobs with
+# different write ranges.
+#
+# fio is expected to fail: if it exits successfully the test returns 1.
+# Otherwise the result is whether the expected message appears in the log.
+test61() {
+ run_fio_on_seq "$(ioengine "psync")" --rw=write --size="$zone_size" \
+ --numjobs=2 --offset_increment="$zone_size" \
+ --zone_reset_threshold=0.1 --zone_reset_frequency=1 \
+ --exitall_on_error=1 \
+ >> "${logfile}.${test_number}" 2>&1 && return 1
+ grep -q 'different write ranges' "${logfile}.${test_number}"
+}
+
+# Test zone_reset_threshold option works for multiple jobs with same write
+# range.
+#
+# Passes when fio succeeds, the written byte count equals
+# size * loops * 2 (two jobs), and exactly 7 zone resets were counted.
+test62() {
+ local bs loops=2 size=$((zone_size))
+
+ # On zoned devices, reset all zones before the run.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ # Two jobs write to single zone twice. Reset zone happens at next write
+ # after half of the zone gets filled. So 2 * 2 * 2 - 1 = 7 times zone
+ # resets are expected.
+ bs=$(min $((256*1024)) $((zone_size / 4)))
+ run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \
+ --size=$size --loops=$loops --numjobs=2 \
+ --zone_reset_frequency=1 --zone_reset_threshold=.5 \
+ --group_reporting=1 \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ check_written $((size * loops * 2)) || return $?
+ check_reset_count -eq 7 || return $?
+}
+
+# Test zone_reset_threshold option works for a read job and a write job with
+# different IO range.
+#
+# Passes when fio succeeds, the written byte count equals size * loops, and
+# exactly 3 zone resets were counted.
+test63() {
+ local bs loops=2 size=$((zone_size)) off1 off2
+
+ # On zoned devices, reset all zones before the run.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ # off1 is the first sequential zone (read job); off2 is the zone right
+ # after it (write job).
+ off1=$((first_sequential_zone_sector * 512))
+ off2=$((off1 + zone_size))
+ bs=$(min $((256*1024)) $((zone_size / 4)))
+
+ # One job writes to single zone twice. Reset zone happens at next write
+ # after half of the zone gets filled. So 2 * 2 - 1 = 3 times zone resets
+ # are expected.
+ run_fio "$(ioengine "psync")" --bs="$bs" --size=$size --loops=$loops \
+ --filename="$dev" --group_reporting=1 \
+ --zonemode=zbd --zonesize="$zone_size" --direct=1 \
+ --zone_reset_frequency=1 --zone_reset_threshold=.5 \
+ --name=r --rw=read --offset=$off1 "${job_var_opts[@]}" \
+ --name=w --rw=write --offset=$off2 "${job_var_opts[@]}" \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ check_written $((size * loops)) || return $?
+ check_reset_count -eq 3 || return $?
+}
+
+# Test write zone accounting handles almost full zones correctly. Prepare an
+# almost full, but not full zone. Write to the zone with verify using larger
+# block size. Then confirm fio does not report write zone accounting failure.
+#
+# Returns the exit status of the first fio run that fails, or of the second
+# run otherwise.
+test64() {
+ local bs cap
+
+ # On zoned devices, reset all zones before the run.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ # First run: write (cap - bs) bytes, i.e. stop one block (1/8 of the zone
+ # size) short of the capacity reported for the first sequential zone.
+ bs=$((zone_size / 8))
+ cap=$(total_zone_capacity 1 $((first_sequential_zone_sector*512)) $dev)
+ run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \
+ --size=$((zone_size)) \
+ --io_size=$((cap - bs)) \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ # Second run: write with verify using a larger block (half the zone size).
+ bs=$((zone_size / 2))
+ run_fio_on_seq "$(ioengine "psync")" --rw=write --bs="$bs" \
+ --size=$((zone_size)) --do_verify=1 --verify=md5 \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+}
+
+# Test open zone accounting handles trim workload correctly. Prepare open zones
+# as many as max_open_zones=4. Trim one of the 4 zones. Then write to another
+# zone and check the write amount is expected size.
+#
+# The exit status of fio itself is not checked; the function returns the
+# result of check_written.
+test65() {
+ local off capacity
+
+ # On zoned devices, reset all zones before the run.
+ [ -n "$is_zbd" ] && reset_zone "$dev" -1
+
+ off=$((first_sequential_zone_sector * 512))
+ capacity=$(total_zone_capacity 1 $off "$dev")
+ # Three jobs chained with --wait_for:
+ # prep_open_zones: random 4 KiB writes over 4 zones, zone_size bytes total;
+ # trimjob: trims the first of those zones;
+ # write: sequential writes to the zone following the 4 zones.
+ run_fio --zonemode=zbd --direct=1 --zonesize="$zone_size" --thread=1 \
+ --filename="$dev" --group_reporting=1 --max_open_zones=4 \
+ "$(ioengine "psync")" \
+ --name="prep_open_zones" --rw=randwrite --offset="$off" \
+ --size="$((zone_size * 4))" --bs=4096 --io_size="$zone_size" \
+ --name=trimjob --wait_for="prep_open_zones" --rw=trim \
+ --bs="$zone_size" --offset="$off" --size="$zone_size" \
+ --name=write --wait_for="trimjob" --rw=write --bs=4096 \
+ --offset="$((off + zone_size * 4))" --size="$zone_size" \
+ >> "${logfile}.${test_number}" 2>&1
+
+ # Expected total: zone_size bytes from prep_open_zones plus the capacity
+ # of one zone from the write job.
+ # NOTE(review): capacity is queried for the zone at $off, not for the
+ # zone the write job targets - presumably all zones share one capacity.
+ check_written $((zone_size + capacity))
+}
+
+# Test closed zones are handled as open zones. This test case requires zoned
+# block devices which has same max_open_zones and max_active_zones.
+#
+# Skipped unless the device is zoned, has a max_active_zones limit of at
+# least 2, a matching max_open_zones limit, and max_active_zones * 16
+# sequential zones. Returns the exit status of the final fio run.
+test66() {
+ local i off
+
+ require_zbd || return $SKIP_TESTCASE
+ require_max_active_zones 2 || return $SKIP_TESTCASE
+ require_max_open_zones "${max_active_zones}" || return $SKIP_TESTCASE
+ require_seq_zones $((max_active_zones * 16)) || return $SKIP_TESTCASE
+
+ reset_zone "$dev" -1
+
+ # Prepare max_active_zones in closed condition.
+ # The option is spelled out in full as --zonemode rather than relying on
+ # a truncated spelling of the option name being accepted.
+ off=$((first_sequential_zone_sector * 512))
+ run_fio --name=w --filename="$dev" --zonemode=zbd --direct=1 \
+ --offset=$((off)) --zonesize="${zone_size}" --rw=randwrite \
+ --bs=4096 --size="$((zone_size * max_active_zones))" \
+ --io_size="${zone_size}" "$(ioengine "psync")" \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ for ((i = 0; i < max_active_zones; i++)); do
+ close_zone "$dev" $((off / 512)) || return $?
+ off=$((off + zone_size))
+ done
+
+ # Run random write to the closed zones and empty zones. This confirms
+ # that fio handles closed zones as write target open zones. Otherwise,
+ # fio writes to the empty zones and hit the max_active_zones limit.
+ off=$((first_sequential_zone_sector * 512))
+ run_one_fio_job --zonemode=zbd --direct=1 \
+ "$(ioengine "psync")" --rw=randwrite --bs=4096 \
+ --max_open_zones="$max_active_zones" --offset=$((off)) \
+ --size=$((max_active_zones * 16 * zone_size)) \
+ --io_size=$((zone_size)) --zonesize="${zone_size}" \
+ --time_based --runtime=5s \
+ >> "${logfile}.${test_number}" 2>&1
+}
+
+# Test max_active_zones limit failure is reported with good error message.
+#
+# Skipped unless the device is zoned, has a max_active_zones limit of at
+# least 2, a matching max_open_zones limit, and max_active_zones + 1
+# sequential zones. Passes only when the second fio run fails AND the
+# expected error message is found in the log.
+test67() {
+ local off
+
+ require_zbd || return $SKIP_TESTCASE
+ require_max_active_zones 2 || return $SKIP_TESTCASE
+ require_max_open_zones "${max_active_zones}" || return $SKIP_TESTCASE
+ require_seq_zones $((max_active_zones + 1)) || return $SKIP_TESTCASE
+
+ reset_zone "$dev" -1
+
+ # Prepare max_active_zones in open condition.
+ off=$((first_sequential_zone_sector * 512))
+ run_fio --name=w --filename="$dev" --zonemode=zbd --direct=1 \
+ --offset=$((off)) --zonesize="${zone_size}" --rw=randwrite \
+ --bs=4096 --size="$((zone_size * max_active_zones))" \
+ --io_size="${zone_size}" "$(ioengine "psync")" \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+
+ # Write to another zone and trigger max_active_zones limit error.
+ # fio must fail here. If it succeeds the test has failed, so return 1
+ # explicitly: "$?" would be 0 at that point and report a pass.
+ off=$((off + zone_size * max_active_zones))
+ run_one_fio_job --zonemode=zbd --direct=1 "$(ioengine "psync")" \
+ --rw=write --bs=$min_seq_write_size --offset=$((off)) \
+ --size=$((zone_size)) --zonesize="${zone_size}" \
+ >> "${logfile}.${test_number}" 2>&1 && return 1
+ grep -q 'Exceeded max_active_zones limit' "${logfile}.${test_number}"
+}
+
+# Test rw=randrw and rwmixwrite=0 options do not issue write I/O unit
+#
+# Skipped unless the device is zoned. Passes when both fio runs succeed and
+# the log contains exactly one line matching "WRITE:".
+test68() {
+ local off size
+
+ require_zbd || return "$SKIP_TESTCASE"
+
+ reset_zone "${dev}" -1
+
+ # Write some data as preparation
+ off=$((first_sequential_zone_sector * 512))
+ size=$min_seq_write_size
+ run_one_fio_job "$(ioengine "psync")" --rw=write --offset="$off" \
+ --io_size="$size" --zonemode=strided \
+ --zonesize="$zone_size" --zonerange="$zone_size" \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ # Run random mixed read and write specifying zero write ratio
+ run_fio_on_seq "$(ioengine "psync")" --rw=randrw --rwmixwrite=0 \
+ --time_based --runtime=1s \
+ >> "${logfile}.${test_number}" 2>&1 || return $?
+ # "WRITE:" shall be recorded only once for the preparation
+ [[ $(grep -c "WRITE:" "${logfile}.${test_number}") == 1 ]]
+}
+
+SECONDS=0
tests=()
dynamic_analyzer=()
reset_all_zones=
+reset_before_write=
use_libzbc=
zbd_debug=
max_open_zones_opt=
+quit_on_err=
+force_io_uring=
+start_test=1
while [ "${1#-}" != "$1" ]; do
case "$1" in
shift;;
-l) use_libzbc=1; shift;;
-r) reset_all_zones=1; shift;;
+ -w) reset_before_write=1; shift;;
-t) tests+=("$2"); shift; shift;;
-o) max_open_zones_opt="${2}"; shift; shift;;
+ -s) start_test=$2; shift; shift;;
-v) dynamic_analyzer=(valgrind "--read-var-info=yes");
shift;;
+ -q) quit_on_err=1; shift;;
-z) zbd_debug=1; shift;;
+ -u) force_io_uring=1; shift;;
--) shift; break;;
+ *) usage; exit 1;;
esac
done
exit 1
fi
+if [ -n "$use_libzbc" -a -n "$force_io_uring" ]; then
+ echo "Please specify only one of -l and -u options"
+ exit 1
+fi
+
# shellcheck source=functions
source "$(dirname "$0")/functions" || exit $?
realsysfs=$(readlink "/sys/dev/block/$major:$minor")
basename=$(basename "${realsysfs%/*}")
fi
- logical_block_size=$(<"/sys/block/$basename/queue/logical_block_size")
+ min_seq_write_size=$(min_seq_write_size "$basename")
case "$(<"/sys/class/block/$basename/queue/zoned")" in
host-managed|host-aware)
is_zbd=true
first_sequential_zone_sector=${result[0]}
sectors_per_zone=${result[1]}
zone_size=$((sectors_per_zone * 512))
+ unrestricted_reads=$(urswrz "$dev")
if ! max_open_zones=$(max_open_zones "$dev"); then
echo "Failed to determine maximum number of open zones"
exit 1
fi
+ max_active_zones=$(max_active_zones "$dev")
set_io_scheduler "$basename" deadline || exit $?
if [ -n "$reset_all_zones" ]; then
reset_zone "$dev" -1
;;
*)
first_sequential_zone_sector=$(((disk_size / 2) &
- (logical_block_size - 1)))
- zone_size=$(max 65536 "$logical_block_size")
+ (min_seq_write_size - 1)))
+ zone_size=$(max 65536 "$min_seq_write_size")
sectors_per_zone=$((zone_size / 512))
max_open_zones=128
+ max_active_zones=0
+ unrestricted_reads=1
set_io_scheduler "$basename" none || exit $?
;;
esac
+
elif [[ -c "$realdev" ]]; then
# For an SG node, we must have libzbc option specified
if [[ ! -n "$use_libzbc" ]]; then
echo "Failed to determine disk size"
exit 1
fi
- if ! logical_block_size=($(zbc_logical_block_size "$dev")); then
- echo "Failed to determine logical block size"
+ if ! min_seq_write_size=($(zbc_physical_block_size "$dev")); then
+ echo "Failed to determine physical block size"
exit 1
fi
if ! result=($(first_sequential_zone "$dev")); then
first_sequential_zone_sector=${result[0]}
sectors_per_zone=${result[1]}
zone_size=$((sectors_per_zone * 512))
+ unrestricted_reads=$(urswrz "$dev")
if ! max_open_zones=$(max_open_zones "$dev"); then
echo "Failed to determine maximum number of open zones"
exit 1
fi
+ max_active_zones=0
if [ -n "$reset_all_zones" ]; then
reset_zone "$dev" -1
fi
if [[ -n ${max_open_zones_opt} ]]; then
# Override max_open_zones with the script option value
max_open_zones="${max_open_zones_opt}"
+ global_var_opts+=("--ignore_zone_limits=1")
job_var_opts+=("--max_open_zones=${max_open_zones_opt}")
fi
echo -n "First sequential zone starts at sector $first_sequential_zone_sector;"
echo " zone size: $((zone_size >> 20)) MB"
+zone_cap_bs=$(zone_cap_bs "$dev" "$zone_size")
+
if [ "${#tests[@]}" = 0 ]; then
readarray -t tests < <(declare -F | grep "test[0-9]*" | \
tr -c -d "[:digit:]\n" | sort -n)
logfile=$0.log
passed=0
+skipped=0
failed=0
if [ -t 1 ]; then
red="\e[1;31m"
green="\e[1;32m"
+ cyan="\e[1;36m"
end="\e[m"
else
red=""
intr=0
trap 'intr=1' SIGINT
+ret=0
for test_number in "${tests[@]}"; do
+ [ "${test_number}" -lt "${start_test}" ] && continue
rm -f "${logfile}.${test_number}"
+ unset SKIP_REASON
echo -n "Running test $(printf "%02d" $test_number) ... "
- if eval "test$test_number" && check_log $test_number; then
+ eval "test$test_number"
+ ret=$?
+ if ((!ret)) && check_log $test_number; then
status="PASS"
cc_status="${green}${status}${end}"
((passed++))
+ elif ((ret==SKIP_TESTCASE)); then
+ status="SKIP"
+ echo "${SKIP_REASON}" >> "${logfile}.${test_number}"
+ cc_status="${cyan}${status}${end} ${SKIP_REASON}"
+ ((skipped++))
else
status="FAIL"
cc_status="${red}${status}${end}"
echo -e "$cc_status"
echo "$status" >> "${logfile}.${test_number}"
[ $intr -ne 0 ] && exit 1
+ [ -n "$quit_on_err" -a "$rc" -ne 0 ] && exit 1
done
echo "$passed tests passed"
+if [ $skipped -gt 0 ]; then
+ echo " $skipped tests skipped"
+fi
if [ $failed -gt 0 ]; then
- echo " and $failed tests failed"
+ echo " $failed tests failed"
fi
+echo "Run time: $(TZ=UTC0 printf "%(%H:%M:%S)T\n" $(( SECONDS )) )"
exit $rc
if (!td->o.ignore_error[etype]) {
td->o.ignore_error[etype] = __NON_FATAL_ERR;
- td->o.ignore_error_nr[etype] = ARRAY_SIZE(__NON_FATAL_ERR);
+ td->o.ignore_error_nr[etype] = FIO_ARRAY_SIZE(__NON_FATAL_ERR);
}
if (!(td->o.continue_on_error & (1 << etype)))
MEM_CUDA_MALLOC,/* use GPU memory */
};
+/*
+ * What mode to use for deduped data generation
+ *
+ * NOTE(review): the enumerator values are explicit (0 and 1); presumably
+ * they are stored in or compared against an option value - confirm before
+ * renumbering.
+ */
+enum dedupe_mode {
+ DEDUPE_MODE_REPEAT = 0,
+ DEDUPE_MODE_WORKING_SET = 1,
+};
+
#define ERROR_STR_MAX 128
#define BSSPLIT_MAX 64
#define ZONESPLIT_MAX 256
+struct split { /* argument type of split_parse_ddir(); holds two parallel value arrays */
+ unsigned int nr; /* NOTE(review): presumably the number of used val1[]/val2[] slots -- confirm */
+ unsigned long long val1[ZONESPLIT_MAX]; /* room for ZONESPLIT_MAX (256) entries */
+ unsigned long long val2[ZONESPLIT_MAX]; /* same capacity as val1[] */
+};
+
+struct split_prio { /* element type of the entries array used by split_parse_prio_ddir() */
+ uint64_t bs; /* NOTE(review): presumably a block size, as in struct bssplit -- confirm */
+ int32_t prio; /* the only signed member of this struct */
+ uint32_t perc; /* NOTE(review): presumably a percentage, as in struct bssplit -- confirm */
+};
+
struct bssplit {
uint64_t bs;
uint32_t perc;
unsigned long long size;
unsigned long long io_size;
unsigned int size_percent;
+ unsigned int size_nz;
unsigned int io_size_percent;
+ unsigned int io_size_nz;
unsigned int fill_device;
unsigned int file_append;
unsigned long long file_size_low;
unsigned long long file_size_high;
unsigned long long start_offset;
unsigned long long start_offset_align;
+ unsigned int start_offset_nz;
unsigned long long bs[DDIR_RWDIR_CNT];
unsigned long long ba[DDIR_RWDIR_CNT];
unsigned int do_verify;
unsigned int verify_interval;
unsigned int verify_offset;
- char verify_pattern[MAX_PATTERN_SIZE];
+ char *verify_pattern;
unsigned int verify_pattern_bytes;
struct pattern_fmt verify_fmt[8];
unsigned int verify_fmt_sz;
unsigned int do_disk_util;
unsigned int override_sync;
unsigned int rand_repeatable;
- unsigned int allrand_repeatable;
unsigned long long rand_seed;
unsigned int log_avg_msec;
unsigned int log_hist_msec;
unsigned int log_offset;
unsigned int log_gz;
unsigned int log_gz_store;
- unsigned int log_unix_epoch;
+ unsigned int log_alternate_epoch;
+ unsigned int log_alternate_epoch_clock_id;
unsigned int norandommap;
unsigned int softrandommap;
unsigned int bs_unaligned;
fio_fp64_t zipf_theta;
fio_fp64_t pareto_h;
fio_fp64_t gauss_dev;
+ fio_fp64_t random_center;
unsigned int random_generator;
unsigned int hugepage_size;
unsigned long long rw_min_bs;
- unsigned int thinktime;
- unsigned int thinktime_spin;
- unsigned int thinktime_blocks;
unsigned int fsync_blocks;
unsigned int fdatasync_blocks;
unsigned int barrier_blocks;
fio_fp64_t ss_limit;
unsigned long long ss_dur;
unsigned long long ss_ramp_time;
+ unsigned long long ss_check_interval;
unsigned int overwrite;
unsigned int bw_avg_time;
unsigned int iops_avg_time;
unsigned long long zone_size;
unsigned long long zone_capacity;
unsigned long long zone_skip;
+ uint32_t zone_skip_nz;
enum fio_zone_mode zone_mode;
unsigned long long lockmem;
enum fio_memtype mem_type;
unsigned int mem_align;
- unsigned long long max_latency;
+ unsigned long long max_latency[DDIR_RWDIR_CNT];
unsigned int exit_what;
unsigned int stonewall;
unsigned int nice;
unsigned int ioprio;
unsigned int ioprio_class;
+ unsigned int ioprio_hint;
unsigned int file_service_type;
unsigned int group_reporting;
unsigned int stats;
unsigned int zero_buffers;
unsigned int refill_buffers;
unsigned int scramble_buffers;
- char buffer_pattern[MAX_PATTERN_SIZE];
+ char *buffer_pattern;
unsigned int buffer_pattern_bytes;
unsigned int compress_percentage;
unsigned int compress_chunk;
unsigned int dedupe_percentage;
+ unsigned int dedupe_mode;
+ unsigned int dedupe_working_set_percentage;
+ unsigned int dedupe_global;
unsigned int time_based;
unsigned int disable_lat;
unsigned int disable_clat;
unsigned int unified_rw_rep;
unsigned int gtod_reduce;
unsigned int gtod_cpu;
+ unsigned int job_start_clock_id;
enum fio_cs clocksource;
unsigned int no_stall;
unsigned int trim_percentage;
char *exec_prerun;
char *exec_postrun;
+ unsigned int thinkcycles;
+
+ unsigned int thinktime;
+ unsigned int thinktime_spin;
+ unsigned int thinktime_blocks;
+ unsigned int thinktime_blocks_type;
+ unsigned int thinktime_iotime;
+
uint64_t rate[DDIR_RWDIR_CNT];
uint64_t ratemin[DDIR_RWDIR_CNT];
unsigned int ratecycle;
unsigned int gid;
unsigned int offset_increment_percent;
+ unsigned int offset_increment_nz;
unsigned long long offset_increment;
unsigned long long number_ios;
+ unsigned int num_range;
+
unsigned int sync_file_range;
unsigned long long latency_target;
unsigned long long latency_window;
- fio_fp64_t latency_percentile;
uint32_t latency_run;
+ fio_fp64_t latency_percentile;
/*
* flow support
unsigned int read_beyond_wp;
int max_open_zones;
unsigned int job_max_open_zones;
+ unsigned int ignore_zone_limits;
fio_fp64_t zrt;
fio_fp64_t zrf;
+
+ unsigned int fdp;
+ unsigned int dp_type;
+ unsigned int dp_id_select;
+ unsigned int dp_ids[FIO_MAX_DP_IDS];
+ unsigned int dp_nr_ids;
+
+ unsigned int log_entries;
+ unsigned int log_prio;
};
#define FIO_TOP_STR_MAX 256
uint32_t iodepth_batch_complete_min;
uint32_t iodepth_batch_complete_max;
uint32_t serialize_overlap;
- uint32_t pad;
uint64_t size;
uint64_t io_size;
uint32_t size_percent;
+ uint32_t size_nz;
uint32_t io_size_percent;
+ uint32_t io_size_nz;
uint32_t fill_device;
uint32_t file_append;
uint32_t unique_filename;
uint64_t file_size_high;
uint64_t start_offset;
uint64_t start_offset_align;
+ uint32_t start_offset_nz;
uint64_t bs[DDIR_RWDIR_CNT];
uint64_t ba[DDIR_RWDIR_CNT];
uint32_t do_verify;
uint32_t verify_interval;
uint32_t verify_offset;
- uint8_t verify_pattern[MAX_PATTERN_SIZE];
uint32_t verify_pattern_bytes;
uint32_t verify_fatal;
uint32_t verify_dump;
uint32_t do_disk_util;
uint32_t override_sync;
uint32_t rand_repeatable;
- uint32_t allrand_repeatable;
- uint32_t pad2;
uint64_t rand_seed;
uint32_t log_avg_msec;
uint32_t log_hist_msec;
uint32_t log_offset;
uint32_t log_gz;
uint32_t log_gz_store;
- uint32_t log_unix_epoch;
+ uint32_t log_alternate_epoch;
+ uint32_t log_alternate_epoch_clock_id;
uint32_t norandommap;
uint32_t softrandommap;
uint32_t bs_unaligned;
struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
uint32_t zone_split_nr[DDIR_RWDIR_CNT];
- uint8_t pad1[4];
-
fio_fp64_t zipf_theta;
fio_fp64_t pareto_h;
fio_fp64_t gauss_dev;
+ fio_fp64_t random_center;
uint32_t random_generator;
uint32_t hugepage_size;
uint64_t rw_min_bs;
- uint32_t thinktime;
- uint32_t thinktime_spin;
- uint32_t thinktime_blocks;
uint32_t fsync_blocks;
uint32_t fdatasync_blocks;
uint32_t barrier_blocks;
uint64_t ss_ramp_time;
uint32_t ss_state;
fio_fp64_t ss_limit;
+ uint64_t ss_check_interval;
uint32_t overwrite;
uint32_t bw_avg_time;
uint32_t iops_avg_time;
uint64_t zone_capacity;
uint64_t zone_skip;
uint64_t lockmem;
+ uint32_t zone_skip_nz;
uint32_t mem_type;
uint32_t mem_align;
uint32_t stonewall;
uint32_t new_group;
uint32_t numjobs;
+
/*
* We currently can't convert these, so don't enable them
*/
uint32_t nice;
uint32_t ioprio;
uint32_t ioprio_class;
+ uint32_t ioprio_hint;
uint32_t file_service_type;
uint32_t group_reporting;
uint32_t stats;
uint32_t zero_buffers;
uint32_t refill_buffers;
uint32_t scramble_buffers;
- uint8_t buffer_pattern[MAX_PATTERN_SIZE];
uint32_t buffer_pattern_bytes;
uint32_t compress_percentage;
uint32_t compress_chunk;
uint32_t dedupe_percentage;
+ uint32_t dedupe_mode;
+ uint32_t dedupe_working_set_percentage;
+ uint32_t dedupe_global;
uint32_t time_based;
uint32_t disable_lat;
uint32_t disable_clat;
uint32_t unified_rw_rep;
uint32_t gtod_reduce;
uint32_t gtod_cpu;
+ uint32_t job_start_clock_id;
uint32_t clocksource;
uint32_t no_stall;
uint32_t trim_percentage;
uint32_t lat_percentiles;
uint32_t slat_percentiles;
uint32_t percentile_precision;
+ uint32_t pad;
fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
uint8_t read_iolog_file[FIO_TOP_STR_MAX];
uint8_t exec_prerun[FIO_TOP_STR_MAX];
uint8_t exec_postrun[FIO_TOP_STR_MAX];
+ uint32_t thinkcycles;
+
+ uint32_t thinktime;
+ uint32_t thinktime_spin;
+ uint32_t thinktime_blocks;
+ uint32_t thinktime_blocks_type;
+ uint32_t thinktime_iotime;
+
uint64_t rate[DDIR_RWDIR_CNT];
uint64_t ratemin[DDIR_RWDIR_CNT];
uint32_t ratecycle;
uint32_t gid;
uint32_t offset_increment_percent;
+ uint32_t offset_increment_nz;
uint64_t offset_increment;
uint64_t number_ios;
uint64_t latency_target;
uint64_t latency_window;
- uint64_t max_latency;
- fio_fp64_t latency_percentile;
+ uint64_t max_latency[DDIR_RWDIR_CNT];
uint32_t latency_run;
+ fio_fp64_t latency_percentile;
/*
* flow support
uint32_t allow_mounted_write;
uint32_t zone_mode;
+ int32_t max_open_zones;
+ uint32_t ignore_zone_limits;
+
+ uint32_t log_entries;
+ uint32_t log_prio;
+
+ uint32_t fdp;
+ uint32_t dp_type;
+ uint32_t dp_id_select;
+ uint32_t dp_ids[FIO_MAX_DP_IDS];
+ uint32_t dp_nr_ids;
+
+ uint32_t num_range;
+ /*
+ * verify_pattern followed by buffer_pattern from the unpacked struct
+ */
+ uint8_t patterns[];
} __attribute__((packed));
-extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
+extern int convert_thread_options_to_cpu(struct thread_options *o,
+ struct thread_options_pack *top, size_t top_sz);
+extern size_t thread_options_pack_size(struct thread_options *o);
extern void convert_thread_options_to_net(struct thread_options_pack *top, struct thread_options *);
extern int fio_test_cconv(struct thread_options *);
extern void options_default_fill(struct thread_options *o);
+typedef int (split_parse_fn)(struct thread_options *, void *,
+ enum fio_ddir, char *, bool);
+
+extern int str_split_parse(struct thread_data *td, char *str,
+ split_parse_fn *fn, void *eo, bool data);
+
+extern int split_parse_ddir(struct thread_options *o, struct split *split,
+ char *str, bool absolute, unsigned int max_splits);
+
+extern int split_parse_prio_ddir(struct thread_options *o,
+ struct split_prio **entries, int *nr_entries,
+ char *str);
+
#endif
return t;
}
+/*
+ * Busy loop: execute 'nop' once for each of the n requested iterations.
+ */
+void cycles_spin(unsigned int n)
+{
+	unsigned long left;
+
+	/* count down instead of up; the number of nop executions is unchanged */
+	for (left = n; left > 0; left--)
+		nop;
+}
+
uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
{
struct timespec req;
fio_gettime(&genesis, NULL);
}
-void set_epoch_time(struct thread_data *td, int log_unix_epoch)
+void set_epoch_time(struct thread_data *td, clockid_t log_alternate_epoch_clock_id, clockid_t job_start_clock_id)
{
+ struct timespec ts;
fio_gettime(&td->epoch, NULL);
- if (log_unix_epoch) {
- struct timeval tv;
- gettimeofday(&tv, NULL);
- td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 +
- (unsigned long long)(tv.tv_usec) / 1000;
+ clock_gettime(log_alternate_epoch_clock_id, &ts);
+ td->alternate_epoch = (unsigned long long)(ts.tv_sec) * 1000 +
+ (unsigned long long)(ts.tv_nsec) / 1000000;
+ if (job_start_clock_id == log_alternate_epoch_clock_id)
+ {
+ td->job_start = td->alternate_epoch;
+ }
+ else
+ {
+ clock_gettime(job_start_clock_id, &ts);
+ td->job_start = (unsigned long long)(ts.tv_sec) * 1000 +
+ (unsigned long long)(ts.tv_nsec) / 1000000;
}
}
exit 1
fi
-GNUPLOT=$(which gnuplot)
+GNUPLOT=$(command -v gnuplot)
if [ ! -x "$GNUPLOT" ]
then
echo You need gnuplot installed to generate graphs
Returns:
True if the indices do not yet point to the end of each bin in bins.
- False if the indices point beyond their repsective bins.
+ False if the indices point beyond their respective bins.
"""
for key, value in six.iteritems(indices):
def get_csvfile(dest, jobnum):
"""Generate CSV filename from command-line arguments and job numbers.
- Paramaters:
+ Parameters:
dest file specification for CSV filename.
jobnum job number.
--- /dev/null
+[fio_jobs]
+header=<<B><font color="{}"> {} </font></B> >
+header_color=black
+text_color=darkgreen
+shape=box
+shape_color=blue
+style=rounded
+title_style=<<table border='0' cellborder='0' cellspacing='1'> <tr> <td align='center'> <b> {} </b> </td> </tr>
+item_style=<tr> <td align = "left"> <font color="{}" > {} </font> </td> </tr>
+cluster_style=filled
+cluster_color=gainsboro
+
+[exec_prerun]
+text_color=red
+
+[exec_postrun]
+text_color=red
+
+[numjobs]
+text_color=red
+style=<font color="{}" > x {} </font>
+
+[ioengine]
+text_color=darkblue
+specific_options_color=darkblue
+
+# definitions of engine's specific options
+
+[ioengine_cpuio]
+specific_options=cpuload cpumode cpuchunks exit_on_io_done
+
+[ioengine_dfs]
+specific_options=pool cont chunk_size object_class svcl
+
+[ioengine_e4defrag]
+specific_options=donorname inplace
+
+[ioengine_exec]
+specific_options=program arguments grace_time std_redirect
+
+[ioengine_filestat]
+specific_options=stat_type
+
+[ioengine_single-instance]
+specific_options=volume brick
+
+[ioengine_http]
+specific_options=https http_host http_user http_pass http_s3_key http_s3_keyid http_swift_auth_token http_s3_region http_mode http_verbose http_s3_storage_class http_s3_sse_customer_key http_s3_sse_customer_algorithm
+
+[ioengine_ime_aio]
+specific_options=ime_psync ime_psyncv
+
+[ioengine_io_uring]
+specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored uncached nowait force_async
+
+[ioengine_io_uring_cmd]
+specific_options=hipri cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit fixedbufs registerfiles sqthread_poll sqthread_poll_cpu nonvectored uncached nowait force_async cmd_type md_per_io_size pi_act pi_chk apptag apptag_mask
+
+[ioengine_libaio]
+specific_options=userspace_reap cmdprio_percentage cmdprio_class cmdprio cmdprio_bssplit nowait
+
+[ioengine_libblkio]
+specific_options=libblkio_driver libblkio_path libblkio_pre_connect_props libblkio_num_entries libblkio_queue_size libblkio_pre_start_props hipri libblkio_vectored libblkio_write_zeroes_on_trim libblkio_wait_mode libblkio_force_enable_completion_eventfd
+
+[ioengine_libcufile]
+specific_options=gpu_dev_ids cuda_io
+
+[ioengine_libhdfs]
+specific_options=namenode hostname port hdfsdirectory chunk_size single_instance hdfs_use_direct
+
+[ioengine_libiscsi]
+specific_options=initiator
+
+[ioengine_librpma_apm_server]
+specific_options=librpma_apm_client
+
+[ioengine_busy_wait_polling]
+specific_options=serverip port direct_write_to_pmem
+
+[ioengine_librpma_gpspm_server]
+specific_options=librpma_gpspm_client
+
+[ioengine_mmap]
+specific_options=thp
+
+[ioengine_mtd]
+specific_options=skip_bad
+
+[ioengine_nbd]
+specific_options=uri
+
+[ioengine_net]
+specific_options=hostname port protocol nodelay listen pingpong interface ttl window_size mss netsplice
+
+[ioengine_nfs]
+specific_options=nfs_url
+
+[ioengine_rados]
+specific_options=clustername pool clientname busy_poll touch_objects
+
+[ioengine_rbd]
+specific_options=clustername rbdname pool clientname busy_poll
+
+[ioengine_rdma]
+specific_options=hostname bindname port verb
+
+[ioengine_sg]
+specific_options=hipri readfua writefua sg_write_mode stream_id
+
+[ioengine_pvsync2]
+specific_options=hipri hipri_percentage uncached nowait sync psync vsync pvsync
+
+[ioengine_xnvme]
+specific_options=hipri sqthread_poll xnvme_be xnvme_async xnvme_sync xnvme_admin xnvme_dev_nsid xnvme_iovec
--- /dev/null
+#!/usr/bin/env python3
+import uuid
+import time
+import errno
+from graphviz import Digraph
+import argparse
+import configparser
+import os
+
+config_file = None
+fio_file = None
+
+
+def get_section_option(section_name, option_name, default=None):
+ global fio_file
+ if fio_file.has_option(section_name, option_name):
+ return fio_file[section_name][option_name]
+ return default
+
+
+def get_config_option(section_name, option_name, default=None):
+ global config_file
+ if config_file.has_option(section_name, option_name):
+ return config_file[section_name][option_name]
+ return default
+
+
+def get_header_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'header_color', default_color)
+
+
+def get_shape_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'shape_color', default_color)
+
+
+def get_text_color(keyword='fio_jobs', default_color='black'):
+ return get_config_option(keyword, 'text_color', default_color)
+
+
+def get_cluster_color(keyword='fio_jobs', default_color='gray92'):
+ return get_config_option(keyword, 'cluster_color', default_color)
+
+
+def get_header(keyword='fio_jobs'):
+ return get_config_option(keyword, 'header')
+
+
+def get_shape(keyword='fio_jobs'):
+ return get_config_option(keyword, 'shape', 'box')
+
+
+def get_style(keyword='fio_jobs'):
+ return get_config_option(keyword, 'style', 'rounded')
+
+
+def get_cluster_style(keyword='fio_jobs'):
+ return get_config_option(keyword, 'cluster_style', 'filled')
+
+
+def get_specific_options(engine):
+ if not engine:
+ return ''
+ return get_config_option('ioengine_{}'.format(engine), 'specific_options', '').split(' ')
+
+
+def render_option(section, label, display, option, color_override=None):
+ # These options are already shown with graphical helpers, no need to report them directly
+ skip_list = ['size', 'stonewall', 'runtime', 'time_based',
+ 'numjobs', 'wait_for', 'wait_for_previous']
+ # If the option doesn't exist or if a special handling is already done
+ # don't render it, just return the current state
+ if option in skip_list or option not in section:
+ return label, display
+ display = option
+ if section[option]:
+ display = '{} = {}'.format(display, section[option])
+
+ # Adding jobs's options into the box, darkgreen is the default color
+ if color_override:
+ color = color_override
+ else:
+ color = get_text_color(option, get_text_color('fio_jobs', 'darkgreen'))
+ label += get_config_option('fio_jobs',
+ 'item_style').format(color, display)
+ return label, display
+
+
+def render_options(fio_file, section_name):
+ """Render all options of a section."""
+ display = section_name
+ section = fio_file[section_name]
+
+ # Add a multiplier to the section_name if numjobs is set
+ numjobs = int(get_section_option(section_name, 'numjobs', '1'))
+ if numjobs > 1:
+ display = display + \
+ get_style('numjobs').format(
+ get_text_color('numjobs'), numjobs)
+
+ # Header of the box
+ label = get_config_option('fio_jobs', 'title_style').format(display)
+
+ # Let's parse all the options of the current fio thread
+ # Some needs to be printed on top or bottom of the job to ease the read
+ to_early_print = ['exec_prerun', 'ioengine']
+ to_late_print = ['exec_postrun']
+
+ # Let's print the options on top of the box
+ for early_print in to_early_print:
+ label, display = render_option(
+ section, label, display, early_print)
+
+ current_io_engine = get_section_option(
+ section_name, 'ioengine', None)
+ if current_io_engine:
+ # Let's print all specifics options for this engine
+ for specific_option in sorted(get_specific_options(current_io_engine)):
+ label, display = render_option(
+ section, label, display, specific_option, get_config_option('ioengine', 'specific_options_color'))
+
+ # Let's print generic options sorted by name
+ for option in sorted(section):
+ if option in to_early_print or option in to_late_print or option in get_specific_options(current_io_engine):
+ continue
+ label, display = render_option(section, label, display, option)
+
+ # let's print options on the bottom of the box
+ for late_print in to_late_print:
+ label, display = render_option(
+ section, label, display, late_print)
+
+ # End of the box content
+ label += '</table>>'
+ return label
+
+
+def render_section(current_graph, fio_file, section_name, label):
+ """Render the section."""
+ attr = None
+ section = fio_file[section_name]
+
+ # Let's render the box associated to a job
+ current_graph.node(section_name, label,
+ shape=get_shape(),
+ color=get_shape_color(),
+ style=get_style())
+
+ # Let's report the duration of the jobs with a self-loop arrow
+ if 'runtime' in section and 'time_based' in section:
+ attr = 'runtime={}'.format(section['runtime'])
+ elif 'size' in section:
+ attr = 'size={}'.format(section['size'])
+ if attr:
+ current_graph.edge(section_name, section_name, attr)
+
+
+def create_sub_graph(name):
+ """Return a new graph."""
+ # We need to put 'cluster' in the name to ensure graphviz consider it as a cluster
+ cluster_name = 'cluster_' + name
+ # Unset the main graph labels to avoid a recopy in each subgraph
+ attr = {}
+ attr['label'] = ''
+ new_graph = Digraph(name=cluster_name, graph_attr=attr)
+ new_graph.attr(style=get_cluster_style(),
+ color=get_cluster_color())
+ return new_graph
+
+
+def create_legend():
+ """Return a legend."""
+ html_table = "<<table border='0' cellborder='1' cellspacing='0' cellpadding='4'>"
+ html_table += '<tr><td COLSPAN="2"><b>Legend</b></td></tr>'
+ legend_item = '<tr> <td>{}</td> <td><font color="{}">{}</font></td></tr>"'
+ legend_bgcolor_item = '<tr><td>{}</td><td BGCOLOR="{}"></td></tr>'
+ html_table += legend_item.format('numjobs',
+ get_text_color('numjobs'), 'x numjobs')
+ html_table += legend_item.format('generic option',
+ get_text_color(), 'generic option')
+ html_table += legend_item.format('ioengine option',
+ get_text_color('ioengine'), 'ioengine option')
+ html_table += legend_bgcolor_item.format('job', get_shape_color())
+ html_table += legend_bgcolor_item.format(
+ 'execution group', get_cluster_color())
+ html_table += '</table>>'
+ legend = Digraph('html_table')
+ legend.node('legend', shape='none', label=html_table)
+ return legend
+
+
+def fio_to_graphviz(filename, format):
+ """Compute the graphviz graph from the fio file."""
+
+ # Let's read the fio file
+ global fio_file
+ fio_file = configparser.RawConfigParser(
+ allow_no_value=True,
+ default_section="global",
+ inline_comment_prefixes="'#', ';'")
+ fio_file.read(filename)
+
+ # Prepare the main graph object
+ # Let's define the header of the document
+ attrs = {}
+ attrs['labelloc'] = 't'
+ attrs['label'] = get_header().format(
+ get_header_color(), os.path.basename(filename))
+ main_graph = Digraph(engine='dot', graph_attr=attrs, format=format)
+
+ # Let's add a legend
+ main_graph.subgraph(create_legend())
+
+ # By default all jobs are run in parallel and depends on "global"
+ depends_on = fio_file.default_section
+
+ # The previous section is by default the global section
+ previous_section = fio_file.default_section
+
+ current_graph = main_graph
+
+ # The first job will be a new execution group
+ new_execution_group = True
+
+ # Let's iterate on all sections to create links between them
+ for section_name in fio_file.sections():
+ # The current section
+ section = fio_file[section_name]
+
+ # If the current section is waiting the previous job
+ if ('stonewall' or 'wait_for_previous') in section:
+ # let's remember what was the previous job we depend on
+ depends_on = previous_section
+ new_execution_group = True
+ elif 'wait_for' in section:
+ # This sections depends on a named section pointed by wait_for
+ depends_on = section['wait_for']
+ new_execution_group = True
+
+ if new_execution_group:
+ # Let's link the current graph with the main one
+ main_graph.subgraph(current_graph)
+ # Let's create a new graph to represent all the incoming jobs running at the same time
+ current_graph = create_sub_graph(section_name)
+
+ # Let's render the current section in its execution group
+ render_section(current_graph, fio_file, section_name,
+ render_options(fio_file, section_name))
+
+ # Let's trace the link between this job and the one it depends on
+ # If we depend on 'global', we can avoid doing adding an arrow as we don't want to see 'global'
+ if depends_on != fio_file.default_section:
+ current_graph.edge(depends_on, section_name)
+
+ # The current section become the parent of the next one
+ previous_section = section_name
+
+ # We are by default in the same execution group
+ new_execution_group = False
+
+ # The last subgraph isn't rendered yet
+ main_graph.subgraph(current_graph)
+
+ # Let's return the main graphviz object
+ return main_graph
+
+
+def setup_commandline():
+ "Prepare the command line."
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--file', action='store',
+ type=str,
+ required=True,
+ help='the fio file to graph')
+ parser.add_argument('--output', action='store',
+ type=str,
+ help='the output filename')
+ parser.add_argument('--format', action='store',
+ type=str,
+ default='png',
+ help='the output format (see https://graphviz.org/docs/outputs/)')
+ parser.add_argument('--view', action='store_true',
+ default=False,
+ help='view the graph')
+ parser.add_argument('--keep', action='store_true',
+ default=False,
+ help='keep the graphviz script file')
+ parser.add_argument('--config', action='store',
+ type=str,
+ help='the configuration filename')
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ global config_file
+ args = setup_commandline()
+
+ if args.config is None:
+ if os.path.exists('fiograph.conf'):
+ config_filename = 'fiograph.conf'
+ else:
+ config_filename = os.path.join(os.path.dirname(__file__), 'fiograph.conf')
+ if not os.path.exists(config_filename):
+ raise FileNotFoundError("Cannot locate configuration file")
+ else:
+ config_filename = args.config
+ config_file = configparser.RawConfigParser(allow_no_value=True)
+ config_file.read(config_filename)
+
+ temp_filename = uuid.uuid4().hex
+ image_filename = fio_to_graphviz(args.file, args.format).render(temp_filename, view=args.view)
+
+ output_filename_stub = args.file
+ if args.output:
+ output_filename = args.output
+ else:
+ if output_filename_stub.endswith('.fio'):
+ output_filename_stub = output_filename_stub[:-4]
+ output_filename = image_filename.replace(temp_filename, output_filename_stub)
+ if args.view:
+ time.sleep(1)
+ # allow time for the file to be opened before renaming it
+ os.rename(image_filename, output_filename)
+
+ if not args.keep:
+ os.remove(temp_filename)
+ else:
+ os.rename(temp_filename, output_filename_stub + '.gv')
+
+
+main()
f = open(fn, 'r')
p_time = 0
for line in f:
- (time, value, foo, bar) = line.rstrip('\r\n').rsplit(', ')
+ (time, value) = line.rstrip('\r\n').rsplit(', ')[:2]
self.add_sample(p_time, int(time), int(value))
p_time = int(time)
BLK_SIZE=
BLOCK_SIZE=4k
SEQ=-1
-TEMPLATE=/tmp/template.fio
+TEMPLATE=$(mktemp "${TMPDIR:-${TEMP:-/tmp}}/template.fio.XXXXXX") || exit $?
+trap 'rm -f "$TEMPLATE"' EXIT
OUTFILE=
DISKS=
PRINTABLE_DISKS=
one test after another then one disk after another
Disabled by default
-p : Run parallel test
- one test after anoter but all disks at the same time
+ one test after another but all disks at the same time
Enabled by default
-D iodepth : Run with the specified iodepth
Default is $IODEPTH
def test_e2_get_pctiles_highest_pct(self):
fio_v3_bucket_count = 29 * 64
with open(self.fn, 'w') as f:
- # make a empty fio v3 histogram
+ # make an empty fio v3 histogram
buckets = [ 0 for j in range(0, fio_v3_bucket_count) ]
# add one I/O request to last bucket
buckets[-1] = 1
# Index will be used to remember what file was featuring what value
index=index+1
- time, perf, x, block_size = line[1]
+ time, perf, x, block_size = line[1][:4]
if (blk_size == 0):
try:
blk_size=int(block_size)
#We need to adjust the output filename regarding the pattern required by the user
if (pattern_set_by_user == True):
gnuplot_output_filename=pattern
- # As we do have some glob in the pattern, let's make this simpliest
- # We do remove the simpliest parts of the expression to get a clear file name
+ # As we do have some glob in the pattern, let's make this simplest
+ # We do remove the simplest parts of the expression to get a clear file name
gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-')
gnuplot_output_filename=gnuplot_output_filename.replace('*','-')
gnuplot_output_filename=gnuplot_output_filename.replace('--','-')
.TP
.B
Grouped 2D graph
-All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used :
+All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used :
.RS
.IP \(bu 3
raw
The resulting graph helps at understanding trends.
Grouped 2D graph
- All files are plotted in a single image to ease the comparaison. The same rendering options as per the individual 2D graph are used :
+ All files are plotted in a single image to ease the comparison. The same rendering options as per the individual 2D graph are used :
- raw
- smooth
- trend
char *str;
int i;
- for (i = 0; i < ARRAY_SIZE(testcases); ++i) {
+ for (i = 0; i < FIO_ARRAY_SIZE(testcases); ++i) {
p = &testcases[i];
str = num2str(p->num, p->maxlen, p->base, p->pow2, p->unit);
CU_ASSERT_STRING_EQUAL(str, p->expected);
static void log_verify_failure(struct verify_header *hdr, struct vcont *vc)
{
unsigned long long offset;
+ uint32_t len;
+ struct thread_data *td = vc->td;
offset = vc->io_u->verify_offset;
- offset += vc->hdr_num * hdr->len;
+ if (td->o.verify != VERIFY_PATTERN_NO_HDR) {
+ len = hdr->len;
+ offset += (unsigned long long) vc->hdr_num * len;
+ } else {
+ len = vc->io_u->buflen;
+ }
+
log_err("%.8s: verify failed at file %s offset %llu, length %u"
" (requested block: offset=%llu, length=%llu, flags=%x)\n",
- vc->name, vc->io_u->file->file_name, offset, hdr->len,
+ vc->name, vc->io_u->file->file_name, offset, len,
vc->io_u->verify_offset, vc->io_u->buflen, vc->io_u->flags);
if (vc->good_crc && vc->bad_crc) {
(unsigned char)buf[i],
(unsigned char)pattern[mod],
bits);
- log_err("fio: bad pattern block offset %u\n", i);
+ log_err("fio: bad pattern block offset %u\n",
+ i + header_size);
vc->name = "pattern";
log_verify_failure(hdr, vc);
return EILSEQ;
hdr = p;
/*
- * Make rand_seed check pass when have verify_backlog.
+ * Make rand_seed check pass when have verify_backlog or
+ * zone reset frequency for zonemode=zbd.
*/
- if (!td_rw(td) || (td->flags & TD_F_VER_BACKLOG))
+ if (!td_rw(td) || (td->flags & TD_F_VER_BACKLOG) ||
+ td->o.zrf.u.f)
io_u->rand_seed = hdr->rand_seed;
if (td->o.verify != VERIFY_PATTERN_NO_HDR) {
if (td->o.verify == VERIFY_NULL)
return;
- io_u->numberio = td->io_issues[io_u->ddir];
-
fill_pattern_headers(td, io_u, 0, 0);
}
ret = pthread_cond_wait(&td->verify_cond,
&td->io_u_lock);
if (ret) {
- pthread_mutex_unlock(&td->io_u_lock);
break;
}
}
struct all_io_list *get_all_io_list(int save_mask, size_t *sz)
{
struct all_io_list *rep;
- struct thread_data *td;
size_t depth;
void *next;
- int i, nr;
+ int nr;
compiletime_assert(sizeof(struct all_io_list) == 8, "all_io_list");
*/
depth = 0;
nr = 0;
- for_each_td(td, i) {
- if (save_mask != IO_LIST_ALL && (i + 1) != save_mask)
+ for_each_td(td) {
+ if (save_mask != IO_LIST_ALL && (__td_index + 1) != save_mask)
continue;
td->stop_io = 1;
td->flags |= TD_F_VSTATE_SAVED;
depth += (td->o.iodepth * td->o.nr_files);
nr++;
- }
+ } end_for_each();
if (!nr)
return NULL;
*sz = sizeof(*rep);
*sz += nr * sizeof(struct thread_io_list);
*sz += depth * sizeof(struct file_comp);
- rep = malloc(*sz);
- memset(rep, 0, *sz);
+ rep = calloc(1, *sz);
rep->threads = cpu_to_le64((uint64_t) nr);
next = &rep->state[0];
- for_each_td(td, i) {
+ for_each_td(td) {
struct thread_io_list *s = next;
unsigned int comps, index = 0;
- if (save_mask != IO_LIST_ALL && (i + 1) != save_mask)
+ if (save_mask != IO_LIST_ALL && (__td_index + 1) != save_mask)
continue;
comps = fill_file_completions(td, s, &index);
s->no_comps = cpu_to_le64((uint64_t) comps);
- s->depth = cpu_to_le64((uint64_t) td->o.iodepth);
- s->nofiles = cpu_to_le64((uint64_t) td->o.nr_files);
+ s->depth = cpu_to_le32((uint32_t) td->o.iodepth);
+ s->nofiles = cpu_to_le32((uint32_t) td->o.nr_files);
s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]);
- s->index = cpu_to_le64((uint64_t) i);
+ s->index = cpu_to_le64((uint64_t) __td_index);
if (td->random_state.use64) {
s->rand.state64.s[0] = cpu_to_le64(td->random_state.state64.s1);
s->rand.state64.s[1] = cpu_to_le64(td->random_state.state64.s2);
}
snprintf((char *) s->name, sizeof(s->name), "%s", td->o.name);
next = io_list_next(s);
- }
+ } end_for_each();
return rep;
}
else
flags = O_RDONLY;
+#ifdef _WIN32
+ flags |= O_BINARY;
+#endif
+
verify_state_gen_name(out, sizeof(out), name, prefix, num);
fd = open(out, flags, 0644);
sk_out_assign(sw->sk_out);
if (wq->ops.nice) {
- if (nice(wq->ops.nice) < 0) {
+ errno = 0;
+ if (nice(wq->ops.nice) == -1 && errno != 0) {
log_err("workqueue: nice %s\n", strerror(errno));
ret = 1;
}
#include <sys/stat.h>
#include <unistd.h>
+#include "compiler/compiler.h"
#include "os/os.h"
#include "file.h"
#include "fio.h"
#include "pshared.h"
#include "zbd.h"
+static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
+{
+ return (uint64_t)(offset - f->file_offset) < f->io_size;
+}
+
+static inline unsigned int zbd_zone_idx(const struct fio_file *f,
+ struct fio_zone_info *zone)
+{
+ return zone - f->zbd_info->zone_info;
+}
+
+/**
+ * zbd_offset_to_zone_idx - convert an offset into a zone number
+ * @f: file pointer.
+ * @offset: offset in bytes. If this offset is in the first zone_size bytes
+ * past the disk size then the index of the sentinel is returned.
+ */
+static unsigned int zbd_offset_to_zone_idx(const struct fio_file *f,
+ uint64_t offset)
+{
+ uint32_t zone_idx;
+
+ if (f->zbd_info->zone_size_log2 > 0)
+ zone_idx = offset >> f->zbd_info->zone_size_log2;
+ else
+ zone_idx = offset / f->zbd_info->zone_size;
+
+ return min(zone_idx, f->zbd_info->nr_zones);
+}
+
+/**
+ * zbd_zone_end - Return zone end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
+{
+ return (z+1)->start;
+}
+
+/**
+ * zbd_zone_capacity_end - Return zone capacity limit end location
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+{
+ return z->start + z->capacity;
+}
+
+/**
+ * zbd_zone_remainder - Return the number of bytes that are still available for
+ * writing before the zone gets full
+ * @z: zone info pointer.
+ */
+static inline uint64_t zbd_zone_remainder(struct fio_zone_info *z)
+{
+ if (z->wp >= zbd_zone_capacity_end(z))
+ return 0;
+
+ return zbd_zone_capacity_end(z) - z->wp;
+}
+
+/**
+ * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
+ * @f: file pointer.
+ * @z: zone info pointer.
+ * @required: minimum number of bytes that must remain in a zone.
+ *
+ * The caller must hold z->mutex.
+ */
+static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
+ uint64_t required)
+{
+ assert((required & 511) == 0);
+
+ return z->has_wp && required > zbd_zone_remainder(z);
+}
+
+static void zone_lock(struct thread_data *td, const struct fio_file *f,
+ struct fio_zone_info *z)
+{
+#ifndef NDEBUG
+ unsigned int const nz = zbd_zone_idx(f, z);
+ /* A thread should never lock zones outside its working area. */
+ assert(f->min_zone <= nz && nz < f->max_zone);
+ assert(z->has_wp);
+#endif
+
+ /*
+ * Lock the io_u target zone. The zone will be unlocked if io_u offset
+ * is changed or when io_u completes and zbd_put_io() executed.
+ * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
+ * other waiting for zone locks when building an io_u batch, first
+ * only trylock the zone. If the zone is already locked by another job,
+ * process the currently queued I/Os so that I/O progress is made and
+ * zones unlocked.
+ */
+ if (pthread_mutex_trylock(&z->mutex) != 0) {
+ if (!td_ioengine_flagged(td, FIO_SYNCIO))
+ io_u_quiesce(td);
+ pthread_mutex_lock(&z->mutex);
+ }
+}
+
+static inline void zone_unlock(struct fio_zone_info *z)
+{
+ assert(z->has_wp);
+ pthread_mutex_unlock(&z->mutex);
+}
+
+static inline struct fio_zone_info *zbd_get_zone(const struct fio_file *f,
+ unsigned int zone_idx)
+{
+ return &f->zbd_info->zone_info[zone_idx];
+}
+
+static inline struct fio_zone_info *
+zbd_offset_to_zone(const struct fio_file *f, uint64_t offset)
+{
+ return zbd_get_zone(f, zbd_offset_to_zone_idx(f, offset));
+}
+
+static bool accounting_vdb(struct thread_data *td, const struct fio_file *f)
+{
+ return td->o.zrt.u.f && td_write(td);
+}
+
/**
* zbd_get_zoned_model - Get a device zoned model
* @td: FIO thread data
* @f: FIO file for which to get model information
*/
-int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
- enum zbd_zoned_model *model)
+static int zbd_get_zoned_model(struct thread_data *td, struct fio_file *f,
+ enum zbd_zoned_model *model)
{
int ret;
+ if (f->filetype == FIO_TYPE_PIPE) {
+ log_err("zonemode=zbd does not support pipes\n");
+ return -EINVAL;
+ }
+
+ /* If regular file, always emulate zones inside the file. */
+ if (f->filetype == FIO_TYPE_FILE) {
+ *model = ZBD_NONE;
+ return 0;
+ }
+
if (td->io_ops && td->io_ops->get_zoned_model)
ret = td->io_ops->get_zoned_model(td, f, model);
else
* upon failure. If the zone report is empty, always assume an error (device
* problem) and return -EIO.
*/
-int zbd_report_zones(struct thread_data *td, struct fio_file *f,
- uint64_t offset, struct zbd_zone *zones,
- unsigned int nr_zones)
+static int zbd_report_zones(struct thread_data *td, struct fio_file *f,
+ uint64_t offset, struct zbd_zone *zones,
+ unsigned int nr_zones)
{
int ret;
ret = blkzoned_report_zones(td, f, offset, zones, nr_zones);
if (ret < 0) {
td_verror(td, errno, "report zones failed");
- log_err("%s: report zones from sector %llu failed (%d).\n",
- f->file_name, (unsigned long long)offset >> 9, errno);
+ log_err("%s: report zones from sector %"PRIu64" failed (nr_zones=%d; errno=%d).\n",
+ f->file_name, offset >> 9, nr_zones, errno);
} else if (ret == 0) {
td_verror(td, errno, "Empty zone report");
- log_err("%s: report zones from sector %llu is empty.\n",
- f->file_name, (unsigned long long)offset >> 9);
+ log_err("%s: report zones from sector %"PRIu64" is empty.\n",
+ f->file_name, offset >> 9);
ret = -EIO;
}
* Reset the write pointer of all zones in the range @offset...@offset+@length.
* Returns 0 upon success and a negative error code upon failure.
*/
-int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
- uint64_t offset, uint64_t length)
+static int zbd_reset_wp(struct thread_data *td, struct fio_file *f,
+ uint64_t offset, uint64_t length)
{
int ret;
ret = blkzoned_reset_wp(td, f, offset, length);
if (ret < 0) {
td_verror(td, errno, "resetting wp failed");
- log_err("%s: resetting wp for %llu sectors at sector %llu failed (%d).\n",
- f->file_name, (unsigned long long)length >> 9,
- (unsigned long long)offset >> 9, errno);
+ log_err("%s: resetting wp for %"PRIu64" sectors at sector %"PRIu64" failed (%d).\n",
+ f->file_name, length >> 9, offset >> 9, errno);
}
return ret;
}
/**
- * zbd_zone_idx - convert an offset into a zone number
- * @f: file pointer.
- * @offset: offset in bytes. If this offset is in the first zone_size bytes
- * past the disk size then the index of the sentinel is returned.
+ * __zbd_reset_zone - reset the write pointer of a single zone
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to reset.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ *
+ * The caller must hold z->mutex.
*/
-static uint32_t zbd_zone_idx(const struct fio_file *f, uint64_t offset)
+static int __zbd_reset_zone(struct thread_data *td, struct fio_file *f,
+ struct fio_zone_info *z)
{
- uint32_t zone_idx;
+ uint64_t offset = z->start;
+ uint64_t length = (z+1)->start - offset;
+ uint64_t data_in_zone = z->wp - z->start;
+ int ret = 0;
- if (f->zbd_info->zone_size_log2 > 0)
- zone_idx = offset >> f->zbd_info->zone_size_log2;
- else
- zone_idx = offset / f->zbd_info->zone_size;
+ if (!data_in_zone)
+ return 0;
- return min(zone_idx, f->zbd_info->nr_zones);
+ assert(is_valid_offset(f, offset + length - 1));
+
+ dprint(FD_ZBD, "%s: resetting wp of zone %u.\n",
+ f->file_name, zbd_zone_idx(f, z));
+
+ switch (f->zbd_info->model) {
+ case ZBD_HOST_AWARE:
+ case ZBD_HOST_MANAGED:
+ ret = zbd_reset_wp(td, f, offset, length);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ break;
+ }
+
+ if (accounting_vdb(td, f)) {
+ pthread_mutex_lock(&f->zbd_info->mutex);
+ f->zbd_info->wp_valid_data_bytes -= data_in_zone;
+ pthread_mutex_unlock(&f->zbd_info->mutex);
+ }
+
+ z->wp = z->start;
+
+ td->ts.nr_zone_resets++;
+
+ return ret;
}
/**
- * zbd_zone_swr - Test whether a zone requires sequential writes
- * @z: zone info pointer.
+ * zbd_write_zone_put - Remove a zone from the write target zones array.
+ * @td: FIO thread data.
+ * @f: FIO file that has the write zones array to remove.
+ * @z: Zone to remove from the write target zones array.
+ *
+ * The caller must hold f->zbd_info->mutex.
+ */
+static void zbd_write_zone_put(struct thread_data *td, const struct fio_file *f,
+ struct fio_zone_info *z)
+{
+ uint32_t zi;
+
+ if (!z->write)
+ return;
+
+ for (zi = 0; zi < f->zbd_info->num_write_zones; zi++) {
+ if (zbd_get_zone(f, f->zbd_info->write_zones[zi]) == z)
+ break;
+ }
+ if (zi == f->zbd_info->num_write_zones)
+ return;
+
+ dprint(FD_ZBD, "%s: removing zone %u from write zone array\n",
+ f->file_name, zbd_zone_idx(f, z));
+
+ memmove(f->zbd_info->write_zones + zi,
+ f->zbd_info->write_zones + zi + 1,
+ (ZBD_MAX_WRITE_ZONES - (zi + 1)) *
+ sizeof(f->zbd_info->write_zones[0]));
+
+ f->zbd_info->num_write_zones--;
+ td->num_write_zones--;
+ z->write = 0;
+}
+
+/**
+ * zbd_reset_zone - reset the write pointer of a single zone and remove the zone
+ * from the array of write zones.
+ * @td: FIO thread data.
+ * @f: FIO file associated with the disk for which to reset a write pointer.
+ * @z: Zone to reset.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
+ *
+ * The caller must hold z->mutex.
*/
-static inline bool zbd_zone_swr(struct fio_zone_info *z)
+static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
+ struct fio_zone_info *z)
{
- return z->type == ZBD_ZONE_TYPE_SWR;
+ int ret;
+
+ ret = __zbd_reset_zone(td, f, z);
+ if (ret)
+ return ret;
+
+ pthread_mutex_lock(&f->zbd_info->mutex);
+ zbd_write_zone_put(td, f, z);
+ pthread_mutex_unlock(&f->zbd_info->mutex);
+ return 0;
}
/**
- * zbd_zone_end - Return zone end location
- * @z: zone info pointer.
+ * zbd_finish_zone - finish the specified zone
+ * @td: FIO thread data.
+ * @f: FIO file for which to finish a zone
+ * @z: Zone to finish.
+ *
+ * Finish the zone @z with open or closed status.
*/
-static inline uint64_t zbd_zone_end(const struct fio_zone_info *z)
+static int zbd_finish_zone(struct thread_data *td, struct fio_file *f,
+ struct fio_zone_info *z)
{
- return (z+1)->start;
+ uint64_t offset = z->start;
+ uint64_t length = f->zbd_info->zone_size;
+ int ret = 0;
+
+ switch (f->zbd_info->model) {
+ case ZBD_HOST_AWARE:
+ case ZBD_HOST_MANAGED:
+ if (td->io_ops && td->io_ops->finish_zone)
+ ret = td->io_ops->finish_zone(td, f, offset, length);
+ else
+ ret = blkzoned_finish_zone(td, f, offset, length);
+ break;
+ default:
+ break;
+ }
+
+ if (ret < 0) {
+ td_verror(td, errno, "finish zone failed");
+ log_err("%s: finish zone at sector %"PRIu64" failed (%d).\n",
+ f->file_name, offset >> 9, errno);
+ } else {
+ z->wp = (z+1)->start;
+ }
+
+ return ret;
}
/**
- * zbd_zone_capacity_end - Return zone capacity limit end location
- * @z: zone info pointer.
+ * zbd_reset_zones - Reset a range of zones.
+ * @td: fio thread data.
+ * @f: fio file for which to reset zones
+ * @zb: first zone to reset.
+ * @ze: first zone not to reset.
+ *
+ * Returns 0 upon success and 1 upon failure.
*/
-static inline uint64_t zbd_zone_capacity_end(const struct fio_zone_info *z)
+static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
+ struct fio_zone_info *const zb,
+ struct fio_zone_info *const ze)
{
- return z->start + z->capacity;
+ struct fio_zone_info *z;
+ const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+ int res = 0;
+
+ if (fio_unlikely(0 == min_bs))
+ return 1;
+
+ dprint(FD_ZBD, "%s: examining zones %u .. %u\n",
+ f->file_name, zbd_zone_idx(f, zb), zbd_zone_idx(f, ze));
+
+ for (z = zb; z < ze; z++) {
+ if (!z->has_wp)
+ continue;
+
+ zone_lock(td, f, z);
+
+ if (z->wp != z->start) {
+ dprint(FD_ZBD, "%s: resetting zone %u\n",
+ f->file_name, zbd_zone_idx(f, z));
+ if (zbd_reset_zone(td, f, z) < 0)
+ res = 1;
+ }
+
+ zone_unlock(z);
+ }
+
+ return res;
}
/**
- * zbd_zone_full - verify whether a minimum number of bytes remain in a zone
- * @f: file pointer.
- * @z: zone info pointer.
- * @required: minimum number of bytes that must remain in a zone.
+ * zbd_get_max_open_zones - Get the maximum number of open zones
+ * @td: FIO thread data
+ * @f: FIO file for which to get max open zones
+ * @max_open_zones: Upon success, result will be stored here.
*
- * The caller must hold z->mutex.
+ * A @max_open_zones value set to zero means no limit.
+ *
+ * Returns 0 upon success and a negative error code upon failure.
*/
-static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
- uint64_t required)
+static int zbd_get_max_open_zones(struct thread_data *td, struct fio_file *f,
+ unsigned int *max_open_zones)
{
- assert((required & 511) == 0);
+ int ret;
+
+ if (td->io_ops && td->io_ops->get_max_open_zones)
+ ret = td->io_ops->get_max_open_zones(td, f, max_open_zones);
+ else
+ ret = blkzoned_get_max_open_zones(td, f, max_open_zones);
+ if (ret < 0) {
+ td_verror(td, errno, "get max open zones failed");
+ log_err("%s: get max open zones failed (%d).\n",
+ f->file_name, errno);
+ }
- return zbd_zone_swr(z) &&
- z->wp + required > zbd_zone_capacity_end(z);
+ return ret;
}
-static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zone_info *z)
+/**
+ * zbd_get_max_active_zones - Get the maximum number of active zones
+ * @td: FIO thread data
+ * @f: FIO file for which to get max active zones
+ *
+ * Returns max_active_zones limit value of the target file if it is available.
+ * Otherwise return zero, which means no limit.
+ */
+static unsigned int zbd_get_max_active_zones(struct thread_data *td,
+ struct fio_file *f)
{
- struct zoned_block_device_info *zbd = f->zbd_info;
- uint32_t nz = z - zbd->zone_info;
+ unsigned int max_active_zones;
+ int ret;
- /* A thread should never lock zones outside its working area. */
- assert(f->min_zone <= nz && nz < f->max_zone);
+ if (td->io_ops && td->io_ops->get_max_active_zones)
+ ret = td->io_ops->get_max_active_zones(td, f,
+ &max_active_zones);
+ else
+ ret = blkzoned_get_max_active_zones(td, f, &max_active_zones);
+ if (ret < 0) {
+ dprint(FD_ZBD, "%s: max_active_zones is not available\n",
+ f->file_name);
+ return 0;
+ }
+
+ return max_active_zones;
+}
+
+/**
+ * __zbd_write_zone_get - Add a zone to the array of write zones.
+ * @td: fio thread data.
+ * @f: fio file that has the write zones array to add.
+ * @z: Zone to add.
+ *
+ * Does the same operation as zbd_write_zone_get(), except that it adds zone
+ * @z to the write target zones array even when the zone does not have enough
+ * remaining space to write one block.
+ */
+static bool __zbd_write_zone_get(struct thread_data *td,
+ const struct fio_file *f,
+ struct fio_zone_info *z)
+{
+ struct zoned_block_device_info *zbdi = f->zbd_info;
+ uint32_t zone_idx = zbd_zone_idx(f, z);
+ bool res = true;
+
+ if (z->cond == ZBD_ZONE_COND_OFFLINE)
+ return false;
/*
- * Lock the io_u target zone. The zone will be unlocked if io_u offset
- * is changed or when io_u completes and zbd_put_io() executed.
- * To avoid multiple jobs doing asynchronous I/Os from deadlocking each
- * other waiting for zone locks when building an io_u batch, first
- * only trylock the zone. If the zone is already locked by another job,
- * process the currently queued I/Os so that I/O progress is made and
- * zones unlocked.
+ * Skip full zones with data verification enabled because resetting a
+ * zone causes data loss and hence causes verification to fail.
*/
- if (pthread_mutex_trylock(&z->mutex) != 0) {
- if (!td_ioengine_flagged(td, FIO_SYNCIO))
- io_u_quiesce(td);
- pthread_mutex_lock(&z->mutex);
+ if (td->o.verify != VERIFY_NONE && zbd_zone_remainder(z) == 0)
+ return false;
+
+ /*
+ * zbdi->max_write_zones == 0 means that there is no limit on the
+ * maximum number of write target zones. In this case, do not track write
+ * target zones in zbdi->write_zones array.
+ */
+ if (!zbdi->max_write_zones)
+ return true;
+
+ pthread_mutex_lock(&zbdi->mutex);
+
+ if (z->write) {
+ /*
+ * If the zone is going to be completely filled by writes
+ * already in-flight, handle it as a full zone instead of a
+ * write target zone.
+ */
+ if (!zbd_zone_remainder(z))
+ res = false;
+ goto out;
}
+
+ res = false;
+ /* Zero means no limit */
+ if (td->o.job_max_open_zones > 0 &&
+ td->num_write_zones >= td->o.job_max_open_zones)
+ goto out;
+ if (zbdi->num_write_zones >= zbdi->max_write_zones)
+ goto out;
+
+ dprint(FD_ZBD, "%s: adding zone %u to write zone array\n",
+ f->file_name, zone_idx);
+
+ zbdi->write_zones[zbdi->num_write_zones++] = zone_idx;
+ td->num_write_zones++;
+ z->write = 1;
+ res = true;
+
+out:
+ pthread_mutex_unlock(&zbdi->mutex);
+ return res;
}
-static bool is_valid_offset(const struct fio_file *f, uint64_t offset)
+/**
+ * zbd_write_zone_get - Add a zone to the array of write zones.
+ * @td: fio thread data.
+ * @f: fio file that has the open zones to add.
+ * @z: Zone to add.
+ *
+ * Add a ZBD zone to write target zones array, if it is not yet added. Returns
+ * true if either the zone was already added or if the zone was successfully
+ * added to the array without exceeding the maximum number of write zones.
+ * Returns false if the zone was not already added and addition of the zone
+ * would cause the zone limit to be exceeded.
+ */
+static bool zbd_write_zone_get(struct thread_data *td, const struct fio_file *f,
+ struct fio_zone_info *z)
{
- return (uint64_t)(offset - f->file_offset) < f->io_size;
+ const uint64_t min_bs = td->o.min_bs[DDIR_WRITE];
+
+ /*
+ * Skip full zones with data verification enabled because resetting a
+ * zone causes data loss and hence causes verification to fail.
+ */
+ if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
+ return false;
+
+ return __zbd_write_zone_get(td, f, z);
}
-/* Verify whether direct I/O is used for all host-managed zoned drives. */
+/* Verify whether direct I/O is used for all host-managed zoned block drives. */
static bool zbd_using_direct_io(void)
{
- struct thread_data *td;
struct fio_file *f;
- int i, j;
+ int j;
- for_each_td(td, i) {
+ for_each_td(td) {
if (td->o.odirect || !(td->o.td_ddir & TD_DDIR_WRITE))
continue;
for_each_file(td, f, j) {
- if (f->zbd_info &&
+ if (f->zbd_info && f->filetype == FIO_TYPE_BLOCK &&
f->zbd_info->model == ZBD_HOST_MANAGED)
return false;
}
- }
+ } end_for_each();
return true;
}
/* Whether or not the I/O range for f includes one or more sequential zones */
-static bool zbd_is_seq_job(struct fio_file *f)
+static bool zbd_is_seq_job(const struct fio_file *f)
{
uint32_t zone_idx, zone_idx_b, zone_idx_e;
assert(f->zbd_info);
+
if (f->io_size == 0)
return false;
- zone_idx_b = zbd_zone_idx(f, f->file_offset);
- zone_idx_e = zbd_zone_idx(f, f->file_offset + f->io_size - 1);
+
+ zone_idx_b = zbd_offset_to_zone_idx(f, f->file_offset);
+ zone_idx_e =
+ zbd_offset_to_zone_idx(f, f->file_offset + f->io_size - 1);
for (zone_idx = zone_idx_b; zone_idx <= zone_idx_e; zone_idx++)
- if (zbd_zone_swr(&f->zbd_info->zone_info[zone_idx]))
+ if (zbd_get_zone(f, zone_idx)->has_wp)
return true;
return false;
}
/*
- * Verify whether offset and size parameters are aligned with zone boundaries.
+ * Verify whether the file offset and size parameters are aligned with zone
+ * boundaries. For jobs that are not read-only, an unaligned file offset is
+ * rounded up to the next zone start. An unaligned end shrinks io_size.
*/
-static bool zbd_verify_sizes(void)
+static bool zbd_zone_align_file_sizes(struct thread_data *td,
+ struct fio_file *f)
{
const struct fio_zone_info *z;
- struct thread_data *td;
- struct fio_file *f;
uint64_t new_offset, new_end;
- uint32_t zone_idx;
- int i, j;
- for_each_td(td, i) {
- for_each_file(td, f, j) {
- if (!f->zbd_info)
- continue;
- if (f->file_offset >= f->real_file_size)
- continue;
- if (!zbd_is_seq_job(f))
- continue;
+ if (!f->zbd_info)
+ return true;
+ if (f->file_offset >= f->real_file_size)
+ return true;
+ if (!zbd_is_seq_job(f))
+ return true;
- if (!td->o.zone_size) {
- td->o.zone_size = f->zbd_info->zone_size;
- if (!td->o.zone_size) {
- log_err("%s: invalid 0 zone size\n",
- f->file_name);
- return false;
- }
- } else if (td->o.zone_size != f->zbd_info->zone_size) {
- log_err("%s: job parameter zonesize %llu does not match disk zone size %llu.\n",
- f->file_name, (unsigned long long) td->o.zone_size,
- (unsigned long long) f->zbd_info->zone_size);
- return false;
- }
+ if (!td->o.zone_size) {
+ td->o.zone_size = f->zbd_info->zone_size;
+ if (!td->o.zone_size) {
+ log_err("%s: invalid 0 zone size\n",
+ f->file_name);
+ return false;
+ }
+ } else if (td->o.zone_size != f->zbd_info->zone_size) {
+ log_err("%s: zonesize %llu does not match the device zone size %"PRIu64".\n",
+ f->file_name, td->o.zone_size,
+ f->zbd_info->zone_size);
+ return false;
+ }
- if (td->o.zone_skip &&
- (td->o.zone_skip < td->o.zone_size ||
- td->o.zone_skip % td->o.zone_size)) {
- log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
- f->file_name, (unsigned long long) td->o.zone_skip,
- (unsigned long long) td->o.zone_size);
- return false;
- }
+ if (td->o.zone_skip % td->o.zone_size) {
+ log_err("%s: zoneskip %llu is not a multiple of the device zone size %llu.\n",
+ f->file_name, td->o.zone_skip,
+ td->o.zone_size);
+ return false;
+ }
- zone_idx = zbd_zone_idx(f, f->file_offset);
- z = &f->zbd_info->zone_info[zone_idx];
- if ((f->file_offset != z->start) &&
- (td->o.td_ddir != TD_DDIR_READ)) {
- new_offset = zbd_zone_end(z);
- if (new_offset >= f->file_offset + f->io_size) {
- log_info("%s: io_size must be at least one zone\n",
- f->file_name);
- return false;
- }
- log_info("%s: rounded up offset from %llu to %llu\n",
- f->file_name, (unsigned long long) f->file_offset,
- (unsigned long long) new_offset);
- f->io_size -= (new_offset - f->file_offset);
- f->file_offset = new_offset;
- }
- zone_idx = zbd_zone_idx(f, f->file_offset + f->io_size);
- z = &f->zbd_info->zone_info[zone_idx];
- new_end = z->start;
- if ((td->o.td_ddir != TD_DDIR_READ) &&
- (f->file_offset + f->io_size != new_end)) {
- if (new_end <= f->file_offset) {
- log_info("%s: io_size must be at least one zone\n",
- f->file_name);
- return false;
- }
- log_info("%s: rounded down io_size from %llu to %llu\n",
- f->file_name, (unsigned long long) f->io_size,
- (unsigned long long) new_end - f->file_offset);
- f->io_size = new_end - f->file_offset;
- }
+ if (td->o.td_ddir == TD_DDIR_READ) {
+ z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
+ new_end = z->start;
+ if (f->file_offset + f->io_size > new_end) {
+ log_info("%s: rounded io_size from %"PRIu64" to %"PRIu64"\n",
+ f->file_name, f->io_size,
+ new_end - f->file_offset);
+ f->io_size = new_end - f->file_offset;
+ }
+ return true;
+ }
- f->min_zone = zbd_zone_idx(f, f->file_offset);
- f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
- assert(f->min_zone < f->max_zone);
+ z = zbd_offset_to_zone(f, f->file_offset);
+ if (f->file_offset != z->start) {
+ new_offset = zbd_zone_end(z);
+ if (new_offset >= f->file_offset + f->io_size) {
+ log_info("%s: io_size must be at least one zone\n",
+ f->file_name);
+ return false;
}
+ log_info("%s: rounded up offset from %"PRIu64" to %"PRIu64"\n",
+ f->file_name, f->file_offset,
+ new_offset);
+ f->io_size -= (new_offset - f->file_offset);
+ f->file_offset = new_offset;
+ }
+
+ z = zbd_offset_to_zone(f, f->file_offset + f->io_size);
+ new_end = z->start;
+ if (f->file_offset + f->io_size != new_end) {
+ if (new_end <= f->file_offset) {
+ log_info("%s: io_size must be at least one zone\n",
+ f->file_name);
+ return false;
+ }
+ log_info("%s: rounded down io_size from %"PRIu64" to %"PRIu64"\n",
+ f->file_name, f->io_size,
+ new_end - f->file_offset);
+ f->io_size = new_end - f->file_offset;
}
return true;
}
-static bool zbd_verify_bs(void)
+/*
+ * Verify whether offset and size parameters are aligned with zone boundaries.
+ */
+static bool zbd_verify_sizes(void)
{
- struct thread_data *td;
struct fio_file *f;
- uint32_t zone_size;
- int i, j, k;
+ int j;
+
+ for_each_td(td) {
+ for_each_file(td, f, j) {
+ if (!zbd_zone_align_file_sizes(td, f))
+ return false;
+ }
+ } end_for_each();
- for_each_td(td, i) {
+ return true;
+}
+
+static bool zbd_verify_bs(void)
+{
+ struct fio_file *f;
+ int j;
+
+ for_each_td(td) {
+ if (td_trim(td) &&
+ (td->o.min_bs[DDIR_TRIM] != td->o.max_bs[DDIR_TRIM] ||
+ td->o.bssplit_nr[DDIR_TRIM])) {
+ log_info("bsrange and bssplit are not allowed for trim with zonemode=zbd\n");
+ return false;
+ }
for_each_file(td, f, j) {
+ uint64_t zone_size;
+
if (!f->zbd_info)
continue;
+
zone_size = f->zbd_info->zone_size;
- for (k = 0; k < ARRAY_SIZE(td->o.bs); k++) {
- if (td->o.verify != VERIFY_NONE &&
- zone_size % td->o.bs[k] != 0) {
- log_info("%s: block size %llu is not a divisor of the zone size %d\n",
- f->file_name, td->o.bs[k],
- zone_size);
- return false;
- }
+ if (td_trim(td) && td->o.bs[DDIR_TRIM] != zone_size) {
+ log_info("%s: trim block size %llu is not the zone size %"PRIu64"\n",
+ f->file_name, td->o.bs[DDIR_TRIM],
+ zone_size);
+ return false;
}
}
- }
+ } end_for_each();
return true;
}
int i;
if (zone_size == 0) {
- log_err("%s: Specifying the zone size is mandatory for regular block devices with --zonemode=zbd\n\n",
+ log_err("%s: Specifying the zone size is mandatory for regular file/block device with --zonemode=zbd\n\n",
f->file_name);
return 1;
}
if (zone_capacity > zone_size) {
log_err("%s: job parameter zonecapacity %llu is larger than zone size %llu\n",
- f->file_name, (unsigned long long) td->o.zone_capacity,
- (unsigned long long) td->o.zone_size);
+ f->file_name, td->o.zone_capacity, td->o.zone_size);
return 1;
}
+ if (f->real_file_size < zone_size) {
+ log_err("%s: file/device size %"PRIu64" is smaller than zone size %"PRIu64"\n",
+ f->file_name, f->real_file_size, zone_size);
+ return -EINVAL;
+ }
+
nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
zbd_info = scalloc(1, sizeof(*zbd_info) +
(nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
p->type = ZBD_ZONE_TYPE_SWR;
p->cond = ZBD_ZONE_COND_EMPTY;
p->capacity = zone_capacity;
+ p->has_wp = 1;
}
/* a sentinel */
p->start = nr_zones * zone_size;
int nr_zones, nrz;
struct zbd_zone *zones, *z;
struct fio_zone_info *p;
- uint64_t zone_size, offset;
+ uint64_t zone_size, offset, capacity;
+ bool same_zone_cap = true;
struct zoned_block_device_info *zbd_info = NULL;
- int i, j, ret = 0;
+ int i, j, ret = -ENOMEM;
zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
if (!zones)
}
zone_size = zones[0].len;
+ capacity = zones[0].capacity;
nr_zones = (f->real_file_size + zone_size - 1) / zone_size;
if (td->o.zone_size == 0) {
td->o.zone_size = zone_size;
} else if (td->o.zone_size != zone_size) {
- log_err("fio: %s job parameter zonesize %llu does not match disk zone size %llu.\n",
- f->file_name, (unsigned long long) td->o.zone_size,
- (unsigned long long) zone_size);
+ log_err("fio: %s job parameter zonesize %llu does not match disk zone size %"PRIu64".\n",
+ f->file_name, td->o.zone_size, zone_size);
ret = -EINVAL;
goto out;
}
- dprint(FD_ZBD, "Device %s has %d zones of size %llu KB\n", f->file_name,
- nr_zones, (unsigned long long) zone_size / 1024);
+ dprint(FD_ZBD, "Device %s has %d zones of size %"PRIu64" KB\n",
+ f->file_name, nr_zones, zone_size / 1024);
zbd_info = scalloc(1, sizeof(*zbd_info) +
(nr_zones + 1) * sizeof(zbd_info->zone_info[0]));
- ret = -ENOMEM;
if (!zbd_info)
goto out;
mutex_init_pshared(&zbd_info->mutex);
PTHREAD_MUTEX_RECURSIVE);
p->start = z->start;
p->capacity = z->capacity;
+ if (capacity != z->capacity)
+ same_zone_cap = false;
+
switch (z->cond) {
case ZBD_ZONE_COND_NOT_WP:
case ZBD_ZONE_COND_FULL:
p->wp = z->wp;
break;
}
+
+ switch (z->type) {
+ case ZBD_ZONE_TYPE_SWR:
+ p->has_wp = 1;
+ break;
+ default:
+ p->has_wp = 0;
+ }
p->type = z->type;
p->cond = z->cond;
+
if (j > 0 && p->start != p[-1].start + zone_size) {
- log_info("%s: invalid zone data\n",
- f->file_name);
+ log_info("%s: invalid zone data [%d:%d]: %"PRIu64" + %"PRIu64" != %"PRIu64"\n",
+ f->file_name, j, i,
+ p[-1].start, zone_size, p->start);
ret = -EINVAL;
goto out;
}
offset = z->start + z->len;
if (j >= nr_zones)
break;
- nrz = zbd_report_zones(td, f, offset,
- zones, ZBD_REPORT_MAX_ZONES);
+
+ nrz = zbd_report_zones(td, f, offset, zones,
+ min((uint32_t)(nr_zones - j),
+ ZBD_REPORT_MAX_ZONES));
if (nrz < 0) {
ret = nrz;
- log_info("fio: report zones (offset %llu) failed for %s (%d).\n",
- (unsigned long long)offset,
- f->file_name, -ret);
+ log_info("fio: report zones (offset %"PRIu64") failed for %s (%d).\n",
+ offset, f->file_name, -ret);
goto out;
}
}
f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
ilog2(zone_size) : 0;
f->zbd_info->nr_zones = nr_zones;
+ f->zbd_info->max_active_zones = zbd_get_max_active_zones(td, f);
+
+ if (same_zone_cap)
+ dprint(FD_ZBD, "Zone capacity = %"PRIu64" KB\n",
+ capacity / 1024);
+
zbd_info = NULL;
ret = 0;
return ret;
}
+static int zbd_set_max_write_zones(struct thread_data *td, struct fio_file *f)
+{
+ struct zoned_block_device_info *zbd = f->zbd_info;
+ unsigned int max_open_zones;
+ int ret;
+
+ if (zbd->model != ZBD_HOST_MANAGED || td->o.ignore_zone_limits) {
+ /* Only host-managed devices have a max open limit */
+ zbd->max_write_zones = td->o.max_open_zones;
+ goto out;
+ }
+
+ /* If host-managed, get the max open limit */
+ ret = zbd_get_max_open_zones(td, f, &max_open_zones);
+ if (ret)
+ return ret;
+
+ if (!max_open_zones) {
+ /* No device limit */
+ zbd->max_write_zones = td->o.max_open_zones;
+ } else if (!td->o.max_open_zones) {
+ /* No user limit. Set limit to device limit */
+ zbd->max_write_zones = max_open_zones;
+ } else if (td->o.max_open_zones <= max_open_zones) {
+ /* Both user limit and dev limit. User limit not too large */
+ zbd->max_write_zones = td->o.max_open_zones;
+ } else {
+ /* Both user limit and dev limit. User limit too large */
+ td_verror(td, EINVAL,
+ "Specified --max_open_zones is too large");
+ log_err("Specified --max_open_zones (%d) is larger than max (%u)\n",
+ td->o.max_open_zones, max_open_zones);
+ return -EINVAL;
+ }
+
+out:
+ /* Ensure that the limit is not larger than FIO's internal limit */
+ if (zbd->max_write_zones > ZBD_MAX_WRITE_ZONES) {
+ td_verror(td, EINVAL, "'max_open_zones' value is too large");
+ log_err("'max_open_zones' value is larger than %u\n",
+ ZBD_MAX_WRITE_ZONES);
+ return -EINVAL;
+ }
+
+ dprint(FD_ZBD, "%s: using max write zones limit: %"PRIu32"\n",
+ f->file_name, zbd->max_write_zones);
+
+ return 0;
+}
+
/*
* Allocate zone information and store it into f->zbd_info if zonemode=zbd.
*
return ret;
switch (zbd_model) {
- case ZBD_IGNORE:
- return 0;
case ZBD_HOST_AWARE:
case ZBD_HOST_MANAGED:
ret = parse_zone_info(td, f);
+ if (ret)
+ return ret;
break;
case ZBD_NONE:
ret = init_zone_info(td, f);
+ if (ret)
+ return ret;
break;
default:
td_verror(td, EINVAL, "Unsupported zoned model");
return -EINVAL;
}
- if (ret == 0) {
- f->zbd_info->model = zbd_model;
- f->zbd_info->max_open_zones = td->o.max_open_zones;
+ assert(f->zbd_info);
+ f->zbd_info->model = zbd_model;
+
+ ret = zbd_set_max_write_zones(td, f);
+ if (ret) {
+ zbd_free_zone_info(f);
+ return ret;
}
- return ret;
+
+ return 0;
}
void zbd_free_zone_info(struct fio_file *f)
*/
static int zbd_init_zone_info(struct thread_data *td, struct fio_file *file)
{
- struct thread_data *td2;
struct fio_file *f2;
- int i, j, ret;
+ int j, ret;
- for_each_td(td2, i) {
+ for_each_td(td2) {
for_each_file(td2, f2, j) {
if (td2 == td && f2 == file)
continue;
file->zbd_info->refcount++;
return 0;
}
- }
+ } end_for_each();
ret = zbd_create_zone_info(td, file);
if (ret < 0)
td_verror(td, -ret, "zbd_create_zone_info() failed");
+
return ret;
}
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
- uint32_t zone_idx);
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
- struct fio_zone_info *z);
-
-int zbd_setup_files(struct thread_data *td)
+int zbd_init_files(struct thread_data *td)
{
struct fio_file *f;
int i;
return 1;
}
- if (!zbd_using_direct_io()) {
- log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
- return 1;
- }
-
- if (!zbd_verify_sizes())
- return 1;
+ return 0;
+}
- if (!zbd_verify_bs())
- return 1;
+void zbd_recalc_options_with_zone_granularity(struct thread_data *td)
+{
+ struct fio_file *f;
+ int i;
for_each_file(td, f, i) {
struct zoned_block_device_info *zbd = f->zbd_info;
- struct fio_zone_info *z;
- int zi;
+ uint64_t zone_size;
- if (!zbd)
+ /* zonemode=strided doesn't get per-file zone size. */
+ zone_size = zbd ? zbd->zone_size : td->o.zone_size;
+ if (zone_size == 0)
continue;
- zbd->max_open_zones = zbd->max_open_zones ?: ZBD_MAX_OPEN_ZONES;
-
- if (td->o.max_open_zones > 0 &&
- zbd->max_open_zones != td->o.max_open_zones) {
- log_err("Different 'max_open_zones' values\n");
- return 1;
- }
- if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
- log_err("'max_open_zones' value is limited by %u\n", ZBD_MAX_OPEN_ZONES);
- return 1;
- }
-
- for (zi = f->min_zone; zi < f->max_zone; zi++) {
- z = &zbd->zone_info[zi];
- if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
- z->cond != ZBD_ZONE_COND_EXP_OPEN)
- continue;
- if (zbd_open_zone(td, f, zi))
- continue;
- /*
- * If the number of open zones exceeds specified limits,
- * reset all extra open zones.
- */
- if (zbd_reset_zone(td, f, z) < 0) {
- log_err("Failed to reest zone %d\n", zi);
- return 1;
- }
- }
+ if (td->o.size_nz > 0)
+ td->o.size = td->o.size_nz * zone_size;
+ if (td->o.io_size_nz > 0)
+ td->o.io_size = td->o.io_size_nz * zone_size;
+ if (td->o.start_offset_nz > 0)
+ td->o.start_offset = td->o.start_offset_nz * zone_size;
+ if (td->o.offset_increment_nz > 0)
+ td->o.offset_increment =
+ td->o.offset_increment_nz * zone_size;
+ if (td->o.zone_skip_nz > 0)
+ td->o.zone_skip = td->o.zone_skip_nz * zone_size;
}
-
- return 0;
}
-static unsigned int zbd_zone_nr(struct zoned_block_device_info *zbd_info,
- struct fio_zone_info *zone)
+static uint64_t zbd_verify_and_set_vdb(struct thread_data *td,
+ const struct fio_file *f)
{
- return zone - zbd_info->zone_info;
-}
+ struct fio_zone_info *zb, *ze, *z;
+ uint64_t wp_vdb = 0;
+ struct zoned_block_device_info *zbdi = f->zbd_info;
-/**
- * zbd_reset_zone - reset the write pointer of a single zone
- * @td: FIO thread data.
- * @f: FIO file associated with the disk for which to reset a write pointer.
- * @z: Zone to reset.
- *
- * Returns 0 upon success and a negative error code upon failure.
- *
- * The caller must hold z->mutex.
- */
-static int zbd_reset_zone(struct thread_data *td, struct fio_file *f,
- struct fio_zone_info *z)
-{
- uint64_t offset = z->start;
- uint64_t length = (z+1)->start - offset;
- int ret = 0;
+ assert(td->runstate < TD_RUNNING);
+ assert(zbdi);
- assert(is_valid_offset(f, offset + length - 1));
+ if (!accounting_vdb(td, f))
+ return 0;
- dprint(FD_ZBD, "%s: resetting wp of zone %u.\n", f->file_name,
- zbd_zone_nr(f->zbd_info, z));
- switch (f->zbd_info->model) {
- case ZBD_HOST_AWARE:
- case ZBD_HOST_MANAGED:
- ret = zbd_reset_wp(td, f, offset, length);
- if (ret < 0)
- return ret;
- break;
- default:
- break;
+ /*
+ * Ensure that the I/O range includes one or more sequential zones so
+ * that f->min_zone and f->max_zone have different values.
+ */
+ if (!zbd_is_seq_job(f))
+ return 0;
+
+ if (zbdi->write_min_zone != zbdi->write_max_zone) {
+ if (zbdi->write_min_zone != f->min_zone ||
+ zbdi->write_max_zone != f->max_zone) {
+ td_verror(td, EINVAL,
+ "multi-jobs with different write ranges are "
+ "not supported with zone_reset_threshold");
+ log_err("multi-jobs with different write ranges are "
+ "not supported with zone_reset_threshold\n");
+ }
+ return 0;
}
- pthread_mutex_lock(&f->zbd_info->mutex);
- f->zbd_info->sectors_with_data -= z->wp - z->start;
- pthread_mutex_unlock(&f->zbd_info->mutex);
- z->wp = z->start;
- z->verify_block = 0;
+ zbdi->write_min_zone = f->min_zone;
+ zbdi->write_max_zone = f->max_zone;
- td->ts.nr_zone_resets++;
+ zb = zbd_get_zone(f, f->min_zone);
+ ze = zbd_get_zone(f, f->max_zone);
+ for (z = zb; z < ze; z++)
+ if (z->has_wp)
+ wp_vdb += z->wp - z->start;
- return ret;
+ zbdi->wp_valid_data_bytes = wp_vdb;
+
+ return wp_vdb;
}
-/* The caller must hold f->zbd_info->mutex */
-static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
- unsigned int zone_idx)
+int zbd_setup_files(struct thread_data *td)
{
- uint32_t open_zone_idx = 0;
+ struct fio_file *f;
+ int i;
- for (; open_zone_idx < f->zbd_info->num_open_zones; open_zone_idx++) {
- if (f->zbd_info->open_zones[open_zone_idx] == zone_idx)
- break;
+ if (!zbd_using_direct_io()) {
+ log_err("Using direct I/O is mandatory for writing to ZBD drives\n\n");
+ return 1;
}
- if (open_zone_idx == f->zbd_info->num_open_zones) {
- dprint(FD_ZBD, "%s: zone %d is not open\n",
- f->file_name, zone_idx);
- return;
+
+ if (!zbd_verify_sizes())
+ return 1;
+
+ if (!zbd_verify_bs())
+ return 1;
+
+ if (td->o.experimental_verify) {
+ log_err("zonemode=zbd does not support experimental verify\n");
+ return 1;
}
- dprint(FD_ZBD, "%s: closing zone %d\n", f->file_name, zone_idx);
- memmove(f->zbd_info->open_zones + open_zone_idx,
- f->zbd_info->open_zones + open_zone_idx + 1,
- (ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) *
- sizeof(f->zbd_info->open_zones[0]));
- f->zbd_info->num_open_zones--;
- td->num_open_zones--;
- f->zbd_info->zone_info[zone_idx].open = 0;
-}
+ for_each_file(td, f, i) {
+ struct zoned_block_device_info *zbd = f->zbd_info;
+ struct fio_zone_info *z;
+ int zi;
+ uint64_t vdb;
-/*
- * Reset a range of zones. Returns 0 upon success and 1 upon failure.
- * @td: fio thread data.
- * @f: fio file for which to reset zones
- * @zb: first zone to reset.
- * @ze: first zone not to reset.
- * @all_zones: whether to reset all zones or only those zones for which the
- * write pointer is not a multiple of td->o.min_bs[DDIR_WRITE].
- */
-static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
- struct fio_zone_info *const zb,
- struct fio_zone_info *const ze, bool all_zones)
-{
- struct fio_zone_info *z;
- const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
- bool reset_wp;
- int res = 0;
+ assert(zbd);
- assert(min_bs);
+ f->min_zone = zbd_offset_to_zone_idx(f, f->file_offset);
+ f->max_zone =
+ zbd_offset_to_zone_idx(f, f->file_offset + f->io_size);
- dprint(FD_ZBD, "%s: examining zones %u .. %u\n", f->file_name,
- zbd_zone_nr(f->zbd_info, zb), zbd_zone_nr(f->zbd_info, ze));
- for (z = zb; z < ze; z++) {
- uint32_t nz = z - f->zbd_info->zone_info;
+ vdb = zbd_verify_and_set_vdb(td, f);
- if (!zbd_zone_swr(z))
- continue;
- zone_lock(td, f, z);
- if (all_zones) {
- pthread_mutex_lock(&f->zbd_info->mutex);
- zbd_close_zone(td, f, nz);
- pthread_mutex_unlock(&f->zbd_info->mutex);
+ dprint(FD_ZBD, "%s(%s): valid data bytes = %" PRIu64 "\n",
+ __func__, f->file_name, vdb);
+
+ /*
+ * When all zones in the I/O range are conventional, io_size
+ * can be smaller than zone size, making min_zone the same
+ * as max_zone. This is why the assert below needs to be made
+ * conditional.
+ */
+ if (zbd_is_seq_job(f))
+ assert(f->min_zone < f->max_zone);
+
+ if (td->o.max_open_zones > 0 &&
+ zbd->max_write_zones != td->o.max_open_zones) {
+ log_err("Different 'max_open_zones' values\n");
+ return 1;
+ }
+
+ /*
+ * The per job max open zones limit cannot be used without a
+ * global max open zones limit. (As the tracking of open zones
+ * is disabled when there is no global max open zones limit.)
+ */
+ if (td->o.job_max_open_zones && !zbd->max_write_zones) {
+ log_err("'job_max_open_zones' cannot be used without a global open zones limit\n");
+ return 1;
+ }
+
+ /*
+ * zbd->max_write_zones is the global limit shared for all jobs
+ * that target the same zoned block device. Force sync the per
+ * thread global limit with the actual global limit. (The real
+ * per thread/job limit is stored in td->o.job_max_open_zones).
+ */
+ td->o.max_open_zones = zbd->max_write_zones;
- reset_wp = z->wp != z->start;
- } else {
- reset_wp = z->wp % min_bs != 0;
- }
- if (reset_wp) {
- dprint(FD_ZBD, "%s: resetting zone %u\n",
- f->file_name,
- zbd_zone_nr(f->zbd_info, z));
- if (zbd_reset_zone(td, f, z) < 0)
- res = 1;
+ for (zi = f->min_zone; zi < f->max_zone; zi++) {
+ z = &zbd->zone_info[zi];
+ if (z->cond != ZBD_ZONE_COND_IMP_OPEN &&
+ z->cond != ZBD_ZONE_COND_EXP_OPEN &&
+ z->cond != ZBD_ZONE_COND_CLOSED)
+ continue;
+ if (!zbd->max_active_zones &&
+ z->cond == ZBD_ZONE_COND_CLOSED)
+ continue;
+ if (__zbd_write_zone_get(td, f, z))
+ continue;
+ /*
+ * If the number of open zones exceeds specified limits,
+ * error out.
+ */
+ log_err("Number of open zones exceeds max_open_zones limit\n");
+ return 1;
}
- pthread_mutex_unlock(&z->mutex);
}
- return res;
+ return 0;
}
/*
return write_cnt == 0;
}
-enum swd_action {
- CHECK_SWD,
- SET_SWD,
-};
-
-/* Calculate the number of sectors with data (swd) and perform action 'a' */
-static uint64_t zbd_process_swd(const struct fio_file *f, enum swd_action a)
-{
- struct fio_zone_info *zb, *ze, *z;
- uint64_t swd = 0;
-
- zb = &f->zbd_info->zone_info[f->min_zone];
- ze = &f->zbd_info->zone_info[f->max_zone];
- for (z = zb; z < ze; z++) {
- pthread_mutex_lock(&z->mutex);
- swd += z->wp - z->start;
- }
- pthread_mutex_lock(&f->zbd_info->mutex);
- switch (a) {
- case CHECK_SWD:
- assert(f->zbd_info->sectors_with_data == swd);
- break;
- case SET_SWD:
- f->zbd_info->sectors_with_data = swd;
- break;
- }
- pthread_mutex_unlock(&f->zbd_info->mutex);
- for (z = zb; z < ze; z++)
- pthread_mutex_unlock(&z->mutex);
-
- return swd;
-}
-
-/*
- * The swd check is useful for debugging but takes too much time to leave
- * it enabled all the time. Hence it is disabled by default.
- */
-static const bool enable_check_swd = false;
-
-/* Check whether the value of zbd_info.sectors_with_data is correct. */
-static void zbd_check_swd(const struct fio_file *f)
-{
- if (!enable_check_swd)
- return;
-
- zbd_process_swd(f, CHECK_SWD);
-}
-
-static void zbd_init_swd(struct fio_file *f)
-{
- uint64_t swd;
-
- if (!enable_check_swd)
- return;
-
- swd = zbd_process_swd(f, SET_SWD);
- dprint(FD_ZBD, "%s(%s): swd = %" PRIu64 "\n", __func__, f->file_name,
- swd);
-}
-
void zbd_file_reset(struct thread_data *td, struct fio_file *f)
{
struct fio_zone_info *zb, *ze;
+ bool verify_data_left = false;
if (!f->zbd_info || !td_write(td))
return;
- zb = &f->zbd_info->zone_info[f->min_zone];
- ze = &f->zbd_info->zone_info[f->max_zone];
- zbd_init_swd(f);
+ zb = zbd_get_zone(f, f->min_zone);
+ ze = zbd_get_zone(f, f->max_zone);
+
/*
* If data verification is enabled reset the affected zones before
* writing any data to avoid that a zone reset has to be issued while
* writing data, which causes data loss.
*/
- zbd_reset_zones(td, f, zb, ze, td->o.verify != VERIFY_NONE &&
- td->runstate != TD_VERIFYING);
+ if (td->o.verify != VERIFY_NONE) {
+ verify_data_left = td->runstate == TD_VERIFYING ||
+ td->io_hist_len || td->verify_batch;
+ if (td->io_hist_len && td->o.verify_backlog)
+ verify_data_left =
+ td->io_hist_len % td->o.verify_backlog;
+ if (!verify_data_left)
+ zbd_reset_zones(td, f, zb, ze);
+ }
+
zbd_reset_write_cnt(td, f);
}
-/* The caller must hold f->zbd_info->mutex. */
-static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
- unsigned int zone_idx)
+/* Return random zone index for one of the write target zones. */
+static uint32_t pick_random_zone_idx(const struct fio_file *f,
+ const struct io_u *io_u)
{
- struct zoned_block_device_info *zbdi = f->zbd_info;
- int i;
-
- assert(td->o.job_max_open_zones == 0 || td->num_open_zones <= td->o.job_max_open_zones);
- assert(td->o.job_max_open_zones <= zbdi->max_open_zones);
- assert(zbdi->num_open_zones <= zbdi->max_open_zones);
-
- for (i = 0; i < zbdi->num_open_zones; i++)
- if (zbdi->open_zones[i] == zone_idx)
- return true;
-
- return false;
+ return (io_u->offset - f->file_offset) *
+ f->zbd_info->num_write_zones / f->io_size;
}
-/*
- * Open a ZBD zone if it was not yet open. Returns true if either the zone was
- * already open or if opening a new zone is allowed. Returns false if the zone
- * was not yet open and opening a new zone would cause the zone limit to be
- * exceeded.
- */
-static bool zbd_open_zone(struct thread_data *td, const struct fio_file *f,
- uint32_t zone_idx)
+static bool any_io_in_flight(void)
{
- const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
- struct fio_zone_info *z = &f->zbd_info->zone_info[zone_idx];
- bool res = true;
-
- if (z->cond == ZBD_ZONE_COND_OFFLINE)
- return false;
-
- /*
- * Skip full zones with data verification enabled because resetting a
- * zone causes data loss and hence causes verification to fail.
- */
- if (td->o.verify != VERIFY_NONE && zbd_zone_full(f, z, min_bs))
- return false;
-
- pthread_mutex_lock(&f->zbd_info->mutex);
- if (is_zone_open(td, f, zone_idx)) {
- /*
- * If the zone is already open and going to be full by writes
- * in-flight, handle it as a full zone instead of an open zone.
- */
- if (z->wp >= zbd_zone_capacity_end(z))
- res = false;
- goto out;
- }
- res = false;
- /* Zero means no limit */
- if (td->o.job_max_open_zones > 0 &&
- td->num_open_zones >= td->o.job_max_open_zones)
- goto out;
- if (f->zbd_info->num_open_zones >= f->zbd_info->max_open_zones)
- goto out;
- dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
- f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx;
- td->num_open_zones++;
- z->open = 1;
- res = true;
-
-out:
- pthread_mutex_unlock(&f->zbd_info->mutex);
- return res;
-}
+ for_each_td(td) {
+ if (td->io_u_in_flight)
+ return true;
+ } end_for_each();
-/* Anything goes as long as it is not a constant. */
-static uint32_t pick_random_zone_idx(const struct fio_file *f,
- const struct io_u *io_u)
-{
- return io_u->offset * f->zbd_info->num_open_zones / f->real_file_size;
+ return false;
}
/*
- * Modify the offset of an I/O unit that does not refer to an open zone such
- * that it refers to an open zone. Close an open zone and open a new zone if
- * necessary. This algorithm can only work correctly if all write pointers are
+ * Modify the offset of an I/O unit that does not refer to a zone in the write
+ * target zones array such that it refers to a zone in that array. Add a zone
+ * to or remove a zone from the array if necessary. The write target zone is
+ * searched across sequential zones.
+ * This algorithm can only work correctly if all write pointers are
* a multiple of the fio block size. The caller must neither hold z->mutex
* nor f->zbd_info->mutex. Returns with z->mutex held upon success.
*/
-static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
- struct io_u *io_u)
+static struct fio_zone_info *zbd_convert_to_write_zone(struct thread_data *td,
+ struct io_u *io_u)
{
- const uint32_t min_bs = td->o.min_bs[io_u->ddir];
+ const uint64_t min_bs = td->o.min_bs[io_u->ddir];
struct fio_file *f = io_u->file;
+ struct zoned_block_device_info *zbdi = f->zbd_info;
struct fio_zone_info *z;
- unsigned int open_zone_idx = -1;
+ unsigned int write_zone_idx = -1;
uint32_t zone_idx, new_zone_idx;
int i;
- bool wait_zone_close;
+ bool wait_zone_write;
+ bool in_flight;
+ bool should_retry = true;
assert(is_valid_offset(f, io_u->offset));
- if (td->o.max_open_zones || td->o.job_max_open_zones) {
+ if (zbdi->max_write_zones || td->o.job_max_open_zones) {
/*
- * This statement accesses f->zbd_info->open_zones[] on purpose
+ * This statement accesses zbdi->write_zones[] on purpose
* without locking.
*/
- zone_idx = f->zbd_info->open_zones[pick_random_zone_idx(f, io_u)];
+ zone_idx = zbdi->write_zones[pick_random_zone_idx(f, io_u)];
} else {
- zone_idx = zbd_zone_idx(f, io_u->offset);
+ zone_idx = zbd_offset_to_zone_idx(f, io_u->offset);
}
if (zone_idx < f->min_zone)
zone_idx = f->min_zone;
else if (zone_idx >= f->max_zone)
zone_idx = f->max_zone - 1;
- dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
+
+ dprint(FD_ZBD,
+ "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
__func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
/*
- * Since z->mutex is the outer lock and f->zbd_info->mutex the inner
+ * Since z->mutex is the outer lock and zbdi->mutex the inner
* lock it can happen that the state of the zone with index zone_idx
- * has changed after 'z' has been assigned and before f->zbd_info->mutex
+ * has changed after 'z' has been assigned and before zbdi->mutex
* has been obtained. Hence the loop.
*/
for (;;) {
uint32_t tmp_idx;
- z = &f->zbd_info->zone_info[zone_idx];
-
- zone_lock(td, f, z);
- pthread_mutex_lock(&f->zbd_info->mutex);
- if (td->o.max_open_zones == 0 && td->o.job_max_open_zones == 0)
- goto examine_zone;
- if (f->zbd_info->num_open_zones == 0) {
- dprint(FD_ZBD, "%s(%s): no zones are open\n",
- __func__, f->file_name);
- goto open_other_zone;
+ z = zbd_get_zone(f, zone_idx);
+ if (z->has_wp)
+ zone_lock(td, f, z);
+
+ pthread_mutex_lock(&zbdi->mutex);
+
+ if (z->has_wp) {
+ if (z->cond != ZBD_ZONE_COND_OFFLINE &&
+ zbdi->max_write_zones == 0 &&
+ td->o.job_max_open_zones == 0)
+ goto examine_zone;
+ if (zbdi->num_write_zones == 0) {
+ dprint(FD_ZBD, "%s(%s): no zone is write target\n",
+ __func__, f->file_name);
+ goto choose_other_zone;
+ }
}
/*
- * List of opened zones is per-device, shared across all threads.
- * Start with quasi-random candidate zone.
- * Ignore zones which don't belong to thread's offset/size area.
+ * Array of write target zones is per-device, shared across all
+ * threads. Start with quasi-random candidate zone. Ignore
+ * zones which don't belong to thread's offset/size area.
*/
- open_zone_idx = pick_random_zone_idx(f, io_u);
- assert(open_zone_idx < f->zbd_info->num_open_zones);
- tmp_idx = open_zone_idx;
- for (i = 0; i < f->zbd_info->num_open_zones; i++) {
+ write_zone_idx = pick_random_zone_idx(f, io_u);
+ assert(!write_zone_idx ||
+ write_zone_idx < zbdi->num_write_zones);
+ tmp_idx = write_zone_idx;
+
+ for (i = 0; i < zbdi->num_write_zones; i++) {
uint32_t tmpz;
- if (tmp_idx >= f->zbd_info->num_open_zones)
+ if (tmp_idx >= zbdi->num_write_zones)
tmp_idx = 0;
- tmpz = f->zbd_info->open_zones[tmp_idx];
+ tmpz = zbdi->write_zones[tmp_idx];
if (f->min_zone <= tmpz && tmpz < f->max_zone) {
- open_zone_idx = tmp_idx;
+ write_zone_idx = tmp_idx;
goto found_candidate_zone;
}
dprint(FD_ZBD, "%s(%s): no candidate zone\n",
__func__, f->file_name);
- pthread_mutex_unlock(&f->zbd_info->mutex);
- pthread_mutex_unlock(&z->mutex);
+
+ pthread_mutex_unlock(&zbdi->mutex);
+
+ if (z->has_wp)
+ zone_unlock(z);
+
return NULL;
found_candidate_zone:
- new_zone_idx = f->zbd_info->open_zones[open_zone_idx];
+ new_zone_idx = zbdi->write_zones[write_zone_idx];
if (new_zone_idx == zone_idx)
break;
zone_idx = new_zone_idx;
- pthread_mutex_unlock(&f->zbd_info->mutex);
- pthread_mutex_unlock(&z->mutex);
+
+ pthread_mutex_unlock(&zbdi->mutex);
+
+ if (z->has_wp)
+ zone_unlock(z);
}
- /* Both z->mutex and f->zbd_info->mutex are held. */
+ /* Both z->mutex and zbdi->mutex are held. */
examine_zone:
- if (z->wp + min_bs <= zbd_zone_capacity_end(z)) {
- pthread_mutex_unlock(&f->zbd_info->mutex);
+ if (zbd_zone_remainder(z) >= min_bs) {
+ pthread_mutex_unlock(&zbdi->mutex);
goto out;
}
-open_other_zone:
- /* Check if number of open zones reaches one of limits. */
- wait_zone_close =
- f->zbd_info->num_open_zones == f->max_zone - f->min_zone ||
- (td->o.max_open_zones &&
- f->zbd_info->num_open_zones == td->o.max_open_zones) ||
+choose_other_zone:
+ /* Check if number of write target zones reaches one of limits. */
+ wait_zone_write =
+ zbdi->num_write_zones == f->max_zone - f->min_zone ||
+ (zbdi->max_write_zones &&
+ zbdi->num_write_zones == zbdi->max_write_zones) ||
(td->o.job_max_open_zones &&
- td->num_open_zones == td->o.job_max_open_zones);
+ td->num_write_zones == td->o.job_max_open_zones);
- pthread_mutex_unlock(&f->zbd_info->mutex);
+ pthread_mutex_unlock(&zbdi->mutex);
/* Only z->mutex is held. */
/*
- * When number of open zones reaches to one of limits, wait for
- * zone close before opening a new zone.
+ * When the number of write target zones reaches one of the limits, wait
+ * for a write to one of them to complete before trying a new zone.
*/
- if (wait_zone_close) {
- dprint(FD_ZBD, "%s(%s): quiesce to allow open zones to close\n",
+ if (wait_zone_write) {
+ dprint(FD_ZBD,
+ "%s(%s): quiesce to remove a zone from write target zones array\n",
__func__, f->file_name);
io_u_quiesce(td);
}
- /* Zone 'z' is full, so try to open a new zone. */
- for (i = f->io_size / f->zbd_info->zone_size; i > 0; i--) {
+retry:
+ /* Zone 'z' is full, so try to choose a new zone. */
+ for (i = f->io_size / zbdi->zone_size; i > 0; i--) {
zone_idx++;
- pthread_mutex_unlock(&z->mutex);
+ if (z->has_wp)
+ zone_unlock(z);
z++;
if (!is_valid_offset(f, z->start)) {
/* Wrap-around. */
zone_idx = f->min_zone;
- z = &f->zbd_info->zone_info[zone_idx];
+ z = zbd_get_zone(f, zone_idx);
}
assert(is_valid_offset(f, z->start));
+ if (!z->has_wp)
+ continue;
zone_lock(td, f, z);
- if (z->open)
+ if (z->write)
continue;
- if (zbd_open_zone(td, f, zone_idx))
+ if (zbd_write_zone_get(td, f, z))
goto out;
}
/* Only z->mutex is held. */
- /* Check whether the write fits in any of the already opened zones. */
- pthread_mutex_lock(&f->zbd_info->mutex);
- for (i = 0; i < f->zbd_info->num_open_zones; i++) {
- zone_idx = f->zbd_info->open_zones[i];
+ /* Check whether the write fits in any of the write target zones. */
+ pthread_mutex_lock(&zbdi->mutex);
+ for (i = 0; i < zbdi->num_write_zones; i++) {
+ zone_idx = zbdi->write_zones[i];
if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
continue;
- pthread_mutex_unlock(&f->zbd_info->mutex);
- pthread_mutex_unlock(&z->mutex);
+ pthread_mutex_unlock(&zbdi->mutex);
+ zone_unlock(z);
- z = &f->zbd_info->zone_info[zone_idx];
+ z = zbd_get_zone(f, zone_idx);
zone_lock(td, f, z);
- if (z->wp + min_bs <= zbd_zone_capacity_end(z))
+ if (zbd_zone_remainder(z) >= min_bs)
goto out;
- pthread_mutex_lock(&f->zbd_info->mutex);
+ pthread_mutex_lock(&zbdi->mutex);
}
- pthread_mutex_unlock(&f->zbd_info->mutex);
- pthread_mutex_unlock(&z->mutex);
- dprint(FD_ZBD, "%s(%s): did not open another zone\n", __func__,
- f->file_name);
+
+ /*
+ * When any I/O is in flight, or when all in-flight I/Os have completed,
+ * those I/Os might have removed zones from the write target array, so
+ * retry the steps to choose a zone. Before retrying, call io_u_quiesce()
+ * to complete in-flight writes.
+ */
+ in_flight = any_io_in_flight();
+ if (in_flight || should_retry) {
+ dprint(FD_ZBD,
+ "%s(%s): wait zone write and retry write target zone selection\n",
+ __func__, f->file_name);
+ should_retry = in_flight;
+ pthread_mutex_unlock(&zbdi->mutex);
+ zone_unlock(z);
+ io_u_quiesce(td);
+ zone_lock(td, f, z);
+ goto retry;
+ }
+
+ pthread_mutex_unlock(&zbdi->mutex);
+
+ zone_unlock(z);
+
+ dprint(FD_ZBD, "%s(%s): did not choose another write zone\n",
+ __func__, f->file_name);
+
return NULL;
out:
- dprint(FD_ZBD, "%s(%s): returning zone %d\n", __func__, f->file_name,
- zone_idx);
- io_u->offset = z->start;
- return z;
-}
-
-/* The caller must hold z->mutex. */
-static struct fio_zone_info *zbd_replay_write_order(struct thread_data *td,
- struct io_u *io_u,
- struct fio_zone_info *z)
-{
- const struct fio_file *f = io_u->file;
- const uint32_t min_bs = td->o.min_bs[DDIR_WRITE];
+ dprint(FD_ZBD, "%s(%s): returning zone %d\n",
+ __func__, f->file_name, zone_idx);
- if (!zbd_open_zone(td, f, z - f->zbd_info->zone_info)) {
- pthread_mutex_unlock(&z->mutex);
- z = zbd_convert_to_open_zone(td, io_u);
- assert(z);
- }
+ io_u->offset = z->start;
+ assert(z->has_wp);
+ assert(z->cond != ZBD_ZONE_COND_OFFLINE);
- if (z->verify_block * min_bs >= z->capacity)
- log_err("%s: %d * %d >= %llu\n", f->file_name, z->verify_block,
- min_bs, (unsigned long long)z->capacity);
- io_u->offset = z->start + z->verify_block++ * min_bs;
return z;
}
/*
- * Find another zone for which @io_u fits below the write pointer. Start
- * searching in zones @zb + 1 .. @zl and continue searching in zones
- * @zf .. @zb - 1.
+ * Find another zone which has @min_bytes of readable data. Search in zones
+ * @zb + 1 .. @zl. For random workload, also search in zones @zb - 1 .. @zf.
*
- * Either returns NULL or returns a zone pointer and holds the mutex for that
- * zone.
+ * Either returns NULL or returns a zone pointer. When the returned zone has a
+ * write pointer, the mutex of that zone is held on return.
*/
static struct fio_zone_info *
-zbd_find_zone(struct thread_data *td, struct io_u *io_u,
+zbd_find_zone(struct thread_data *td, struct io_u *io_u, uint64_t min_bytes,
struct fio_zone_info *zb, struct fio_zone_info *zl)
{
- const uint32_t min_bs = td->o.min_bs[io_u->ddir];
struct fio_file *f = io_u->file;
struct fio_zone_info *z1, *z2;
- const struct fio_zone_info *const zf =
- &f->zbd_info->zone_info[f->min_zone];
+ const struct fio_zone_info *const zf = zbd_get_zone(f, f->min_zone);
/*
* Skip to the next non-empty zone in case of sequential I/O and to
*/
for (z1 = zb + 1, z2 = zb - 1; z1 < zl || z2 >= zf; z1++, z2--) {
if (z1 < zl && z1->cond != ZBD_ZONE_COND_OFFLINE) {
- zone_lock(td, f, z1);
- if (z1->start + min_bs <= z1->wp)
+ if (z1->has_wp)
+ zone_lock(td, f, z1);
+ if (z1->start + min_bytes <= z1->wp)
return z1;
- pthread_mutex_unlock(&z1->mutex);
+ if (z1->has_wp)
+ zone_unlock(z1);
} else if (!td_random(td)) {
break;
}
+
if (td_random(td) && z2 >= zf &&
z2->cond != ZBD_ZONE_COND_OFFLINE) {
- zone_lock(td, f, z2);
- if (z2->start + min_bs <= z2->wp)
+ if (z2->has_wp)
+ zone_lock(td, f, z2);
+ if (z2->start + min_bytes <= z2->wp)
return z2;
- pthread_mutex_unlock(&z2->mutex);
+ if (z2->has_wp)
+ zone_unlock(z2);
}
}
- dprint(FD_ZBD, "%s: adjusting random read offset failed\n",
- f->file_name);
+
+ dprint(FD_ZBD,
+ "%s: no zone has %"PRIu64" bytes of readable data\n",
+ f->file_name, min_bytes);
+
return NULL;
}
* @io_u: I/O unit
* @z: zone info pointer
*
- * If the write command made the zone full, close it.
+ * If the write command made the zone full, remove it from the write target
+ * zones array.
*
* The caller must hold z->mutex.
*/
if (io_u->ddir == DDIR_WRITE &&
io_u->offset + io_u->buflen >= zbd_zone_capacity_end(z)) {
pthread_mutex_lock(&f->zbd_info->mutex);
- zbd_close_zone(td, f, z - f->zbd_info->zone_info);
+ zbd_write_zone_put(td, f, z);
pthread_mutex_unlock(&f->zbd_info->mutex);
}
}
const struct fio_file *f = io_u->file;
struct zoned_block_device_info *zbd_info = f->zbd_info;
struct fio_zone_info *z;
- uint32_t zone_idx;
uint64_t zone_end;
- if (!zbd_info)
- return;
-
- zone_idx = zbd_zone_idx(f, io_u->offset);
- assert(zone_idx < zbd_info->nr_zones);
- z = &zbd_info->zone_info[zone_idx];
+ assert(zbd_info);
- if (!zbd_zone_swr(z))
- return;
+ z = zbd_offset_to_zone(f, io_u->offset);
+ assert(z->has_wp);
if (!success)
goto unlock;
dprint(FD_ZBD,
"%s: queued I/O (%lld, %llu) for zone %u\n",
- f->file_name, io_u->offset, io_u->buflen, zone_idx);
+ f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
switch (io_u->ddir) {
case DDIR_WRITE:
zone_end = min((uint64_t)(io_u->offset + io_u->buflen),
zbd_zone_capacity_end(z));
- pthread_mutex_lock(&zbd_info->mutex);
+
/*
* z->wp > zone_end means that one or more I/O errors
* have occurred.
*/
- if (z->wp <= zone_end)
- zbd_info->sectors_with_data += zone_end - z->wp;
- pthread_mutex_unlock(&zbd_info->mutex);
+ if (accounting_vdb(td, f) && z->wp <= zone_end) {
+ pthread_mutex_lock(&zbd_info->mutex);
+ zbd_info->wp_valid_data_bytes += zone_end - z->wp;
+ pthread_mutex_unlock(&zbd_info->mutex);
+ }
z->wp = zone_end;
break;
- case DDIR_TRIM:
- assert(z->wp == z->start);
- break;
default:
break;
}
unlock:
if (!success || q != FIO_Q_QUEUED) {
/* BUSY or COMPLETED: unlock the zone */
- pthread_mutex_unlock(&z->mutex);
+ zone_unlock(z);
io_u->zbd_put_io = NULL;
}
}
static void zbd_put_io(struct thread_data *td, const struct io_u *io_u)
{
const struct fio_file *f = io_u->file;
- struct zoned_block_device_info *zbd_info = f->zbd_info;
struct fio_zone_info *z;
- uint32_t zone_idx;
- int ret;
- if (!zbd_info)
- return;
-
- zone_idx = zbd_zone_idx(f, io_u->offset);
- assert(zone_idx < zbd_info->nr_zones);
- z = &zbd_info->zone_info[zone_idx];
+ assert(f->zbd_info);
- if (!zbd_zone_swr(z))
- return;
+ z = zbd_offset_to_zone(f, io_u->offset);
+ assert(z->has_wp);
dprint(FD_ZBD,
"%s: terminate I/O (%lld, %llu) for zone %u\n",
- f->file_name, io_u->offset, io_u->buflen, zone_idx);
+ f->file_name, io_u->offset, io_u->buflen, zbd_zone_idx(f, z));
zbd_end_zone_io(td, io_u, z);
- ret = pthread_mutex_unlock(&z->mutex);
- assert(ret == 0);
- zbd_check_swd(f);
+ zone_unlock(z);
}
/*
struct fio_file *f = io_u->file;
enum fio_ddir ddir = io_u->ddir;
struct fio_zone_info *z;
- uint32_t zone_idx;
assert(td->o.zone_mode == ZONE_MODE_ZBD);
assert(td->o.zone_size);
+ assert(f->zbd_info);
- zone_idx = zbd_zone_idx(f, f->last_pos[ddir]);
- z = &f->zbd_info->zone_info[zone_idx];
+ z = zbd_offset_to_zone(f, f->last_pos[ddir]);
/*
* When the zone capacity is smaller than the zone size and the I/O is
* sequential write, skip to zone end if the latest position is at the
* zone capacity limit.
*/
- if (z->capacity < f->zbd_info->zone_size && !td_random(td) &&
- ddir == DDIR_WRITE &&
+ if (z->capacity < f->zbd_info->zone_size &&
+ !td_random(td) && ddir == DDIR_WRITE &&
f->last_pos[ddir] >= zbd_zone_capacity_end(z)) {
dprint(FD_ZBD,
"%s: Jump from zone capacity limit to zone end:"
- " (%llu -> %llu) for zone %u (%llu)\n",
- f->file_name, (unsigned long long) f->last_pos[ddir],
- (unsigned long long) zbd_zone_end(z),
- zbd_zone_nr(f->zbd_info, z),
- (unsigned long long) z->capacity);
+ " (%"PRIu64" -> %"PRIu64") for zone %u (%"PRIu64")\n",
+ f->file_name, f->last_pos[ddir],
+ zbd_zone_end(z), zbd_zone_idx(f, z), z->capacity);
td->io_skip_bytes += zbd_zone_end(z) - f->last_pos[ddir];
f->last_pos[ddir] = zbd_zone_end(z);
}
* devices with all empty zones. Overwrite the first I/O direction as
* write to make sure data to read exists.
*/
+ assert(io_u->file->zbd_info);
if (ddir != DDIR_READ || !td_rw(td))
return ddir;
- if (io_u->file->zbd_info->sectors_with_data ||
- td->o.read_beyond_wp)
+ if (io_u->file->last_start[DDIR_WRITE] != -1ULL ||
+ td->o.read_beyond_wp || td->o.rwmix[DDIR_WRITE] == 0)
return DDIR_READ;
return DDIR_WRITE;
enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
{
struct fio_file *f = io_u->file;
- uint32_t zone_idx_b;
+ struct zoned_block_device_info *zbdi = f->zbd_info;
struct fio_zone_info *zb, *zl, *orig_zb;
uint32_t orig_len = io_u->buflen;
- uint32_t min_bs = td->o.min_bs[io_u->ddir];
+ uint64_t min_bs = td->o.min_bs[io_u->ddir];
uint64_t new_len;
int64_t range;
- if (!f->zbd_info)
- return io_u_accept;
-
+ assert(zbdi);
assert(min_bs);
assert(is_valid_offset(f, io_u->offset));
assert(io_u->buflen);
- zone_idx_b = zbd_zone_idx(f, io_u->offset);
- zb = &f->zbd_info->zone_info[zone_idx_b];
+
+ zb = zbd_offset_to_zone(f, io_u->offset);
orig_zb = zb;
- /* Accept the I/O offset for conventional zones. */
- if (!zbd_zone_swr(zb))
+ if (!zb->has_wp) {
+ /* Accept non-write I/Os for conventional zones. */
+ if (io_u->ddir != DDIR_WRITE)
+ return io_u_accept;
+
+ /*
+ * Make sure that writes to conventional zones
+ * don't cross over to any sequential zones.
+ */
+ if (!(zb + 1)->has_wp ||
+ io_u->offset + io_u->buflen <= (zb + 1)->start)
+ return io_u_accept;
+
+ if (io_u->offset + min_bs > (zb + 1)->start) {
+ dprint(FD_IO,
+ "%s: off=%llu + min_bs=%"PRIu64" > next zone %"PRIu64"\n",
+ f->file_name, io_u->offset,
+ min_bs, (zb + 1)->start);
+ io_u->offset =
+ zb->start + (zb + 1)->start - io_u->offset;
+ new_len = min(io_u->buflen,
+ (zb + 1)->start - io_u->offset);
+ } else {
+ new_len = (zb + 1)->start - io_u->offset;
+ }
+
+ io_u->buflen = new_len / min_bs * min_bs;
+
return io_u_accept;
+ }
/*
* Accept the I/O offset for reads if reading beyond the write pointer
io_u->ddir == DDIR_READ && td->o.read_beyond_wp)
return io_u_accept;
- zbd_check_swd(f);
-
zone_lock(td, f, zb);
switch (io_u->ddir) {
case DDIR_READ:
- if (td->runstate == TD_VERIFYING && td_write(td)) {
- zb = zbd_replay_write_order(td, io_u, zb);
- pthread_mutex_unlock(&zb->mutex);
+ if (td->runstate == TD_VERIFYING && td_write(td))
goto accept;
- }
+
/*
* Check that there is enough written data in the zone to do an
* I/O of at least min_bs B. If there isn't, find a new zone for
zb->wp - zb->start : 0;
if (range < min_bs ||
((!td_random(td)) && (io_u->offset + min_bs > zb->wp))) {
- pthread_mutex_unlock(&zb->mutex);
- zl = &f->zbd_info->zone_info[f->max_zone];
- zb = zbd_find_zone(td, io_u, zb, zl);
+ zone_unlock(zb);
+ zl = zbd_get_zone(f, f->max_zone);
+ zb = zbd_find_zone(td, io_u, min_bs, zb, zl);
if (!zb) {
dprint(FD_ZBD,
"%s: zbd_find_zone(%lld, %llu) failed\n",
if (!td_random(td))
io_u->offset = zb->start;
}
+
/*
* Make sure the I/O is within the zone valid data range while
* maximizing the I/O size and preserving randomness.
io_u->offset = zb->start +
((io_u->offset - orig_zb->start) %
(range - io_u->buflen)) / min_bs * min_bs;
+
+ /*
+ * When zbd_find_zone() returns a conventional zone,
+ * we can simply accept the new i/o offset here.
+ */
+ if (!zb->has_wp)
+ return io_u_accept;
+
/*
* Make sure the I/O does not cross over the zone wp position.
*/
dprint(FD_IO, "Changed length from %u into %llu\n",
orig_len, io_u->buflen);
}
+
assert(zb->start <= io_u->offset);
assert(io_u->offset + io_u->buflen <= zb->wp);
+
goto accept;
+
case DDIR_WRITE:
- if (io_u->buflen > f->zbd_info->zone_size)
+ if (io_u->buflen > zbdi->zone_size) {
+ td_verror(td, EINVAL, "I/O buflen exceeds zone size");
+ dprint(FD_IO,
+ "%s: I/O buflen %llu exceeds zone size %"PRIu64"\n",
+ f->file_name, io_u->buflen, zbdi->zone_size);
goto eof;
- if (!zbd_open_zone(td, f, zone_idx_b)) {
- pthread_mutex_unlock(&zb->mutex);
- zb = zbd_convert_to_open_zone(td, io_u);
- if (!zb)
+ }
+
+retry:
+ if (zbd_zone_remainder(zb) > 0 &&
+ zbd_zone_remainder(zb) < min_bs) {
+ pthread_mutex_lock(&f->zbd_info->mutex);
+ zbd_write_zone_put(td, f, zb);
+ pthread_mutex_unlock(&f->zbd_info->mutex);
+ dprint(FD_ZBD,
+ "%s: finish zone %d\n",
+ f->file_name, zbd_zone_idx(f, zb));
+ io_u_quiesce(td);
+ zbd_finish_zone(td, f, zb);
+ if (zbd_zone_idx(f, zb) + 1 >= f->max_zone) {
+ if (!td_random(td))
+ goto eof;
+ }
+ zone_unlock(zb);
+
+ /* Find the next write pointer zone */
+ do {
+ zb++;
+ if (zbd_zone_idx(f, zb) >= f->max_zone)
+ zb = zbd_get_zone(f, f->min_zone);
+ } while (!zb->has_wp);
+
+ zone_lock(td, f, zb);
+ }
+
+ if (!zbd_write_zone_get(td, f, zb)) {
+ zone_unlock(zb);
+ zb = zbd_convert_to_write_zone(td, io_u);
+ if (!zb) {
+ dprint(FD_IO, "%s: can't convert to write target zone",
+ f->file_name);
goto eof;
- zone_idx_b = zb - f->zbd_info->zone_info;
+ }
}
+
+ if (zbd_zone_remainder(zb) > 0 &&
+ zbd_zone_remainder(zb) < min_bs)
+ goto retry;
+
/* Check whether the zone reset threshold has been exceeded */
if (td->o.zrf.u.f) {
- if (f->zbd_info->sectors_with_data >=
+ if (zbdi->wp_valid_data_bytes >=
f->io_size * td->o.zrt.u.f &&
- zbd_dec_and_reset_write_cnt(td, f)) {
+ zbd_dec_and_reset_write_cnt(td, f))
zb->reset_zone = 1;
- }
}
+
/* Reset the zone pointer if necessary */
if (zb->reset_zone || zbd_zone_full(f, zb, min_bs)) {
- assert(td->o.verify == VERIFY_NONE);
+ if (td->o.verify != VERIFY_NONE) {
+ /*
+ * Unset io_u->file to tell get_next_verify()
+ * that this IO is not a requeue.
+ */
+ io_u->file = NULL;
+ if (!get_next_verify(td, io_u)) {
+ zone_unlock(zb);
+ return io_u_accept;
+ }
+ io_u->file = f;
+ }
+
/*
* Since previous write requests may have been submitted
* asynchronously and since we will submit the zone
*/
io_u_quiesce(td);
zb->reset_zone = 0;
- if (zbd_reset_zone(td, f, zb) < 0)
+ if (__zbd_reset_zone(td, f, zb) < 0)
goto eof;
if (zb->capacity < min_bs) {
- log_err("zone capacity %llu smaller than minimum block size %d\n",
- (unsigned long long)zb->capacity,
- min_bs);
+ td_verror(td, EINVAL, "ZCAP is less min_bs");
+ log_err("zone capacity %"PRIu64" smaller than minimum block size %"PRIu64"\n",
+ zb->capacity, min_bs);
goto eof;
}
}
+
/* Make writes occur at the write pointer */
assert(!zbd_zone_full(f, zb, min_bs));
io_u->offset = zb->wp;
if (!is_valid_offset(f, io_u->offset)) {
- dprint(FD_ZBD, "Dropped request with offset %llu\n",
- io_u->offset);
+ td_verror(td, EINVAL, "invalid WP value");
+ dprint(FD_ZBD, "%s: dropped request with offset %llu\n",
+ f->file_name, io_u->offset);
goto eof;
}
+
/*
* Make sure that the buflen is a multiple of the minimal
* block size. Give up if shrinking would make the request too
orig_len, io_u->buflen);
goto accept;
}
- log_err("Zone remainder %lld smaller than minimum block size %d\n",
- (zbd_zone_capacity_end(zb) - io_u->offset),
- min_bs);
+
+ td_verror(td, EIO, "zone remainder too small");
+ log_err("zone remainder %lld smaller than min block size %"PRIu64"\n",
+ (zbd_zone_capacity_end(zb) - io_u->offset), min_bs);
+
goto eof;
+
case DDIR_TRIM:
- /* fall-through */
+ /* Check that a random trim targets a non-empty zone */
+ if (!td_random(td) || zb->wp > zb->start)
+ goto accept;
+
+ /* Find a non-empty zone to trim */
+ zone_unlock(zb);
+ zl = zbd_get_zone(f, f->max_zone);
+ zb = zbd_find_zone(td, io_u, 1, zb, zl);
+ if (zb) {
+ io_u->offset = zb->start;
+ dprint(FD_ZBD, "%s: found new zone(%lld) for trim\n",
+ f->file_name, io_u->offset);
+ goto accept;
+ }
+
+ goto eof;
+
case DDIR_SYNC:
+ /* fall-through */
case DDIR_DATASYNC:
case DDIR_SYNC_FILE_RANGE:
case DDIR_WAIT:
case DDIR_LAST:
case DDIR_INVAL:
+ case DDIR_TIMEOUT:
goto accept;
}
assert(false);
accept:
- assert(zb);
+ assert(zb->has_wp);
assert(zb->cond != ZBD_ZONE_COND_OFFLINE);
assert(!io_u->zbd_queue_io);
assert(!io_u->zbd_put_io);
+
io_u->zbd_queue_io = zbd_queue_io;
io_u->zbd_put_io = zbd_put_io;
+
+ /*
+ * Since we return with the zone lock still held,
+ * add an annotation to let Coverity know that it
+ * is intentional.
+ */
+ /* coverity[missing_unlock] */
+
return io_u_accept;
eof:
- if (zb)
- pthread_mutex_unlock(&zb->mutex);
+ if (zb && zb->has_wp)
+ zone_unlock(zb);
+
return io_u_eof;
}
{
char *res;
- if (asprintf(&res, "; %llu zone resets", (unsigned long long) ts->nr_zone_resets) < 0)
+ if (asprintf(&res, "; %"PRIu64" zone resets", ts->nr_zone_resets) < 0)
return NULL;
return res;
}
+
+/**
+ * zbd_do_io_u_trim - If reset zone is applicable, do reset zone instead of trim
+ *
+ * @td: FIO thread data.
+ * @io_u: FIO I/O unit.
+ *
+ * It is assumed that z->mutex is already locked.
+ * Return io_u_completed when the zone reset succeeds. Return 0 when the target
+ * zone does not have a write pointer. On error, return a negative errno.
+ */
+int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u)
+{
+ struct fio_file *f = io_u->file;
+ struct fio_zone_info *z;
+ int ret;
+
+ z = zbd_offset_to_zone(f, io_u->offset);
+ if (!z->has_wp)
+ return 0;
+
+ if (io_u->offset != z->start) {
+ log_err("Trim offset not at zone start (%lld)\n",
+ io_u->offset);
+ return -EINVAL;
+ }
+
+ ret = zbd_reset_zone((struct thread_data *)td, f, z);
+ if (ret < 0)
+ return ret;
+
+ return io_u_completed;
+}
+
+void zbd_log_err(const struct thread_data *td, const struct io_u *io_u)
+{
+ const struct fio_file *f = io_u->file;
+
+ if (td->o.zone_mode != ZONE_MODE_ZBD)
+ return;
+
+ if (io_u->error == EOVERFLOW)
+ log_err("%s: Exceeded max_active_zones limit. Check conditions of zones out of I/O ranges.\n",
+ f->file_name);
+}
enum io_u_action {
io_u_accept = 0,
io_u_eof = 1,
+ io_u_completed = 2,
};
/**
* @start: zone start location (bytes)
* @wp: zone write pointer location (bytes)
* @capacity: maximum size usable from the start of a zone (bytes)
- * @verify_block: number of blocks that have been verified for this zone
* @mutex: protects the modifiable members in this structure
* @type: zone type (BLK_ZONE_TYPE_*)
* @cond: zone state (BLK_ZONE_COND_*)
- * @open: whether or not this zone is currently open. Only relevant if
- * max_open_zones > 0.
+ * @has_wp: whether or not this zone can have a valid write pointer
+ * @write: whether or not this zone is the write target at this moment. Only
+ * relevant if zbd->max_write_zones > 0.
* @reset_zone: whether or not this zone should be reset before writing to it
*/
struct fio_zone_info {
uint64_t start;
uint64_t wp;
uint64_t capacity;
- uint32_t verify_block;
enum zbd_zone_type type:2;
enum zbd_zone_cond cond:4;
- unsigned int open:1;
+ unsigned int has_wp:1;
+ unsigned int write:1;
unsigned int reset_zone:1;
};
/**
* zoned_block_device_info - zoned block device characteristics
* @model: Device model.
- * @max_open_zones: global limit on the number of simultaneously opened
- * sequential write zones.
+ * @max_write_zones: global limit on the number of sequential write zones that
+ * are written simultaneously. A zero value means that this number is
+ * unlimited and that write target zones will not be tracked in the
+ * write_zones array.
+ * @max_active_zones: device-side limit on the number of sequential write zones
+ * in the open or closed condition. A zero value means that the number of
+ * zones in these conditions is unlimited.
* @mutex: Protects the modifiable members in this structure (refcount and
* num_open_zones).
* @zone_size: size of a single zone in bytes.
- * @sectors_with_data: total size of data in all zones in units of 512 bytes
+ * @wp_valid_data_bytes: total size of data in zones with write pointers
+ * @write_min_zone: Minimum zone index of all jobs' write ranges. Inclusive.
+ * @write_max_zone: Maximum zone index of all jobs' write ranges. Exclusive.
* @zone_size_log2: log2 of the zone size in bytes if it is a power of 2 or 0
* if the zone size is not a power of 2.
* @nr_zones: number of zones
* @refcount: number of fio files that share this structure
- * @num_open_zones: number of open zones
+ * @num_write_zones: number of write target zones
* @write_cnt: Number of writes since the latest zone reset triggered by
* the zone_reset_frequency fio job parameter.
- * @open_zones: zone numbers of open zones
+ * @write_zones: zone numbers of write target zones
* @zone_info: description of the individual zones
*
* Only devices for which all zones have the same size are supported.
*/
struct zoned_block_device_info {
enum zbd_zoned_model model;
- uint32_t max_open_zones;
+ uint32_t max_write_zones;
+ uint32_t max_active_zones;
pthread_mutex_t mutex;
uint64_t zone_size;
- uint64_t sectors_with_data;
+ uint64_t wp_valid_data_bytes;
+ uint32_t write_min_zone;
+ uint32_t write_max_zone;
uint32_t zone_size_log2;
uint32_t nr_zones;
uint32_t refcount;
- uint32_t num_open_zones;
+ uint32_t num_write_zones;
uint32_t write_cnt;
- uint32_t open_zones[ZBD_MAX_OPEN_ZONES];
+ uint32_t write_zones[ZBD_MAX_WRITE_ZONES];
struct fio_zone_info zone_info[0];
};
+int zbd_init_files(struct thread_data *td);
+void zbd_recalc_options_with_zone_granularity(struct thread_data *td);
int zbd_setup_files(struct thread_data *td);
void zbd_free_zone_info(struct fio_file *f);
void zbd_file_reset(struct thread_data *td, struct fio_file *f);
enum fio_ddir ddir);
enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u);
char *zbd_write_status(const struct thread_stat *ts);
+int zbd_do_io_u_trim(struct thread_data *td, struct io_u *io_u);
+void zbd_log_err(const struct thread_data *td, const struct io_u *io_u);
static inline void zbd_close_file(struct fio_file *f)
{
#include <inttypes.h>
-#define ZBD_MAX_OPEN_ZONES 4096
+#define ZBD_MAX_WRITE_ZONES 4096
/*
* Zoned block device models.
*/
enum zbd_zoned_model {
- ZBD_IGNORE, /* Ignore file */
- ZBD_NONE, /* Regular block device */
- ZBD_HOST_AWARE, /* Host-aware zoned block device */
- ZBD_HOST_MANAGED, /* Host-managed zoned block device */
+ ZBD_NONE = 0x1, /* No zone support. Emulate zones. */
+ ZBD_HOST_AWARE = 0x2, /* Host-aware zoned block device */
+ ZBD_HOST_MANAGED = 0x3, /* Host-managed zoned block device */
};
/*